511 files changed, 20392 insertions, 10255 deletions
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index a0d479d1e1d..f66f4df1869 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
       all the readers who were traversing the list when we deleted the
       element are finished.  We use <function>call_rcu()</function> to
       register a callback which will actually destroy the object once
-      the readers are finished.
+      all pre-existing readers are finished.  Alternatively,
+      <function>synchronize_rcu()</function> may be used to block until
+      all pre-existing are finished.
     </para>
     <para>
       But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
 -        object_put(obj);
 +        list_del_rcu(&amp;obj-&gt;list);
          cache_num--;
-+        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj);
++        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu);
  }
 
  /* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
          if (++cache_num > MAX_CACHE_SIZE) {
                  struct object *i, *outcast = NULL;
                  list_for_each_entry(i, &amp;cache, list) {
-@@ -85,6 +94,7 @@
-         obj-&gt;popularity = 0;
-         atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
-         spin_lock_init(&amp;obj-&gt;lock);
-+        INIT_RCU_HEAD(&amp;obj-&gt;rcu);
-
-         spin_lock_irqsave(&amp;cache_lock, flags);
-         __cache_add(obj);
 @@ -104,12 +114,11 @@
  struct object *cache_find(int id)
  {
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 790d1a81237..0c134f8afc6 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
 	include:
 
 	a.	Keeping a count of the number of data-structure elements
-		used by the RCU-protected data structure, including those
-		waiting for a grace period to elapse.  Enforce a limit
-		on this number, stalling updates as needed to allow
-		previously deferred frees to complete.
-
-		Alternatively, limit only the number awaiting deferred
-		free rather than the total number of elements.
+		used by the RCU-protected data structure, including
+		those waiting for a grace period to elapse.  Enforce a
+		limit on this number, stalling updates as needed to allow
+		previously deferred frees to complete.	Alternatively,
+		limit only the number awaiting deferred free rather than
+		the total number of elements.
+
+		One way to stall the updates is to acquire the update-side
+		mutex.	(Don't try this with a spinlock -- other CPUs
+		spinning on the lock could prevent the grace period
+		from ever ending.)  Another way to stall the updates
+		is for the updates to use a wrapper function around
+		the memory allocator, so that this wrapper function
+		simulates OOM when there is too much memory awaiting an
+		RCU grace period.  There are of course many other
+		variations on this theme.
 
 	b.	Limiting update rate.  For example, if updates occur only
 		once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
 	and the compiler to freely reorder code into and out of RCU
 	read-side critical sections.  It is the responsibility of the
 	RCU update-side primitives to deal with this.
+
+17.	Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
+	the __rcu sparse checks to validate your RCU code.  These
+	can help find problems as follows:
+
+	CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+		structures are carried out under the proper RCU
+		read-side critical section, while holding the right
+		combination of locks, or whatever other conditions
+		are appropriate.
+
+	CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+		same object to call_rcu() (or friends) before an RCU
+		grace period has elapsed since the last time that you
+		passed that same object to call_rcu() (or friends).
+
+	__rcu sparse checks: tag the pointer to the RCU-protected data
+		structure with __rcu, and sparse will warn you if you
+		access that pointer without the services of one of the
+		variants of rcu_dereference().
+
+	These debugging aids can help you find problems that are
+	otherwise extremely difficult to spot.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 44c6dcc93d6..862c08ef1fd 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -80,6 +80,24 @@ o	A CPU looping with bottom halves disabled.  This condition can
 o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
 	without invoking schedule().
 
+o	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+	happen to preempt a low-priority task in the middle of an RCU
+	read-side critical section.   This is especially damaging if
+	that low-priority task is not permitted to run on any other CPU,
+	in which case the next RCU grace period can never complete, which
+	will eventually cause the system to run out of memory and hang.
+	While the system is in the process of running itself out of
+	memory, you might see stall-warning messages.
+
+o	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+	is running at a higher priority than the RCU softirq threads.
+	This will prevent RCU callbacks from ever being invoked,
+	and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
+	RCU grace periods from ever completing.  Either way, the
+	system will eventually run out of memory and hang.  In the
+	CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
+	messages.
+
 o	A bug in the RCU implementation.
 
 o	A hardware failure.  This is quite unlikely, but has occurred
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index efd8cc95c06..a851118775d 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -125,6 +125,17 @@ o	"b" is the batch limit for this CPU.  If more than this number
 	of RCU callbacks is ready to invoke, then the remainder will
 	be deferred.
 
+o	"ci" is the number of RCU callbacks that have been invoked for
+	this CPU.  Note that ci+ql is the number of callbacks that have
+	been registered in absence of CPU-hotplug activity.
+
+o	"co" is the number of RCU callbacks that have been orphaned due to
+	this CPU going offline.
+
+o	"ca" is the number of RCU callbacks that have been adopted due to
+	other CPUs going offline.  Note that ci+co-ca+ql is the number of
+	RCU callbacks registered on this CPU.
+
 There is also an rcu/rcudata.csv file with the same information in
 comma-separated-variable spreadsheet format.
 
@@ -180,7 +191,7 @@ o	"s" is the "signaled" state that drives force_quiescent_state()'s
 
 o	"jfq" is the number of jiffies remaining for this grace period
 	before force_quiescent_state() is invoked to help push things
-	along.  Note that CPUs in dyntick-idle mode thoughout the grace
+	along.  Note that CPUs in dyntick-idle mode throughout the grace
 	period will not report on their own, but rather must be check by
 	some other CPU via force_quiescent_state().
 
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index f1c5c4bccd3..902d3151f52 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -14,25 +14,39 @@ to /proc/cpuinfo.
 	identifier (rather than the kernel's).  The actual value is
 	architecture and platform dependent.
 
-3) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
+3) /sys/devices/system/cpu/cpuX/topology/book_id:
+
+	the book ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).	The actual value is
+	architecture and platform dependent.
+
+4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
 
 	internel kernel map of cpuX's hardware threads within the same
 	core as cpuX
 
-4) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
 
 	internal kernel map of cpuX's hardware threads within the same
 	physical_package_id.
 
+6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+
+	internal kernel map of cpuX's hardware threads within the same
+	book_id.
+
 To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 attributes.
+drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
 
 For an architecture to support this feature, it must define some of
 these macros in include/asm-XXX/topology.h:
 #define topology_physical_package_id(cpu)
 #define topology_core_id(cpu)
+#define topology_book_id(cpu)
 #define topology_thread_cpumask(cpu)
 #define topology_core_cpumask(cpu)
+#define topology_book_cpumask(cpu)
 
 The type of **_id is int.
 The type of siblings is (const) struct cpumask *.
@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
 3) thread_siblings: just the given CPU
 4) core_siblings: just the given CPU
 
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+
 Additionally, CPU topology information is provided under
 /sys/devices/system/cpu and includes these files.  The internal
 source for the output is in brackets ("[]").
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 842aa9de84a..5e2bc4ab897 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -386,34 +386,6 @@ Who:	Tejun Heo <tj@kernel.org>
 
 ----------------------------
 
-What:	Support for VMware's guest paravirtuliazation technique [VMI] will be
-	dropped.
-When:	2.6.37 or earlier.
-Why:	With the recent innovations in CPU hardware acceleration technologies
-	from Intel and AMD, VMware ran a few experiments to compare these
-	techniques to guest paravirtualization technique on VMware's platform.
-	These hardware assisted virtualization techniques have outperformed the
-	performance benefits provided by VMI in most of the workloads. VMware
-	expects that these hardware features will be ubiquitous in a couple of
-	years, as a result, VMware has started a phased retirement of this
-	feature from the hypervisor. We will be removing this feature from the
-	Kernel too. Right now we are targeting 2.6.37 but can retire earlier if
-	technical reasons (read opportunity to remove major chunk of pvops)
-	arise.
-
-	Please note that VMI has always been an optimization and non-VMI kernels
-	still work fine on VMware's platform.
-	Latest versions of VMware's product which support VMI are,
-	Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence
-	releases for these products will continue supporting VMI.
-
-	For more details about VMI retirement take a look at this,
-	http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html
-
-Who:	Alok N Kataria <akataria@vmware.com>
-
-----------------------------
-
 What:	Support for lcd_switch and display_get in asus-laptop driver
 When:	March 2010
 Why:	These two features use non-standard interfaces. There are the
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8dd7248508a..3a0009e03d1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -455,7 +455,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			[ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
 				pxa_timer,timer3,32k_counter,timer0_1
 			[AVR32] avr32
-			[X86-32] pit,hpet,tsc,vmi-timer;
+			[X86-32] pit,hpet,tsc;
 				scx200_hrt on Geode; cyclone on IBM x440
 			[MIPS] MIPS
 			[PARISC] cr16
@@ -2153,6 +2153,11 @@ and is between 256 and 4096 characters. It is defined in the file
 			Reserves a hole at the top of the kernel virtual
 			address space.
 
+	reservelow=	[X86]
+			Format: nn[K]
+			Set the amount of memory to reserve for BIOS at
+			the bottom of the address space.
+
 	reset_devices	[KNL] Force drivers to reset the underlying device
 			during initialization.
 
@@ -2435,6 +2440,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			disables clocksource verification at runtime.
 			Used to enable high-resolution timer mode on older
 			hardware, and in virtualized environment.
+			[x86] noirqtime: Do not use TSC to do irq accounting.
+			Used to run time disable IRQ_TIME_ACCOUNTING on any
+			platforms where RDTSC is slow and this accounting
+			can add overhead.
 
 	turbografx.map[2|3]=	[HW,JOY]
 			TurboGraFX parallel port interface
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 1762b81fcdf..741fe66d6ec 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -542,9 +542,11 @@ Kprobes does not use mutexes or allocate memory except during
 registration and unregistration.
 
 Probe handlers are run with preemption disabled.  Depending on the
-architecture, handlers may also run with interrupts disabled.  In any
-case, your handler should not yield the CPU (e.g., by attempting to
-acquire a semaphore).
+architecture and optimization state, handlers may also run with
+interrupts disabled (e.g., kretprobe handlers and optimized kprobe
+handlers run without interrupt disabled on x86/x86-64).  In any case,
+your handler should not yield the CPU (e.g., by attempting to acquire
+a semaphore).
 
 Since a return probe is implemented by replacing the return
 address with the trampoline's address, stack backtraces and calls
diff --git a/MAINTAINERS b/MAINTAINERS
index f2a2b8e647c..3d4179fbc52 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S:	Supported
 F:	Documentation/filesystems/ceph.txt
 F:	fs/ceph
+F:	net/ceph
+F:	include/linux/ceph
 
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 M:	David Vrabel <david.vrabel@csr.com>
@@ -4805,6 +4807,15 @@ F:	fs/qnx4/
 F:	include/linux/qnx4_fs.h
 F:	include/linux/qnxtypes.h
 
+RADOS BLOCK DEVICE (RBD)
+F:	include/linux/qnxtypes.h
+M:	Yehuda Sadeh <yehuda@hq.newdream.net>
+M:	Sage Weil <sage@newdream.net>
+M:	ceph-devel@vger.kernel.org
+S:	Supported
+F:	drivers/block/rbd.c
+F:	drivers/block/rbd_types.h
+
 RADEON FRAMEBUFFER DISPLAY DRIVER
 M:	Benjamin Herrenschmidt <benh@kernel.crashing.org>
 L:	linux-fbdev@vger.kernel.org
diff --git a/Makefile b/Makefile
index 860c26af52c..d3c10719bbb 100644
--- a/Makefile
+++ b/Makefile
@@ -568,6 +568,12 @@ endif
 
 ifdef CONFIG_FUNCTION_TRACER
 KBUILD_CFLAGS	+= -pg
+ifdef CONFIG_DYNAMIC_FTRACE
+	ifdef CONFIG_HAVE_C_RECORDMCOUNT
+		BUILD_C_RECORDMCOUNT := y
+		export BUILD_C_RECORDMCOUNT
+	endif
+endif
 endif
 
 # We trigger additional mismatches with less inlining
@@ -591,6 +597,11 @@ KBUILD_CFLAGS	+= $(call cc-option,-fno-strict-overflow)
 # conserve stack if available
 KBUILD_CFLAGS   += $(call cc-option,-fconserve-stack)
 
+# check for 'asm goto'
+ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
+	KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
+endif
+
 # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
 # But warn user when we do so
 warn-assign = \
diff --git a/arch/Kconfig b/arch/Kconfig
index fe48fc7a3eb..53d7f619a1b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -158,4 +158,7 @@ config HAVE_PERF_EVENTS_NMI
 	  subsystem.  Also has support for calculating CPU cycle events
 	  to determine how many clock cycles in a given period.
 
+config HAVE_ARCH_JUMP_LABEL
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b9647bb66d1..d04ccd73af4 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -9,6 +9,7 @@ config ALPHA
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_DMA_ATTRS
 	help
diff --git a/arch/alpha/include/asm/perf_event.h b/arch/alpha/include/asm/perf_event.h
index 4157cd3c44a..fe792ca818f 100644
--- a/arch/alpha/include/asm/perf_event.h
+++ b/arch/alpha/include/asm/perf_event.h
@@ -1,11 +1,6 @@
 #ifndef __ASM_ALPHA_PERF_EVENT_H
 #define __ASM_ALPHA_PERF_EVENT_H
 
-/* Alpha only supports software events through this interface. */
-extern void set_perf_event_pending(void);
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
 #ifdef CONFIG_PERF_EVENTS
 extern void init_hw_perf_events(void);
 #else
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 85d8e4f58c8..1cc49683fb6 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -307,7 +307,7 @@ again:
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
-	delta = (new_raw_count  - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
+	delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
 
 	/* It is possible on very rare occasions that the PMC has overflowed
 	 * but the interrupt is yet to come.  Detect and fix this situation.
@@ -402,14 +402,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
 		struct hw_perf_event *hwc = &pe->hw;
 		int idx = hwc->idx;
 
-		if (cpuc->current_idx[j] != PMC_NO_INDEX) {
-			cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
-			continue;
+		if (cpuc->current_idx[j] == PMC_NO_INDEX) {
+			alpha_perf_event_set_period(pe, hwc, idx);
+			cpuc->current_idx[j] = idx;
 		}
 
-		alpha_perf_event_set_period(pe, hwc, idx);
-		cpuc->current_idx[j] = idx;
-		cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
+		if (!(hwc->state & PERF_HES_STOPPED))
+			cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
 	}
 	cpuc->config = cpuc->event[0]->hw.config_base;
 }
@@ -420,12 +419,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
  *  - this function is called from outside this module via the pmu struct
  *    returned from perf event initialisation.
  */
-static int alpha_pmu_enable(struct perf_event *event)
+static int alpha_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	int n0;
 	int ret;
-	unsigned long flags;
+	unsigned long irq_flags;
 
 	/*
 	 * The Sparc code has the IRQ disable first followed by the perf
@@ -435,8 +435,8 @@ static int alpha_pmu_enable(struct perf_event *event)
 	 * nevertheless we disable the PMCs first to enable a potential
 	 * final PMI to occur before we disable interrupts.
 	 */
-	perf_disable();
-	local_irq_save(flags);
+	perf_pmu_disable(event->pmu);
+	local_irq_save(irq_flags);
 
 	/* Default to error to be returned */
 	ret = -EAGAIN;
@@ -455,8 +455,12 @@ static int alpha_pmu_enable(struct perf_event *event)
 		}
 	}
 
-	local_irq_restore(flags);
-	perf_enable();
+	hwc->state = PERF_HES_UPTODATE;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_STOPPED;
+
+	local_irq_restore(irq_flags);
+	perf_pmu_enable(event->pmu);
 
 	return ret;
 }
@@ -467,15 +471,15 @@ static int alpha_pmu_enable(struct perf_event *event)
  *  - this function is called from outside this module via the pmu struct
  *    returned from perf event initialisation.
  */
-static void alpha_pmu_disable(struct perf_event *event)
+static void alpha_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	unsigned long flags;
+	unsigned long irq_flags;
 	int j;
 
-	perf_disable();
-	local_irq_save(flags);
+	perf_pmu_disable(event->pmu);
+	local_irq_save(irq_flags);
 
 	for (j = 0; j < cpuc->n_events; j++) {
 		if (event == cpuc->event[j]) {
@@ -501,8 +505,8 @@ static void alpha_pmu_disable(struct perf_event *event)
 		}
 	}
 
-	local_irq_restore(flags);
-	perf_enable();
+	local_irq_restore(irq_flags);
+	perf_pmu_enable(event->pmu);
 }
 
 
@@ -514,13 +518,44 @@ static void alpha_pmu_read(struct perf_event *event)
 }
 
 
-static void alpha_pmu_unthrottle(struct perf_event *event)
+static void alpha_pmu_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		cpuc->idx_mask &= ~(1UL<<hwc->idx);
+		hwc->state |= PERF_HES_STOPPED;
+	}
+
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		alpha_perf_event_update(event, hwc, hwc->idx, 0);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+
+	if (cpuc->enabled)
+		wrperfmon(PERFMON_CMD_DISABLE, (1UL<<hwc->idx));
+}
+
+
+static void alpha_pmu_start(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+		alpha_perf_event_set_period(event, hwc, hwc->idx);
+	}
+
+	hwc->state = 0;
+
 	cpuc->idx_mask |= 1UL<<hwc->idx;
-	wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
+	if (cpuc->enabled)
+		wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
 }
 
 
@@ -642,39 +677,36 @@ static int __hw_perf_event_init(struct perf_event *event)
 	return 0;
 }
 
-static const struct pmu pmu = {
-	.enable		= alpha_pmu_enable,
-	.disable	= alpha_pmu_disable,
-	.read		= alpha_pmu_read,
-	.unthrottle	= alpha_pmu_unthrottle,
-};
-
-
 /*
  * Main entry point to initialise a HW performance event.
  */
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int alpha_pmu_event_init(struct perf_event *event)
 {
 	int err;
 
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
 	if (!alpha_pmu)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	/* Do the real initialisation work. */
 	err = __hw_perf_event_init(event);
 
-	if (err)
-		return ERR_PTR(err);
-
-	return &pmu;
+	return err;
 }
 
-
-
 /*
  * Main entry point - enable HW performance counters.
  */
-void hw_perf_enable(void)
+static void alpha_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -700,7 +732,7 @@ void hw_perf_enable(void)
  * Main entry point - disable HW performance counters.
  */
 
-void hw_perf_disable(void)
+static void alpha_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -713,6 +745,17 @@ void hw_perf_disable(void)
 	wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
 }
 
+static struct pmu pmu = {
+	.pmu_enable	= alpha_pmu_enable,
+	.pmu_disable	= alpha_pmu_disable,
+	.event_init	= alpha_pmu_event_init,
+	.add		= alpha_pmu_add,
+	.del		= alpha_pmu_del,
+	.start		= alpha_pmu_start,
+	.stop		= alpha_pmu_stop,
+	.read		= alpha_pmu_read,
+};
+
 
 /*
  * Main entry point - don't know when this is called but it
@@ -766,7 +809,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 	wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
 
 	/* la_ptr is the counter that overflowed. */
-	if (unlikely(la_ptr >= perf_max_events)) {
+	if (unlikely(la_ptr >= alpha_pmu->num_pmcs)) {
 		/* This should never occur! */
 		irq_err_count++;
 		pr_warning("PMI: silly index %ld\n", la_ptr);
@@ -807,7 +850,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 			/* Interrupts coming too quickly; "throttle" the
 			 * counter, i.e., disable it for a little while.
 			 */
-			cpuc->idx_mask &= ~(1UL<<idx);
+			alpha_pmu_stop(event, 0);
 		}
 	}
 	wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask);
@@ -837,6 +880,7 @@ void __init init_hw_perf_events(void)
 
 	/* And set up PMU specification */
 	alpha_pmu = &ev67_pmu;
-	perf_max_events = alpha_pmu->num_pmcs;
+
+	perf_pmu_register(&pmu);
 }
 
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 396af1799ea..0f1d8493cfc 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -41,7 +41,7 @@
 #include <linux/init.h>
 #include <linux/bcd.h>
 #include <linux/profile.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -83,25 +83,25 @@ static struct {
 
 unsigned long est_cycle_freq;
 
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
 
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
 
-#define set_perf_event_pending_flag()  __get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending()      __get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending()     __get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag()  __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()      __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()     __get_cpu_var(irq_work_pending) = 0
 
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
 {
-	set_perf_event_pending_flag();
+	set_irq_work_pending_flag();
 }
 
-#else  /* CONFIG_PERF_EVENTS */
+#else  /* CONFIG_IRQ_WORK */
 
-#define test_perf_event_pending()      0
-#define clear_perf_event_pending()
+#define test_irq_work_pending()      0
+#define clear_irq_work_pending()
 
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
 
 
 static inline __u32 rpcc(void)
@@ -191,9 +191,9 @@ irqreturn_t timer_interrupt(int irq, void *dev)
 
 	write_sequnlock(&xtime_lock);
 
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
+	if (test_irq_work_pending()) {
+		clear_irq_work_pending();
+		irq_work_run();
 	}
 
 #ifndef CONFIG_SMP
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9c26ba7244f..9103904b3da 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -23,6 +23,7 @@ config ARM
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZMA
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index b5799a3b711..c4aa4e8c6af 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts disabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
 /* ARM performance counters start from 1 (in the cp15 accesses) so use the
  * same indexes here for consistency. */
 #define PERF_EVENT_INDEX_OFFSET 1
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index ecbb0288e5d..49643b1467e 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -123,6 +123,12 @@ armpmu_get_max_events(void)
 }
 EXPORT_SYMBOL_GPL(armpmu_get_max_events);
 
+int perf_num_counters(void)
+{
+	return armpmu_get_max_events();
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
 #define HW_OP_UNSUPPORTED		0xFFFF
 
 #define C(_x) \
@@ -221,46 +227,56 @@ again:
 }
 
 static void
-armpmu_disable(struct perf_event *event)
+armpmu_read(struct perf_event *event)
 {
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-
-	WARN_ON(idx < 0);
-
-	clear_bit(idx, cpuc->active_mask);
-	armpmu->disable(hwc, idx);
-
-	barrier();
 
-	armpmu_event_update(event, hwc, idx);
-	cpuc->events[idx] = NULL;
-	clear_bit(idx, cpuc->used_mask);
+	/* Don't read disabled counters! */
+	if (hwc->idx < 0)
+		return;
 
-	perf_event_update_userpage(event);
+	armpmu_event_update(event, hwc, hwc->idx);
 }
 
 static void
-armpmu_read(struct perf_event *event)
+armpmu_stop(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	/* Don't read disabled counters! */
-	if (hwc->idx < 0)
+	if (!armpmu)
 		return;
 
-	armpmu_event_update(event, hwc, hwc->idx);
+	/*
+	 * ARM pmu always has to update the counter, so ignore
+	 * PERF_EF_UPDATE, see comments in armpmu_start().
+	 */
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		armpmu->disable(hwc, hwc->idx);
+		barrier(); /* why? */
+		armpmu_event_update(event, hwc, hwc->idx);
+		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	}
 }
 
 static void
-armpmu_unthrottle(struct perf_event *event)
+armpmu_start(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
+	if (!armpmu)
+		return;
+
+	/*
+	 * ARM pmu always has to reprogram the period, so ignore
+	 * PERF_EF_RELOAD, see the comment below.
+	 */
+	if (flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+
+	hwc->state = 0;
 	/*
 	 * Set the period again. Some counters can't be stopped, so when we
-	 * were throttled we simply disabled the IRQ source and the counter
+	 * were stopped we simply disabled the IRQ source and the counter
 	 * may have been left counting. If we don't do this step then we may
 	 * get an interrupt too soon or *way* too late if the overflow has
 	 * happened since disabling.
@@ -269,14 +285,33 @@ armpmu_unthrottle(struct perf_event *event)
 	armpmu->enable(hwc, hwc->idx);
 }
 
+static void
+armpmu_del(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	WARN_ON(idx < 0);
+
+	clear_bit(idx, cpuc->active_mask);
+	armpmu_stop(event, PERF_EF_UPDATE);
+	cpuc->events[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+
+	perf_event_update_userpage(event);
+}
+
 static int
-armpmu_enable(struct perf_event *event)
+armpmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	int idx;
 	int err = 0;
 
+	perf_pmu_disable(event->pmu);
+
 	/* If we don't have a space for the counter then finish early. */
 	idx = armpmu->get_event_idx(cpuc, hwc);
 	if (idx < 0) {
@@ -293,25 +328,19 @@ armpmu_enable(struct perf_event *event)
 	cpuc->events[idx] = event;
 	set_bit(idx, cpuc->active_mask);
 
-	/* Set the period for the event. */
-	armpmu_event_set_period(event, hwc, idx);
-
-	/* Enable the event. */
-	armpmu->enable(hwc, idx);
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	if (flags & PERF_EF_START)
+		armpmu_start(event, PERF_EF_RELOAD);
 
 	/* Propagate our changes to the userspace mapping. */
 	perf_event_update_userpage(event);
 
 out:
+	perf_pmu_enable(event->pmu);
 	return err;
 }
 
-static struct pmu pmu = {
-	.enable	    = armpmu_enable,
-	.disable    = armpmu_disable,
-	.unthrottle = armpmu_unthrottle,
-	.read	    = armpmu_read,
-};
+static struct pmu pmu;
 
 static int
 validate_event(struct cpu_hw_events *cpuc,
@@ -491,20 +520,29 @@ __hw_perf_event_init(struct perf_event *event)
 	return err;
 }
 
-const struct pmu *
-hw_perf_event_init(struct perf_event *event)
+static int armpmu_event_init(struct perf_event *event)
 {
 	int err = 0;
 
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
 	if (!armpmu)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	event->destroy = hw_perf_event_destroy;
 
 	if (!atomic_inc_not_zero(&active_events)) {
-		if (atomic_read(&active_events) > perf_max_events) {
+		if (atomic_read(&active_events) > armpmu->num_events) {
 			atomic_dec(&active_events);
-			return ERR_PTR(-ENOSPC);
+			return -ENOSPC;
 		}
 
 		mutex_lock(&pmu_reserve_mutex);
@@ -518,17 +556,16 @@ hw_perf_event_init(struct perf_event *event)
 	}
 
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	err = __hw_perf_event_init(event);
 	if (err)
 		hw_perf_event_destroy(event);
 
-	return err ? ERR_PTR(err) : &pmu;
+	return err;
 }
 
-void
-hw_perf_enable(void)
+static void armpmu_enable(struct pmu *pmu)
 {
 	/* Enable all of the perf events on hardware. */
 	int idx;
@@ -549,13 +586,23 @@ hw_perf_enable(void)
 	armpmu->start();
 }
 
-void
-hw_perf_disable(void)
+static void armpmu_disable(struct pmu *pmu)
 {
 	if (armpmu)
 		armpmu->stop();
 }
 
+static struct pmu pmu = {
+	.pmu_enable	= armpmu_enable,
+	.pmu_disable	= armpmu_disable,
+	.event_init	= armpmu_event_init,
+	.add		= armpmu_add,
+	.del		= armpmu_del,
+	.start		= armpmu_start,
+	.stop		= armpmu_stop,
+	.read		= armpmu_read,
+};
+
 /*
  * ARMv6 Performance counter handling code.
  *
@@ -1045,7 +1092,7 @@ armv6pmu_handle_irq(int irq_num,
 	 * platforms that can have the PMU interrupts raised as an NMI, this
 	 * will not work.
 	 */
-	perf_event_do_pending();
+	irq_work_run();
 
 	return IRQ_HANDLED;
 }
@@ -2021,7 +2068,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 	 * platforms that can have the PMU interrupts raised as an NMI, this
 	 * will not work.
 	 */
-	perf_event_do_pending();
+	irq_work_run();
 
 	return IRQ_HANDLED;
 }
@@ -2389,7 +2436,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
 			armpmu->disable(hwc, idx);
 	}
 
-	perf_event_do_pending();
+	irq_work_run();
 
 	/*
 	 * Re-enable the PMU.
@@ -2716,7 +2763,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
 			armpmu->disable(hwc, idx);
 	}
 
-	perf_event_do_pending();
+	irq_work_run();
 
 	/*
 	 * Re-enable the PMU.
@@ -2933,14 +2980,12 @@ init_hw_perf_events(void)
 			armpmu = &armv6pmu;
 			memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
 					sizeof(armv6_perf_cache_map));
-			perf_max_events	= armv6pmu.num_events;
 			break;
 		case 0xB020:	/* ARM11mpcore */
 			armpmu = &armv6mpcore_pmu;
 			memcpy(armpmu_perf_cache_map,
 			       armv6mpcore_perf_cache_map,
 			       sizeof(armv6mpcore_perf_cache_map));
-			perf_max_events = armv6mpcore_pmu.num_events;
 			break;
 		case 0xC080:	/* Cortex-A8 */
 			armv7pmu.id = ARM_PERF_PMU_ID_CA8;
@@ -2952,7 +2997,6 @@ init_hw_perf_events(void)
 			/* Reset PMNC and read the nb of CNTx counters
 			    supported */
 			armv7pmu.num_events = armv7_reset_read_pmnc();
-			perf_max_events = armv7pmu.num_events;
 			break;
 		case 0xC090:	/* Cortex-A9 */
 			armv7pmu.id = ARM_PERF_PMU_ID_CA9;
@@ -2964,7 +3008,6 @@ init_hw_perf_events(void)
 			/* Reset PMNC and read the nb of CNTx counters
 			    supported */
 			armv7pmu.num_events = armv7_reset_read_pmnc();
-			perf_max_events = armv7pmu.num_events;
 			break;
 		}
 	/* Intel CPUs [xscale]. */
@@ -2975,13 +3018,11 @@ init_hw_perf_events(void)
 			armpmu = &xscale1pmu;
 			memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
 					sizeof(xscale_perf_cache_map));
-			perf_max_events	= xscale1pmu.num_events;
 			break;
 		case 2:
 			armpmu = &xscale2pmu;
 			memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
 					sizeof(xscale_perf_cache_map));
-			perf_max_events	= xscale2pmu.num_events;
 			break;
 		}
 	}
@@ -2991,9 +3032,10 @@ init_hw_perf_events(void)
 				arm_pmu_names[armpmu->id], armpmu->num_events);
 	} else {
 		pr_info("no hardware support available\n");
-		perf_max_events = -1;
 	}
 
+	perf_pmu_register(&pmu);
+
 	return 0;
 }
 arch_initcall(init_hw_perf_events);
@@ -3001,13 +3043,6 @@ arch_initcall(init_hw_perf_events);
 /*
  * Callchain handling code.
  */
-static inline void
-callchain_store(struct perf_callchain_entry *entry,
-		u64 ip)
-{
-	if (entry->nr < PERF_MAX_STACK_DEPTH)
-		entry->ip[entry->nr++] = ip;
-}
 
 /*
  * The registers we're interested in are at the end of the variable
@@ -3039,7 +3074,7 @@ user_backtrace(struct frame_tail *tail,
 	if (__copy_from_user_inatomic(&buftail, tail, sizeof(buftail)))
 		return NULL;
 
-	callchain_store(entry, buftail.lr);
+	perf_callchain_store(entry, buftail.lr);
 
 	/*
 	 * Frame pointers should strictly progress back up the stack
@@ -3051,16 +3086,11 @@ user_backtrace(struct frame_tail *tail,
 	return buftail.fp - 1;
 }
 
-static void
-perf_callchain_user(struct pt_regs *regs,
-		    struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
 	struct frame_tail *tail;
 
-	callchain_store(entry, PERF_CONTEXT_USER);
-
-	if (!user_mode(regs))
-		regs = task_pt_regs(current);
 
 	tail = (struct frame_tail *)regs->ARM_fp - 1;
 
@@ -3078,56 +3108,18 @@ callchain_trace(struct stackframe *fr,
 		void *data)
 {
 	struct perf_callchain_entry *entry = data;
-	callchain_store(entry, fr->pc);
+	perf_callchain_store(entry, fr->pc);
 	return 0;
 }
 
-static void
-perf_callchain_kernel(struct pt_regs *regs,
-		      struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
 	struct stackframe fr;
 
-	callchain_store(entry, PERF_CONTEXT_KERNEL);
 	fr.fp = regs->ARM_fp;
 	fr.sp = regs->ARM_sp;
 	fr.lr = regs->ARM_lr;
 	fr.pc = regs->ARM_pc;
 	walk_stackframe(&fr, callchain_trace, entry);
 }
-
-static void
-perf_do_callchain(struct pt_regs *regs,
-		  struct perf_callchain_entry *entry)
-{
-	int is_user;
-
-	if (!regs)
-		return;
-
-	is_user = user_mode(regs);
-
-	if (!current || !current->pid)
-		return;
-
-	if (is_user && current->state != TASK_RUNNING)
-		return;
-
-	if (!is_user)
-		perf_callchain_kernel(regs, entry);
-
-	if (current->mm)
-		perf_callchain_user(regs, entry);
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-
-struct perf_callchain_entry *
-perf_callchain(struct pt_regs *regs)
-{
-	struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry);
-
-	entry->nr = 0;
-	perf_do_callchain(regs, entry);
-	return entry;
-}
diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c
index 29c0a911df2..77eb35c89cd 100644
--- a/arch/arm/mach-bcmring/dma.c
+++ b/arch/arm/mach-bcmring/dma.c
@@ -691,7 +691,7 @@ int dma_init(void)
 
 	memset(&gDMA, 0, sizeof(gDMA));
 
-	init_MUTEX_LOCKED(&gDMA.lock);
+	sema_init(&gDMA.lock, 0);
 	init_waitqueue_head(&gDMA.freeChannelQ);
 
 	/* Initialize the Hardware */
@@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap)
 {
 	memset(memMap, 0, sizeof(*memMap));
 
-	init_MUTEX(&memMap->lock);
+	sema_init(&memMap->lock, 1);
 
 	return 0;
 }
diff --git a/arch/arm/oprofile/Makefile b/arch/arm/oprofile/Makefile
index e666eafed15..b2215c61cdf 100644
--- a/arch/arm/oprofile/Makefile
+++ b/arch/arm/oprofile/Makefile
@@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 		oprofilefs.o oprofile_stats.o \
 		timer_int.o )
 
+ifeq ($(CONFIG_HW_PERF_EVENTS),y)
+DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o)
+endif
+
 oprofile-y				:= $(DRIVER_OBJS) common.o
diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c
index 72e09eb642d..8aa974491df 100644
--- a/arch/arm/oprofile/common.c
+++ b/arch/arm/oprofile/common.c
@@ -25,139 +25,10 @@
 #include <asm/ptrace.h>
 
 #ifdef CONFIG_HW_PERF_EVENTS
-/*
- * Per performance monitor configuration as set via oprofilefs.
- */
-struct op_counter_config {
-	unsigned long count;
-	unsigned long enabled;
-	unsigned long event;
-	unsigned long unit_mask;
-	unsigned long kernel;
-	unsigned long user;
-	struct perf_event_attr attr;
-};
-
-static int op_arm_enabled;
-static DEFINE_MUTEX(op_arm_mutex);
-
-static struct op_counter_config *counter_config;
-static struct perf_event **perf_events[nr_cpumask_bits];
-static int perf_num_counters;
-
-/*
- * Overflow callback for oprofile.
- */
-static void op_overflow_handler(struct perf_event *event, int unused,
-			struct perf_sample_data *data, struct pt_regs *regs)
+char *op_name_from_perf_id(void)
 {
-	int id;
-	u32 cpu = smp_processor_id();
-
-	for (id = 0; id < perf_num_counters; ++id)
-		if (perf_events[cpu][id] == event)
-			break;
-
-	if (id != perf_num_counters)
-		oprofile_add_sample(regs, id);
-	else
-		pr_warning("oprofile: ignoring spurious overflow "
-				"on cpu %u\n", cpu);
-}
-
-/*
- * Called by op_arm_setup to create perf attributes to mirror the oprofile
- * settings in counter_config. Attributes are created as `pinned' events and
- * so are permanently scheduled on the PMU.
- */
-static void op_perf_setup(void)
-{
-	int i;
-	u32 size = sizeof(struct perf_event_attr);
-	struct perf_event_attr *attr;
-
-	for (i = 0; i < perf_num_counters; ++i) {
-		attr = &counter_config[i].attr;
-		memset(attr, 0, size);
-		attr->type		= PERF_TYPE_RAW;
-		attr->size		= size;
-		attr->config		= counter_config[i].event;
-		attr->sample_period	= counter_config[i].count;
-		attr->pinned		= 1;
-	}
-}
-
-static int op_create_counter(int cpu, int event)
-{
-	int ret = 0;
-	struct perf_event *pevent;
-
-	if (!counter_config[event].enabled || (perf_events[cpu][event] != NULL))
-		return ret;
-
-	pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
-						  cpu, -1,
-						  op_overflow_handler);
-
-	if (IS_ERR(pevent)) {
-		ret = PTR_ERR(pevent);
-	} else if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
-		perf_event_release_kernel(pevent);
-		pr_warning("oprofile: failed to enable event %d "
-				"on CPU %d\n", event, cpu);
-		ret = -EBUSY;
-	} else {
-		perf_events[cpu][event] = pevent;
-	}
-
-	return ret;
-}
+	enum arm_perf_pmu_ids id = armpmu_get_pmu_id();
 
-static void op_destroy_counter(int cpu, int event)
-{
-	struct perf_event *pevent = perf_events[cpu][event];
-
-	if (pevent) {
-		perf_event_release_kernel(pevent);
-		perf_events[cpu][event] = NULL;
-	}
-}
-
-/*
- * Called by op_arm_start to create active perf events based on the
- * perviously configured attributes.
- */
-static int op_perf_start(void)
-{
-	int cpu, event, ret = 0;
-
-	for_each_online_cpu(cpu) {
-		for (event = 0; event < perf_num_counters; ++event) {
-			ret = op_create_counter(cpu, event);
-			if (ret)
-				goto out;
-		}
-	}
-
-out:
-	return ret;
-}
-
-/*
- * Called by op_arm_stop at the end of a profiling run.
- */
-static void op_perf_stop(void)
-{
-	int cpu, event;
-
-	for_each_online_cpu(cpu)
-		for (event = 0; event < perf_num_counters; ++event)
-			op_destroy_counter(cpu, event);
-}
-
-
-static char *op_name_from_perf_id(enum arm_perf_pmu_ids id)
-{
 	switch (id) {
 	case ARM_PERF_PMU_ID_XSCALE1:
 		return "arm/xscale1";
@@ -176,116 +47,6 @@ static char *op_name_from_perf_id(enum arm_perf_pmu_ids id)
 	}
 }
 
-static int op_arm_create_files(struct super_block *sb, struct dentry *root)
-{
-	unsigned int i;
-
-	for (i = 0; i < perf_num_counters; i++) {
-		struct dentry *dir;
-		char buf[4];
-
-		snprintf(buf, sizeof buf, "%d", i);
-		dir = oprofilefs_mkdir(sb, root, buf);
-		oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
-		oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
-		oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
-		oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
-		oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
-		oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
-	}
-
-	return 0;
-}
-
-static int op_arm_setup(void)
-{
-	spin_lock(&oprofilefs_lock);
-	op_perf_setup();
-	spin_unlock(&oprofilefs_lock);
-	return 0;
-}
-
-static int op_arm_start(void)
-{
-	int ret = -EBUSY;
-
-	mutex_lock(&op_arm_mutex);
-	if (!op_arm_enabled) {
-		ret = 0;
-		op_perf_start();
-		op_arm_enabled = 1;
-	}
-	mutex_unlock(&op_arm_mutex);
-	return ret;
-}
-
-static void op_arm_stop(void)
-{
-	mutex_lock(&op_arm_mutex);
-	if (op_arm_enabled)
-		op_perf_stop();
-	op_arm_enabled = 0;
-	mutex_unlock(&op_arm_mutex);
-}
-
-#ifdef CONFIG_PM
-static int op_arm_suspend(struct platform_device *dev, pm_message_t state)
-{
-	mutex_lock(&op_arm_mutex);
-	if (op_arm_enabled)
-		op_perf_stop();
-	mutex_unlock(&op_arm_mutex);
-	return 0;
-}
-
-static int op_arm_resume(struct platform_device *dev)
-{
-	mutex_lock(&op_arm_mutex);
-	if (op_arm_enabled && op_perf_start())
-		op_arm_enabled = 0;
-	mutex_unlock(&op_arm_mutex);
-	return 0;
-}
-
-static struct platform_driver oprofile_driver = {
-	.driver		= {
-		.name		= "arm-oprofile",
-	},
-	.resume		= op_arm_resume,
-	.suspend	= op_arm_suspend,
-};
-
-static struct platform_device *oprofile_pdev;
-
-static int __init init_driverfs(void)
-{
-	int ret;
-
-	ret = platform_driver_register(&oprofile_driver);
-	if (ret)
-		goto out;
-
-	oprofile_pdev =	platform_device_register_simple(
-				oprofile_driver.driver.name, 0, NULL, 0);
-	if (IS_ERR(oprofile_pdev)) {
-		ret = PTR_ERR(oprofile_pdev);
-		platform_driver_unregister(&oprofile_driver);
-	}
-
-out:
-	return ret;
-}
-
-static void  exit_driverfs(void)
-{
-	platform_device_unregister(oprofile_pdev);
-	platform_driver_unregister(&oprofile_driver);
-}
-#else
-static int __init init_driverfs(void) { return 0; }
-#define exit_driverfs() do { } while (0)
-#endif /* CONFIG_PM */
-
 static int report_trace(struct stackframe *frame, void *d)
 {
 	unsigned int *depth = d;
@@ -350,74 +111,14 @@ static void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 
 int __init oprofile_arch_init(struct oprofile_operations *ops)
 {
-	int cpu, ret = 0;
-
-	perf_num_counters = armpmu_get_max_events();
-
-	counter_config = kcalloc(perf_num_counters,
-			sizeof(struct op_counter_config), GFP_KERNEL);
-
-	if (!counter_config) {
-		pr_info("oprofile: failed to allocate %d "
-				"counters\n", perf_num_counters);
-		return -ENOMEM;
-	}
-
-	ret = init_driverfs();
-	if (ret) {
-		kfree(counter_config);
-		counter_config = NULL;
-		return ret;
-	}
-
-	for_each_possible_cpu(cpu) {
-		perf_events[cpu] = kcalloc(perf_num_counters,
-				sizeof(struct perf_event *), GFP_KERNEL);
-		if (!perf_events[cpu]) {
-			pr_info("oprofile: failed to allocate %d perf events "
-					"for cpu %d\n", perf_num_counters, cpu);
-			while (--cpu >= 0)
-				kfree(perf_events[cpu]);
-			return -ENOMEM;
-		}
-	}
-
 	ops->backtrace		= arm_backtrace;
-	ops->create_files	= op_arm_create_files;
-	ops->setup		= op_arm_setup;
-	ops->start		= op_arm_start;
-	ops->stop		= op_arm_stop;
-	ops->shutdown		= op_arm_stop;
-	ops->cpu_type		= op_name_from_perf_id(armpmu_get_pmu_id());
-
-	if (!ops->cpu_type)
-		ret = -ENODEV;
-	else
-		pr_info("oprofile: using %s\n", ops->cpu_type);
 
-	return ret;
+	return oprofile_perf_init(ops);
 }
 
-void oprofile_arch_exit(void)
+void __exit oprofile_arch_exit(void)
 {
-	int cpu, id;
-	struct perf_event *event;
-
-	if (*perf_events) {
-		for_each_possible_cpu(cpu) {
-			for (id = 0; id < perf_num_counters; ++id) {
-				event = perf_events[cpu][id];
-				if (event != NULL)
-					perf_event_release_kernel(event);
-			}
-			kfree(perf_events[cpu]);
-		}
-	}
-
-	if (counter_config) {
-		kfree(counter_config);
-		exit_driverfs();
-	}
+	oprofile_perf_exit();
 }
 #else
 int __init oprofile_arch_init(struct oprofile_operations *ops)
@@ -425,5 +126,5 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 	pr_info("oprofile: hardware counters not available\n");
 	return -ENODEV;
 }
-void oprofile_arch_exit(void) {}
+void __exit oprofile_arch_exit(void) {}
 #endif /* CONFIG_HW_PERF_EVENTS */
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 16399bd2499..0f2417df632 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -7,6 +7,7 @@ config FRV
 	default y
 	select HAVE_IDE
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 
 config ZONE_DMA
diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile
index f4709756d0d..4ff2fb1e6b1 100644
--- a/arch/frv/lib/Makefile
+++ b/arch/frv/lib/Makefile
@@ -5,4 +5,4 @@
 lib-y := \
 	__ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \
 	checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \
-	outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o
+	outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o
diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c
deleted file mode 100644
index 9ac5acfd2e9..00000000000
--- a/arch/frv/lib/perf_event.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance event handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/perf_event.h>
-
-/*
- * mark the performance event as pending
- */
-void set_perf_event_pending(void)
-{
-}
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h
index d514cd9edb4..8fb7d33a661 100644
--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -6,12 +6,6 @@
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-#include <asm/processor.h>
-
 /*
  * No irq_cpustat_t for IA-64.  The data is held in the per-CPU data structure.
  */
@@ -20,6 +14,11 @@
 
 #define local_softirq_pending()		(local_cpu_data->softirq_pending)
 
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+#include <asm/processor.h>
+
 extern void __iomem *ipi_base_addr;
 
 void ack_bad_irq(unsigned int irq);
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 9f342a574ce..dd028f2b13b 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -272,10 +272,6 @@ void cpu_idle_wait(void);
 
 void default_idle(void);
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index 2340f11dc29..9a526ba6f25 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
 	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
-	retval = security_task_setscheduler(p, 0, NULL);
+	retval = security_task_setscheduler(p)
 	if (retval)
 		goto out_unlock;
 
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 907417d187e..79a04a9394d 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -16,6 +16,7 @@ config PARISC
 	select RTC_DRV_GENERIC
 	select INIT_ALL_POSSIBLE
 	select BUG
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select GENERIC_ATOMIC64 if !64BIT
 	help
diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h
index cc146427d8f..1e0fd8ba6c0 100644
--- a/arch/parisc/include/asm/perf_event.h
+++ b/arch/parisc/include/asm/perf_event.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_PARISC_PERF_EVENT_H
 #define __ASM_PARISC_PERF_EVENT_H
 
-/* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
+/* Empty, just to avoid compiling error */
 
 #endif /* __ASM_PARISC_PERF_EVENT_H */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 631e5a0fb6a..4b1e521d966 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS if PPC64
 	select GENERIC_ATOMIC64 if PPC32
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 1ff6662f7fa..9b287fdd8ea 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -129,7 +129,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
-	u8 perf_event_pending;		/* PM interrupt while soft-disabled */
+	u8 irq_work_pending;		/* IRQ_WORK interrupt while soft-disable */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index 6c294acac84..9c3d160670b 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long);
 
 #define PTRRELOC(x)	((typeof(x)) add_reloc_offset((unsigned long)(x)))
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
 extern struct dentry *powerpc_debugfs_root;
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
index 95ad9dad298..d05ae4204bb 100644
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -23,18 +23,6 @@
 #include "ppc32.h"
 #endif
 
-/*
- * Store another value in a callchain_entry.
- */
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-	unsigned int nr = entry->nr;
-
-	if (nr < PERF_MAX_STACK_DEPTH) {
-		entry->ip[nr] = ip;
-		entry->nr = nr + 1;
-	}
-}
 
 /*
  * Is sp valid as the address of the next kernel stack frame after prev_sp?
@@ -58,8 +46,8 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
 	return 0;
 }
 
-static void perf_callchain_kernel(struct pt_regs *regs,
-				  struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
 	unsigned long sp, next_sp;
 	unsigned long next_ip;
@@ -69,8 +57,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
 
 	lr = regs->link;
 	sp = regs->gpr[1];
-	callchain_store(entry, PERF_CONTEXT_KERNEL);
-	callchain_store(entry, regs->nip);
+	perf_callchain_store(entry, regs->nip);
 
 	if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
 		return;
@@ -89,7 +76,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
 			next_ip = regs->nip;
 			lr = regs->link;
 			level = 0;
-			callchain_store(entry, PERF_CONTEXT_KERNEL);
+			perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 
 		} else {
 			if (level == 0)
@@ -111,7 +98,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
 			++level;
 		}
 
-		callchain_store(entry, next_ip);
+		perf_callchain_store(entry, next_ip);
 		if (!valid_next_sp(next_sp, sp))
 			return;
 		sp = next_sp;
@@ -233,8 +220,8 @@ static int sane_signal_64_frame(unsigned long sp)
 		puc == (unsigned long) &sf->uc;
 }
 
-static void perf_callchain_user_64(struct pt_regs *regs,
-				   struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+				   struct pt_regs *regs)
 {
 	unsigned long sp, next_sp;
 	unsigned long next_ip;
@@ -246,8 +233,7 @@ static void perf_callchain_user_64(struct pt_regs *regs,
 	next_ip = regs->nip;
 	lr = regs->link;
 	sp = regs->gpr[1];
-	callchain_store(entry, PERF_CONTEXT_USER);
-	callchain_store(entry, next_ip);
+	perf_callchain_store(entry, next_ip);
 
 	for (;;) {
 		fp = (unsigned long __user *) sp;
@@ -276,14 +262,14 @@ static void perf_callchain_user_64(struct pt_regs *regs,
 			    read_user_stack_64(&uregs[PT_R1], &sp))
 				return;
 			level = 0;
-			callchain_store(entry, PERF_CONTEXT_USER);
-			callchain_store(entry, next_ip);
+			perf_callchain_store(entry, PERF_CONTEXT_USER);
+			perf_callchain_store(entry, next_ip);
 			continue;
 		}
 
 		if (level == 0)
 			next_ip = lr;
-		callchain_store(entry, next_ip);
+		perf_callchain_store(entry, next_ip);
 		++level;
 		sp = next_sp;
 	}
@@ -315,8 +301,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
 	return __get_user_inatomic(*ret, ptr);
 }
 
-static inline void perf_callchain_user_64(struct pt_regs *regs,
-					  struct perf_callchain_entry *entry)
+static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+					  struct pt_regs *regs)
 {
 }
 
@@ -435,8 +421,8 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
 	return mctx->mc_gregs;
 }
 
-static void perf_callchain_user_32(struct pt_regs *regs,
-				   struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+				   struct pt_regs *regs)
 {
 	unsigned int sp, next_sp;
 	unsigned int next_ip;
@@ -447,8 +433,7 @@ static void perf_callchain_user_32(struct pt_regs *regs,
 	next_ip = regs->nip;
 	lr = regs->link;
 	sp = regs->gpr[1];
-	callchain_store(entry, PERF_CONTEXT_USER);
-	callchain_store(entry, next_ip);
+	perf_callchain_store(entry, next_ip);
 
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		fp = (unsigned int __user *) (unsigned long) sp;
@@ -470,45 +455,24 @@ static void perf_callchain_user_32(struct pt_regs *regs,
 			    read_user_stack_32(&uregs[PT_R1], &sp))
 				return;
 			level = 0;
-			callchain_store(entry, PERF_CONTEXT_USER);
-			callchain_store(entry, next_ip);
+			perf_callchain_store(entry, PERF_CONTEXT_USER);
+			perf_callchain_store(entry, next_ip);
 			continue;
 		}
 
 		if (level == 0)
 			next_ip = lr;
-		callchain_store(entry, next_ip);
+		perf_callchain_store(entry, next_ip);
 		++level;
 		sp = next_sp;
 	}
 }
 
-/*
- * Since we can't get PMU interrupts inside a PMU interrupt handler,
- * we don't need separate irq and nmi entries here.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, cpu_perf_callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	struct perf_callchain_entry *entry = &__get_cpu_var(cpu_perf_callchain);
-
-	entry->nr = 0;
-
-	if (!user_mode(regs)) {
-		perf_callchain_kernel(regs, entry);
-		if (current->mm)
-			regs = task_pt_regs(current);
-		else
-			regs = NULL;
-	}
-
-	if (regs) {
-		if (current_is_64bit())
-			perf_callchain_user_64(regs, entry);
-		else
-			perf_callchain_user_32(regs, entry);
-	}
-
-	return entry;
+	if (current_is_64bit())
+		perf_callchain_user_64(entry, regs);
+	else
+		perf_callchain_user_32(entry, regs);
 }
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index d301a30445e..3129c855933 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -402,6 +402,9 @@ static void power_pmu_read(struct perf_event *event)
 {
 	s64 val, delta, prev;
 
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
 	if (!event->hw.idx)
 		return;
 	/*
@@ -517,7 +520,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
  * Disable all events to prevent PMU interrupts and to allow
  * events to be added or removed.
  */
-void hw_perf_disable(void)
+static void power_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -565,7 +568,7 @@ void hw_perf_disable(void)
  * If we were previously disabled and events were added, then
  * put the new config on the PMU.
  */
-void hw_perf_enable(void)
+static void power_pmu_enable(struct pmu *pmu)
 {
 	struct perf_event *event;
 	struct cpu_hw_events *cpuhw;
@@ -672,6 +675,8 @@ void hw_perf_enable(void)
 		}
 		local64_set(&event->hw.prev_count, val);
 		event->hw.idx = idx;
+		if (event->hw.state & PERF_HES_STOPPED)
+			val = 0;
 		write_pmc(idx, val);
 		perf_event_update_userpage(event);
 	}
@@ -727,7 +732,7 @@ static int collect_events(struct perf_event *group, int max_count,
  * re-enable the PMU in order to get hw_perf_enable to do the
  * actual work of reconfiguring the PMU.
  */
-static int power_pmu_enable(struct perf_event *event)
+static int power_pmu_add(struct perf_event *event, int ef_flags)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -735,7 +740,7 @@ static int power_pmu_enable(struct perf_event *event)
 	int ret = -EAGAIN;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	/*
 	 * Add the event to the list (if there is room)
@@ -749,6 +754,9 @@ static int power_pmu_enable(struct perf_event *event)
 	cpuhw->events[n0] = event->hw.config;
 	cpuhw->flags[n0] = event->hw.event_base;
 
+	if (!(ef_flags & PERF_EF_START))
+		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
 	/*
 	 * If group events scheduling transaction was started,
 	 * skip the schedulability test here, it will be peformed
@@ -769,7 +777,7 @@ nocheck:
 
 	ret = 0;
  out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 	return ret;
 }
@@ -777,14 +785,14 @@ nocheck:
 /*
  * Remove a event from the PMU.
  */
-static void power_pmu_disable(struct perf_event *event)
+static void power_pmu_del(struct perf_event *event, int ef_flags)
 {
 	struct cpu_hw_events *cpuhw;
 	long i;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	power_pmu_read(event);
 
@@ -821,34 +829,60 @@ static void power_pmu_disable(struct perf_event *event)
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
 	}
 
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
 /*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
+ * POWER-PMU does not support disabling individual counters, hence
+ * program their cycle counter to their max value and ignore the interrupts.
  */
-static void power_pmu_unthrottle(struct perf_event *event)
+
+static void power_pmu_start(struct perf_event *event, int ef_flags)
+{
+	unsigned long flags;
+	s64 left;
+
+	if (!event->hw.idx || !event->hw.sample_period)
+		return;
+
+	if (!(event->hw.state & PERF_HES_STOPPED))
+		return;
+
+	if (ef_flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+	local_irq_save(flags);
+	perf_pmu_disable(event->pmu);
+
+	event->hw.state = 0;
+	left = local64_read(&event->hw.period_left);
+	write_pmc(event->hw.idx, left);
+
+	perf_event_update_userpage(event);
+	perf_pmu_enable(event->pmu);
+	local_irq_restore(flags);
+}
+
+static void power_pmu_stop(struct perf_event *event, int ef_flags)
 {
-	s64 val, left;
 	unsigned long flags;
 
 	if (!event->hw.idx || !event->hw.sample_period)
 		return;
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
+
 	power_pmu_read(event);
-	left = event->hw.sample_period;
-	event->hw.last_period = left;
-	val = 0;
-	if (left < 0x80000000L)
-		val = 0x80000000L - left;
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
+	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	write_pmc(event->hw.idx, 0);
+
 	perf_event_update_userpage(event);
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
@@ -857,10 +891,11 @@ static void power_pmu_unthrottle(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  */
-void power_pmu_start_txn(const struct pmu *pmu)
+void power_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
+	perf_pmu_disable(pmu);
 	cpuhw->group_flag |= PERF_EVENT_TXN;
 	cpuhw->n_txn_start = cpuhw->n_events;
 }
@@ -870,11 +905,12 @@ void power_pmu_start_txn(const struct pmu *pmu)
  * Clear the flag and pmu::enable() will perform the
  * schedulability test.
  */
-void power_pmu_cancel_txn(const struct pmu *pmu)
+void power_pmu_cancel_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
+	perf_pmu_enable(pmu);
 }
 
 /*
@@ -882,7 +918,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
  * Perform the group schedulability test as a whole
  * Return 0 if success
  */
-int power_pmu_commit_txn(const struct pmu *pmu)
+int power_pmu_commit_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	long i, n;
@@ -901,19 +937,10 @@ int power_pmu_commit_txn(const struct pmu *pmu)
 		cpuhw->event[i]->hw.config = cpuhw->events[i];
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
+	perf_pmu_enable(pmu);
 	return 0;
 }
 
-struct pmu power_pmu = {
-	.enable		= power_pmu_enable,
-	.disable	= power_pmu_disable,
-	.read		= power_pmu_read,
-	.unthrottle	= power_pmu_unthrottle,
-	.start_txn	= power_pmu_start_txn,
-	.cancel_txn	= power_pmu_cancel_txn,
-	.commit_txn	= power_pmu_commit_txn,
-};
-
 /*
  * Return 1 if we might be able to put event on a limited PMC,
  * or 0 if not.
@@ -1014,7 +1041,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
 	return 0;
 }
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int power_pmu_event_init(struct perf_event *event)
 {
 	u64 ev;
 	unsigned long flags;
@@ -1026,25 +1053,27 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	struct cpu_hw_events *cpuhw;
 
 	if (!ppmu)
-		return ERR_PTR(-ENXIO);
+		return -ENOENT;
+
 	switch (event->attr.type) {
 	case PERF_TYPE_HARDWARE:
 		ev = event->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return ERR_PTR(-EOPNOTSUPP);
+			return -EOPNOTSUPP;
 		ev = ppmu->generic_events[ev];
 		break;
 	case PERF_TYPE_HW_CACHE:
 		err = hw_perf_cache_event(event->attr.config, &ev);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 		break;
 	case PERF_TYPE_RAW:
 		ev = event->attr.config;
 		break;
 	default:
-		return ERR_PTR(-EINVAL);
+		return -ENOENT;
 	}
+
 	event->hw.config_base = ev;
 	event->hw.idx = 0;
 
@@ -1063,7 +1092,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	 * XXX we should check if the task is an idle task.
 	 */
 	flags = 0;
-	if (event->ctx->task)
+	if (event->attach_state & PERF_ATTACH_TASK)
 		flags |= PPMU_ONLY_COUNT_RUN;
 
 	/*
@@ -1081,7 +1110,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 			 */
 			ev = normal_pmc_alternative(ev, flags);
 			if (!ev)
-				return ERR_PTR(-EINVAL);
+				return -EINVAL;
 		}
 	}
 
@@ -1095,19 +1124,19 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 		n = collect_events(event->group_leader, ppmu->n_counter - 1,
 				   ctrs, events, cflags);
 		if (n < 0)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 	events[n] = ev;
 	ctrs[n] = event;
 	cflags[n] = flags;
 	if (check_excludes(ctrs, cflags, n, 1))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	cpuhw = &get_cpu_var(cpu_hw_events);
 	err = power_check_constraints(cpuhw, events, cflags, n + 1);
 	put_cpu_var(cpu_hw_events);
 	if (err)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	event->hw.config = events[n];
 	event->hw.event_base = cflags[n];
@@ -1132,11 +1161,23 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	}
 	event->destroy = hw_perf_event_destroy;
 
-	if (err)
-		return ERR_PTR(err);
-	return &power_pmu;
+	return err;
 }
 
+struct pmu power_pmu = {
+	.pmu_enable	= power_pmu_enable,
+	.pmu_disable	= power_pmu_disable,
+	.event_init	= power_pmu_event_init,
+	.add		= power_pmu_add,
+	.del		= power_pmu_del,
+	.start		= power_pmu_start,
+	.stop		= power_pmu_stop,
+	.read		= power_pmu_read,
+	.start_txn	= power_pmu_start_txn,
+	.cancel_txn	= power_pmu_cancel_txn,
+	.commit_txn	= power_pmu_commit_txn,
+};
+
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -1149,6 +1190,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	s64 prev, delta, left;
 	int record = 0;
 
+	if (event->hw.state & PERF_HES_STOPPED) {
+		write_pmc(event->hw.idx, 0);
+		return;
+	}
+
 	/* we don't have to worry about interrupts here */
 	prev = local64_read(&event->hw.prev_count);
 	delta = (val - prev) & 0xfffffffful;
@@ -1171,6 +1217,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			val = 0x80000000LL - left;
 	}
 
+	write_pmc(event->hw.idx, val);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
+	perf_event_update_userpage(event);
+
 	/*
 	 * Finally record data if requested.
 	 */
@@ -1183,23 +1234,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
 			perf_get_data_addr(regs, &data.addr);
 
-		if (perf_event_overflow(event, nmi, &data, regs)) {
-			/*
-			 * Interrupts are coming too fast - throttle them
-			 * by setting the event to 0, so it will be
-			 * at least 2^30 cycles until the next interrupt
-			 * (assuming each event counts at most 2 counts
-			 * per cycle).
-			 */
-			val = 0;
-			left = ~0ULL >> 1;
-		}
+		if (perf_event_overflow(event, nmi, &data, regs))
+			power_pmu_stop(event, 0);
 	}
-
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
-	perf_event_update_userpage(event);
 }
 
 /*
@@ -1342,6 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu)
 		freeze_events_kernel = MMCR0_FCHV;
 #endif /* CONFIG_PPC64 */
 
+	perf_pmu_register(&power_pmu);
 	perf_cpu_notifier(power_pmu_notifier);
 
 	return 0;
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
index 1ba45471ae4..7ecca59ddf7 100644
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -156,6 +156,9 @@ static void fsl_emb_pmu_read(struct perf_event *event)
 {
 	s64 val, delta, prev;
 
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
 	/*
 	 * Performance monitor interrupts come even when interrupts
 	 * are soft-disabled, as long as interrupts are hard-enabled.
@@ -177,7 +180,7 @@ static void fsl_emb_pmu_read(struct perf_event *event)
  * Disable all events to prevent PMU interrupts and to allow
  * events to be added or removed.
  */
-void hw_perf_disable(void)
+static void fsl_emb_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -216,7 +219,7 @@ void hw_perf_disable(void)
  * If we were previously disabled and events were added, then
  * put the new config on the PMU.
  */
-void hw_perf_enable(void)
+static void fsl_emb_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -262,8 +265,8 @@ static int collect_events(struct perf_event *group, int max_count,
 	return n;
 }
 
-/* perf must be disabled, context locked on entry */
-static int fsl_emb_pmu_enable(struct perf_event *event)
+/* context locked on entry */
+static int fsl_emb_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuhw;
 	int ret = -EAGAIN;
@@ -271,6 +274,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 	u64 val;
 	int i;
 
+	perf_pmu_disable(event->pmu);
 	cpuhw = &get_cpu_var(cpu_hw_events);
 
 	if (event->hw.config & FSL_EMB_EVENT_RESTRICTED)
@@ -301,6 +305,12 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 			val = 0x80000000L - left;
 	}
 	local64_set(&event->hw.prev_count, val);
+
+	if (!(flags & PERF_EF_START)) {
+		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+		val = 0;
+	}
+
 	write_pmc(i, val);
 	perf_event_update_userpage(event);
 
@@ -310,15 +320,17 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 	ret = 0;
  out:
 	put_cpu_var(cpu_hw_events);
+	perf_pmu_enable(event->pmu);
 	return ret;
 }
 
-/* perf must be disabled, context locked on entry */
-static void fsl_emb_pmu_disable(struct perf_event *event)
+/* context locked on entry */
+static void fsl_emb_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuhw;
 	int i = event->hw.idx;
 
+	perf_pmu_disable(event->pmu);
 	if (i < 0)
 		goto out;
 
@@ -346,44 +358,57 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
 	cpuhw->n_events--;
 
  out:
+	perf_pmu_enable(event->pmu);
 	put_cpu_var(cpu_hw_events);
 }
 
-/*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
- *
- * Context is locked on entry, but perf is not disabled.
- */
-static void fsl_emb_pmu_unthrottle(struct perf_event *event)
+static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags)
 {
-	s64 val, left;
 	unsigned long flags;
+	s64 left;
 
 	if (event->hw.idx < 0 || !event->hw.sample_period)
 		return;
+
+	if (!(event->hw.state & PERF_HES_STOPPED))
+		return;
+
+	if (ef_flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
 	local_irq_save(flags);
-	perf_disable();
-	fsl_emb_pmu_read(event);
-	left = event->hw.sample_period;
-	event->hw.last_period = left;
-	val = 0;
-	if (left < 0x80000000L)
-		val = 0x80000000L - left;
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
+	perf_pmu_disable(event->pmu);
+
+	event->hw.state = 0;
+	left = local64_read(&event->hw.period_left);
+	write_pmc(event->hw.idx, left);
+
 	perf_event_update_userpage(event);
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
-static struct pmu fsl_emb_pmu = {
-	.enable		= fsl_emb_pmu_enable,
-	.disable	= fsl_emb_pmu_disable,
-	.read		= fsl_emb_pmu_read,
-	.unthrottle	= fsl_emb_pmu_unthrottle,
-};
+static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags)
+{
+	unsigned long flags;
+
+	if (event->hw.idx < 0 || !event->hw.sample_period)
+		return;
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	local_irq_save(flags);
+	perf_pmu_disable(event->pmu);
+
+	fsl_emb_pmu_read(event);
+	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	write_pmc(event->hw.idx, 0);
+
+	perf_event_update_userpage(event);
+	perf_pmu_enable(event->pmu);
+	local_irq_restore(flags);
+}
 
 /*
  * Release the PMU if this is the last perf_event.
@@ -428,7 +453,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
 	return 0;
 }
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int fsl_emb_pmu_event_init(struct perf_event *event)
 {
 	u64 ev;
 	struct perf_event *events[MAX_HWEVENTS];
@@ -441,14 +466,14 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	case PERF_TYPE_HARDWARE:
 		ev = event->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return ERR_PTR(-EOPNOTSUPP);
+			return -EOPNOTSUPP;
 		ev = ppmu->generic_events[ev];
 		break;
 
 	case PERF_TYPE_HW_CACHE:
 		err = hw_perf_cache_event(event->attr.config, &ev);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 		break;
 
 	case PERF_TYPE_RAW:
@@ -456,12 +481,12 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 		break;
 
 	default:
-		return ERR_PTR(-EINVAL);
+		return -ENOENT;
 	}
 
 	event->hw.config = ppmu->xlate_event(ev);
 	if (!(event->hw.config & FSL_EMB_EVENT_VALID))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	/*
 	 * If this is in a group, check if it can go on with all the
@@ -473,7 +498,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 		n = collect_events(event->group_leader,
 		                   ppmu->n_counter - 1, events);
 		if (n < 0)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 
 	if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) {
@@ -484,7 +509,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 		}
 
 		if (num_restricted >= ppmu->n_restricted)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 
 	event->hw.idx = -1;
@@ -497,7 +522,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	if (event->attr.exclude_kernel)
 		event->hw.config_base |= PMLCA_FCS;
 	if (event->attr.exclude_idle)
-		return ERR_PTR(-ENOTSUPP);
+		return -ENOTSUPP;
 
 	event->hw.last_period = event->hw.sample_period;
 	local64_set(&event->hw.period_left, event->hw.last_period);
@@ -523,11 +548,20 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	}
 	event->destroy = hw_perf_event_destroy;
 
-	if (err)
-		return ERR_PTR(err);
-	return &fsl_emb_pmu;
+	return err;
 }
 
+static struct pmu fsl_emb_pmu = {
+	.pmu_enable	= fsl_emb_pmu_enable,
+	.pmu_disable	= fsl_emb_pmu_disable,
+	.event_init	= fsl_emb_pmu_event_init,
+	.add		= fsl_emb_pmu_add,
+	.del		= fsl_emb_pmu_del,
+	.start		= fsl_emb_pmu_start,
+	.stop		= fsl_emb_pmu_stop,
+	.read		= fsl_emb_pmu_read,
+};
+
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -540,6 +574,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	s64 prev, delta, left;
 	int record = 0;
 
+	if (event->hw.state & PERF_HES_STOPPED) {
+		write_pmc(event->hw.idx, 0);
+		return;
+	}
+
 	/* we don't have to worry about interrupts here */
 	prev = local64_read(&event->hw.prev_count);
 	delta = (val - prev) & 0xfffffffful;
@@ -562,6 +601,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			val = 0x80000000LL - left;
 	}
 
+	write_pmc(event->hw.idx, val);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
+	perf_event_update_userpage(event);
+
 	/*
 	 * Finally record data if requested.
 	 */
@@ -571,23 +615,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
-		if (perf_event_overflow(event, nmi, &data, regs)) {
-			/*
-			 * Interrupts are coming too fast - throttle them
-			 * by setting the event to 0, so it will be
-			 * at least 2^30 cycles until the next interrupt
-			 * (assuming each event counts at most 2 counts
-			 * per cycle).
-			 */
-			val = 0;
-			left = ~0ULL >> 1;
-		}
+		if (perf_event_overflow(event, nmi, &data, regs))
+			fsl_emb_pmu_stop(event, 0);
 	}
-
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
-	perf_event_update_userpage(event);
 }
 
 static void perf_event_interrupt(struct pt_regs *regs)
@@ -651,5 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
 	pr_info("%s performance monitor hardware support registered\n",
 		pmu->name);
 
+	perf_pmu_register(&fsl_emb_pmu);
+
 	return 0;
 }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 8533b3b83f5..54888eb10c3 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
 #include <linux/delay.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <asm/trace.h>
 
 #include <asm/io.h>
@@ -493,60 +493,60 @@ void __init iSeries_time_init_early(void)
 }
 #endif /* CONFIG_PPC_ISERIES */
 
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
 
 /*
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
 #ifdef CONFIG_PPC64
-static inline unsigned long test_perf_event_pending(void)
+static inline unsigned long test_irq_work_pending(void)
 {
 	unsigned long x;
 
 	asm volatile("lbz %0,%1(13)"
 		: "=r" (x)
-		: "i" (offsetof(struct paca_struct, perf_event_pending)));
+		: "i" (offsetof(struct paca_struct, irq_work_pending)));
 	return x;
 }
 
-static inline void set_perf_event_pending_flag(void)
+static inline void set_irq_work_pending_flag(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (1),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
 }
 
-static inline void clear_perf_event_pending(void)
+static inline void clear_irq_work_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (0),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
 }
 
 #else /* 32-bit */
 
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
 
-#define set_perf_event_pending_flag()	__get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending()	__get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending()	__get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag()	__get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()		__get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()	__get_cpu_var(irq_work_pending) = 0
 
 #endif /* 32 vs 64 bit */
 
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
 {
 	preempt_disable();
-	set_perf_event_pending_flag();
+	set_irq_work_pending_flag();
 	set_dec(1);
 	preempt_enable();
 }
 
-#else  /* CONFIG_PERF_EVENTS */
+#else  /* CONFIG_IRQ_WORK */
 
-#define test_perf_event_pending()	0
-#define clear_perf_event_pending()
+#define test_irq_work_pending()	0
+#define clear_irq_work_pending()
 
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
 
 /*
  * For iSeries shared processors, we have to let the hypervisor
@@ -587,9 +587,9 @@ void timer_interrupt(struct pt_regs * regs)
 
 	calculate_steal_time();
 
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
+	if (test_irq_work_pending()) {
+		clear_irq_work_pending();
+		irq_work_run();
 	}
 
 #ifdef CONFIG_PPC_ISERIES
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f0777a47e3a..75976a14194 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -95,6 +95,7 @@ config S390
 	select HAVE_KVM if 64BIT
 	select HAVE_ARCH_TRACEHOOK
 	select INIT_ALL_POSSIBLE
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
@@ -198,6 +199,13 @@ config HOTPLUG_CPU
 	  can be controlled through /sys/devices/system/cpu/cpu#.
 	  Say N if you want to disable CPU hotplug.
 
+config SCHED_BOOK
+	bool "Book scheduler support"
+	depends on SMP
+	help
+	  Book scheduler support improves the CPU scheduler's decision making
+	  when dealing with machines that have several books.
+
 config MATHEMU
 	bool "IEEE FPU emulation"
 	depends on MARCH_G5
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index 498bc389238..881d94590ae 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -12,10 +12,6 @@
 #ifndef __ASM_HARDIRQ_H
 #define __ASM_HARDIRQ_H
 
-#include <linux/threads.h>
-#include <linux/sched.h>
-#include <linux/cache.h>
-#include <linux/interrupt.h>
 #include <asm/lowcore.h>
 
 #define local_softirq_pending() (S390_lowcore.softirq_pending)
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 3840cbe7763..a75f168d271 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -4,7 +4,6 @@
  * Copyright 2009 Martin Schwidefsky, IBM Corporation.
  */
 
-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
+/* Empty, just to avoid compiling error */
 
 #define PERF_EVENT_INDEX_OFFSET 0
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index cef66210c84..38ddd8a9a9e 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs)
 
 extern void account_vtime(struct task_struct *, struct task_struct *);
 extern void account_tick_vtime(struct task_struct *);
-extern void account_system_vtime(struct task_struct *);
 
 #ifdef CONFIG_PFAULT
 extern void pfault_irq_init(void);
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 831bd033ea7..051107a2c5e 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -3,15 +3,32 @@
 
 #include <linux/cpumask.h>
 
-#define mc_capable()	(1)
-
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
-
 extern unsigned char cpu_core_id[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 
+static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+{
+	return &cpu_core_map[cpu];
+}
+
 #define topology_core_id(cpu)		(cpu_core_id[cpu])
 #define topology_core_cpumask(cpu)	(&cpu_core_map[cpu])
+#define mc_capable()			(1)
+
+#ifdef CONFIG_SCHED_BOOK
+
+extern unsigned char cpu_book_id[NR_CPUS];
+extern cpumask_t cpu_book_map[NR_CPUS];
+
+static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
+{
+	return &cpu_book_map[cpu];
+}
+
+#define topology_book_id(cpu)		(cpu_book_id[cpu])
+#define topology_book_cpumask(cpu)	(&cpu_book_map[cpu])
+
+#endif /* CONFIG_SCHED_BOOK */
 
 int topology_set_cpu_management(int fc);
 void topology_schedule_update(void);
@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
 };
 #endif
 
+#define SD_BOOK_INIT	SD_CPU_INIT
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index bcef00766a6..13559c99384 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -57,8 +57,8 @@ struct tl_info {
 	union tl_entry tle[0];
 };
 
-struct core_info {
-	struct core_info *next;
+struct mask_info {
+	struct mask_info *next;
 	unsigned char id;
 	cpumask_t mask;
 };
@@ -66,7 +66,6 @@ struct core_info {
 static int topology_enabled;
 static void topology_work_fn(struct work_struct *work);
 static struct tl_info *tl_info;
-static struct core_info core_info;
 static int machine_has_topology;
 static struct timer_list topology_timer;
 static void set_topology_timer(void);
@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
 /* topology_lock protects the core linked list */
 static DEFINE_SPINLOCK(topology_lock);
 
+static struct mask_info core_info;
 cpumask_t cpu_core_map[NR_CPUS];
 unsigned char cpu_core_id[NR_CPUS];
 
-static cpumask_t cpu_coregroup_map(unsigned int cpu)
+#ifdef CONFIG_SCHED_BOOK
+static struct mask_info book_info;
+cpumask_t cpu_book_map[NR_CPUS];
+unsigned char cpu_book_id[NR_CPUS];
+#endif
+
+static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
 {
-	struct core_info *core = &core_info;
-	unsigned long flags;
 	cpumask_t mask;
 
 	cpus_clear(mask);
 	if (!topology_enabled || !machine_has_topology)
 		return cpu_possible_map;
-	spin_lock_irqsave(&topology_lock, flags);
-	while (core) {
-		if (cpu_isset(cpu, core->mask)) {
-			mask = core->mask;
+	while (info) {
+		if (cpu_isset(cpu, info->mask)) {
+			mask = info->mask;
 			break;
 		}
-		core = core->next;
+		info = info->next;
 	}
-	spin_unlock_irqrestore(&topology_lock, flags);
 	if (cpus_empty(mask))
 		mask = cpumask_of_cpu(cpu);
 	return mask;
 }
 
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
-{
-	return &cpu_core_map[cpu];
-}
-
-static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
+static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
+			     struct mask_info *core)
 {
 	unsigned int cpu;
 
@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
 
 		rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
 		for_each_present_cpu(lcpu) {
-			if (cpu_logical_map(lcpu) == rcpu) {
-				cpu_set(lcpu, core->mask);
-				cpu_core_id[lcpu] = core->id;
-				smp_cpu_polarization[lcpu] = tl_cpu->pp;
-			}
+			if (cpu_logical_map(lcpu) != rcpu)
+				continue;
+#ifdef CONFIG_SCHED_BOOK
+			cpu_set(lcpu, book->mask);
+			cpu_book_id[lcpu] = book->id;
+#endif
+			cpu_set(lcpu, core->mask);
+			cpu_core_id[lcpu] = core->id;
+			smp_cpu_polarization[lcpu] = tl_cpu->pp;
 		}
 	}
 }
 
-static void clear_cores(void)
+static void clear_masks(void)
 {
-	struct core_info *core = &core_info;
+	struct mask_info *info;
 
-	while (core) {
-		cpus_clear(core->mask);
-		core = core->next;
+	info = &core_info;
+	while (info) {
+		cpus_clear(info->mask);
+		info = info->next;
+	}
+#ifdef CONFIG_SCHED_BOOK
+	info = &book_info;
+	while (info) {
+		cpus_clear(info->mask);
+		info = info->next;
 	}
+#endif
 }
 
 static union tl_entry *next_tle(union tl_entry *tle)
@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
 
 static void tl_to_cores(struct tl_info *info)
 {
+#ifdef CONFIG_SCHED_BOOK
+	struct mask_info *book = &book_info;
+#else
+	struct mask_info *book = NULL;
+#endif
+	struct mask_info *core = &core_info;
 	union tl_entry *tle, *end;
-	struct core_info *core = &core_info;
+
 
 	spin_lock_irq(&topology_lock);
-	clear_cores();
+	clear_masks();
 	tle = info->tle;
 	end = (union tl_entry *)((unsigned long)info + info->length);
 	while (tle < end) {
 		switch (tle->nl) {
-		case 5:
-		case 4:
-		case 3:
+#ifdef CONFIG_SCHED_BOOK
 		case 2:
+			book = book->next;
+			book->id = tle->container.id;
 			break;
+#endif
 		case 1:
 			core = core->next;
 			core->id = tle->container.id;
 			break;
 		case 0:
-			add_cpus_to_core(&tle->cpu, core);
+			add_cpus_to_mask(&tle->cpu, book, core);
 			break;
 		default:
-			clear_cores();
+			clear_masks();
 			machine_has_topology = 0;
 			goto out;
 		}
@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
 
 static void update_cpu_core_map(void)
 {
+	unsigned long flags;
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		cpu_core_map[cpu] = cpu_coregroup_map(cpu);
+	spin_lock_irqsave(&topology_lock, flags);
+	for_each_possible_cpu(cpu) {
+		cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
+#ifdef CONFIG_SCHED_BOOK
+		cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
+#endif
+	}
+	spin_unlock_irqrestore(&topology_lock, flags);
+}
+
+static void store_topology(struct tl_info *info)
+{
+#ifdef CONFIG_SCHED_BOOK
+	int rc;
+
+	rc = stsi(info, 15, 1, 3);
+	if (rc != -ENOSYS)
+		return;
+#endif
+	stsi(info, 15, 1, 2);
 }
 
 int arch_update_cpu_topology(void)
@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
 		topology_update_polarization_simple();
 		return 0;
 	}
-	stsi(info, 15, 1, 2);
+	store_topology(info);
 	tl_to_cores(info);
 	update_cpu_core_map();
 	for_each_online_cpu(cpu) {
@@ -299,12 +335,24 @@ out:
 }
 __initcall(init_topology_update);
 
+static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
+{
+	int i, nr_masks;
+
+	nr_masks = info->mag[NR_MAG - offset];
+	for (i = 0; i < info->mnest - offset; i++)
+		nr_masks *= info->mag[NR_MAG - offset - 1 - i];
+	nr_masks = max(nr_masks, 1);
+	for (i = 0; i < nr_masks; i++) {
+		mask->next = alloc_bootmem(sizeof(struct mask_info));
+		mask = mask->next;
+	}
+}
+
 void __init s390_init_cpu_topology(void)
 {
 	unsigned long long facility_bits;
 	struct tl_info *info;
-	struct core_info *core;
-	int nr_cores;
 	int i;
 
 	if (stfle(&facility_bits, 1) <= 0)
@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
 
 	tl_info = alloc_bootmem_pages(PAGE_SIZE);
 	info = tl_info;
-	stsi(info, 15, 1, 2);
-
-	nr_cores = info->mag[NR_MAG - 2];
-	for (i = 0; i < info->mnest - 2; i++)
-		nr_cores *= info->mag[NR_MAG - 3 - i];
-
+	store_topology(info);
 	pr_info("The CPU configuration topology of the machine is:");
 	for (i = 0; i < NR_MAG; i++)
 		printk(" %d", info->mag[i]);
 	printk(" / %d\n", info->mnest);
-
-	core = &core_info;
-	for (i = 0; i < nr_cores; i++) {
-		core->next = alloc_bootmem(sizeof(struct core_info));
-		core = core->next;
-		if (!core)
-			goto error;
-	}
-	return;
-error:
-	machine_has_topology = 0;
+	alloc_masks(info, &core_info, 2);
+#ifdef CONFIG_SCHED_BOOK
+	alloc_masks(info, &book_info, 3);
+#endif
 }
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 33990fa95af..35b6879628a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
@@ -249,6 +250,11 @@ config ARCH_SHMOBILE
 	select PM
 	select PM_RUNTIME
 
+config CPU_HAS_PMU
+       depends on CPU_SH4 || CPU_SH4A
+       default y
+       bool
+
 if SUPERH32
 
 choice
@@ -738,6 +744,14 @@ config GUSA_RB
 	  LLSC, this should be more efficient than the other alternative of
 	  disabling interrupts around the atomic sequence.
 
+config HW_PERF_EVENTS
+	bool "Enable hardware performance counter support for perf events"
+	depends on PERF_EVENTS && CPU_HAS_PMU
+	default y
+	help
+	  Enable hardware performance counter support for perf events. If
+	  disabled, perf events will use software events only.
+
 source "drivers/sh/Kconfig"
 
 endmenu
diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h
index 3d0c9f36d15..14308bed7ea 100644
--- a/arch/sh/include/asm/perf_event.h
+++ b/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,4 @@ extern int register_sh_pmu(struct sh_pmu *);
 extern int reserve_pmc_hardware(void);
 extern void release_pmc_hardware(void);
 
-static inline void set_perf_event_pending(void)
-{
-	/* Nothing to see here, move along. */
-}
-
-#define PERF_EVENT_INDEX_OFFSET	0
-
 #endif /* __ASM_SH_PERF_EVENT_H */
diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c
index a9dd3abde28..d5ca1ef50fa 100644
--- a/arch/sh/kernel/perf_callchain.c
+++ b/arch/sh/kernel/perf_callchain.c
@@ -14,11 +14,6 @@
 #include <asm/unwinder.h>
 #include <asm/ptrace.h>
 
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-	if (entry->nr < PERF_MAX_STACK_DEPTH)
-		entry->ip[entry->nr++] = ip;
-}
 
 static void callchain_warning(void *data, char *msg)
 {
@@ -39,7 +34,7 @@ static void callchain_address(void *data, unsigned long addr, int reliable)
 	struct perf_callchain_entry *entry = data;
 
 	if (reliable)
-		callchain_store(entry, addr);
+		perf_callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops callchain_ops = {
@@ -49,47 +44,10 @@ static const struct stacktrace_ops callchain_ops = {
 	.address	= callchain_address,
 };
 
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	callchain_store(entry, PERF_CONTEXT_KERNEL);
-	callchain_store(entry, regs->pc);
+	perf_callchain_store(entry, regs->pc);
 
 	unwind_stack(NULL, regs, NULL, &callchain_ops, entry);
 }
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-	int is_user;
-
-	if (!regs)
-		return;
-
-	is_user = user_mode(regs);
-
-	if (is_user && current->state != TASK_RUNNING)
-		return;
-
-	/*
-	 * Only the kernel side is implemented for now.
-	 */
-	if (!is_user)
-		perf_callchain_kernel(regs, entry);
-}
-
-/*
- * No need for separate IRQ and NMI entries.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-	struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
-	entry->nr = 0;
-
-	perf_do_callchain(regs, entry);
-
-	return entry;
-}
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 7a3dc356725..5a4b3343565 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -59,6 +59,24 @@ static inline int sh_pmu_initialized(void)
 	return !!sh_pmu;
 }
 
+const char *perf_pmu_name(void)
+{
+	if (!sh_pmu)
+		return NULL;
+
+	return sh_pmu->name;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_name);
+
+int perf_num_counters(void)
+{
+	if (!sh_pmu)
+		return 0;
+
+	return sh_pmu->num_events;
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
 /*
  * Release the PMU if this is the last perf_event.
  */
@@ -206,50 +224,80 @@ again:
 	local64_add(delta, &event->count);
 }
 
-static void sh_pmu_disable(struct perf_event *event)
+static void sh_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
 
-	clear_bit(idx, cpuc->active_mask);
-	sh_pmu->disable(hwc, idx);
+	if (!(event->hw.state & PERF_HES_STOPPED)) {
+		sh_pmu->disable(hwc, idx);
+		cpuc->events[idx] = NULL;
+		event->hw.state |= PERF_HES_STOPPED;
+	}
+
+	if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
+		sh_perf_event_update(event, &event->hw, idx);
+		event->hw.state |= PERF_HES_UPTODATE;
+	}
+}
+
+static void sh_pmu_start(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	if (WARN_ON_ONCE(idx == -1))
+		return;
+
+	if (flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
 
-	barrier();
+	cpuc->events[idx] = event;
+	event->hw.state = 0;
+	sh_pmu->enable(hwc, idx);
+}
 
-	sh_perf_event_update(event, &event->hw, idx);
+static void sh_pmu_del(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->events[idx] = NULL;
-	clear_bit(idx, cpuc->used_mask);
+	sh_pmu_stop(event, PERF_EF_UPDATE);
+	__clear_bit(event->hw.idx, cpuc->used_mask);
 
 	perf_event_update_userpage(event);
 }
 
-static int sh_pmu_enable(struct perf_event *event)
+static int sh_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
+	int ret = -EAGAIN;
+
+	perf_pmu_disable(event->pmu);
 
-	if (test_and_set_bit(idx, cpuc->used_mask)) {
+	if (__test_and_set_bit(idx, cpuc->used_mask)) {
 		idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
 		if (idx == sh_pmu->num_events)
-			return -EAGAIN;
+			goto out;
 
-		set_bit(idx, cpuc->used_mask);
+		__set_bit(idx, cpuc->used_mask);
 		hwc->idx = idx;
 	}
 
 	sh_pmu->disable(hwc, idx);
 
-	cpuc->events[idx] = event;
-	set_bit(idx, cpuc->active_mask);
-
-	sh_pmu->enable(hwc, idx);
+	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (flags & PERF_EF_START)
+		sh_pmu_start(event, PERF_EF_RELOAD);
 
 	perf_event_update_userpage(event);
-
-	return 0;
+	ret = 0;
+out:
+	perf_pmu_enable(event->pmu);
+	return ret;
 }
 
 static void sh_pmu_read(struct perf_event *event)
@@ -257,24 +305,56 @@ static void sh_pmu_read(struct perf_event *event)
 	sh_perf_event_update(event, &event->hw, event->hw.idx);
 }
 
-static const struct pmu pmu = {
-	.enable		= sh_pmu_enable,
-	.disable	= sh_pmu_disable,
-	.read		= sh_pmu_read,
-};
-
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int sh_pmu_event_init(struct perf_event *event)
 {
-	int err = __hw_perf_event_init(event);
+	int err;
+
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HW_CACHE:
+	case PERF_TYPE_HARDWARE:
+		err = __hw_perf_event_init(event);
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
 	if (unlikely(err)) {
 		if (event->destroy)
 			event->destroy(event);
-		return ERR_PTR(err);
 	}
 
-	return &pmu;
+	return err;
+}
+
+static void sh_pmu_enable(struct pmu *pmu)
+{
+	if (!sh_pmu_initialized())
+		return;
+
+	sh_pmu->enable_all();
+}
+
+static void sh_pmu_disable(struct pmu *pmu)
+{
+	if (!sh_pmu_initialized())
+		return;
+
+	sh_pmu->disable_all();
 }
 
+static struct pmu pmu = {
+	.pmu_enable	= sh_pmu_enable,
+	.pmu_disable	= sh_pmu_disable,
+	.event_init	= sh_pmu_event_init,
+	.add		= sh_pmu_add,
+	.del		= sh_pmu_del,
+	.start		= sh_pmu_start,
+	.stop		= sh_pmu_stop,
+	.read		= sh_pmu_read,
+};
+
 static void sh_pmu_setup(int cpu)
 {
 	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
@@ -299,32 +379,17 @@ sh_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-void hw_perf_enable(void)
-{
-	if (!sh_pmu_initialized())
-		return;
-
-	sh_pmu->enable_all();
-}
-
-void hw_perf_disable(void)
-{
-	if (!sh_pmu_initialized())
-		return;
-
-	sh_pmu->disable_all();
-}
-
-int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
+int __cpuinit register_sh_pmu(struct sh_pmu *_pmu)
 {
 	if (sh_pmu)
 		return -EBUSY;
-	sh_pmu = pmu;
+	sh_pmu = _pmu;
 
-	pr_info("Performance Events: %s support registered\n", pmu->name);
+	pr_info("Performance Events: %s support registered\n", _pmu->name);
 
-	WARN_ON(pmu->num_events > MAX_HWEVENTS);
+	WARN_ON(_pmu->num_events > MAX_HWEVENTS);
 
+	perf_pmu_register(&pmu);
 	perf_cpu_notifier(sh_pmu_notifier);
 	return 0;
 }
diff --git a/arch/sh/oprofile/Makefile b/arch/sh/oprofile/Makefile
index 4886c5c1786..e85aae73e3d 100644
--- a/arch/sh/oprofile/Makefile
+++ b/arch/sh/oprofile/Makefile
@@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 		oprofilefs.o oprofile_stats.o \
 		timer_int.o )
 
+ifeq ($(CONFIG_HW_PERF_EVENTS),y)
+DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o)
+endif
+
 oprofile-y	:= $(DRIVER_OBJS) common.o backtrace.o
diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c
index ac604937f3e..e10d89376f9 100644
--- a/arch/sh/oprofile/common.c
+++ b/arch/sh/oprofile/common.c
@@ -17,114 +17,45 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/smp.h>
+#include <linux/perf_event.h>
 #include <asm/processor.h>
-#include "op_impl.h"
-
-static struct op_sh_model *model;
-
-static struct op_counter_config ctr[20];
 
+#ifdef CONFIG_HW_PERF_EVENTS
 extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth);
 
-static int op_sh_setup(void)
-{
-	/* Pre-compute the values to stuff in the hardware registers.  */
-	model->reg_setup(ctr);
-
-	/* Configure the registers on all cpus.  */
-	on_each_cpu(model->cpu_setup, NULL, 1);
-
-        return 0;
-}
-
-static int op_sh_create_files(struct super_block *sb, struct dentry *root)
+char *op_name_from_perf_id(void)
 {
-	int i, ret = 0;
+	const char *pmu;
+	char buf[20];
+	int size;
 
-	for (i = 0; i < model->num_counters; i++) {
-		struct dentry *dir;
-		char buf[4];
+	pmu = perf_pmu_name();
+	if (!pmu)
+		return NULL;
 
-		snprintf(buf, sizeof(buf), "%d", i);
-		dir = oprofilefs_mkdir(sb, root, buf);
+	size = snprintf(buf, sizeof(buf), "sh/%s", pmu);
+	if (size > -1 && size < sizeof(buf))
+		return buf;
 
-		ret |= oprofilefs_create_ulong(sb, dir, "enabled", &ctr[i].enabled);
-		ret |= oprofilefs_create_ulong(sb, dir, "event", &ctr[i].event);
-		ret |= oprofilefs_create_ulong(sb, dir, "kernel", &ctr[i].kernel);
-		ret |= oprofilefs_create_ulong(sb, dir, "user", &ctr[i].user);
-
-		if (model->create_files)
-			ret |= model->create_files(sb, dir);
-		else
-			ret |= oprofilefs_create_ulong(sb, dir, "count", &ctr[i].count);
-
-		/* Dummy entries */
-		ret |= oprofilefs_create_ulong(sb, dir, "unit_mask", &ctr[i].unit_mask);
-	}
-
-	return ret;
+	return NULL;
 }
 
-static int op_sh_start(void)
+int __init oprofile_arch_init(struct oprofile_operations *ops)
 {
-	/* Enable performance monitoring for all counters.  */
-	on_each_cpu(model->cpu_start, NULL, 1);
+	ops->backtrace = sh_backtrace;
 
-	return 0;
+	return oprofile_perf_init(ops);
 }
 
-static void op_sh_stop(void)
+void __exit oprofile_arch_exit(void)
 {
-	/* Disable performance monitoring for all counters.  */
-	on_each_cpu(model->cpu_stop, NULL, 1);
+	oprofile_perf_exit();
 }
-
+#else
 int __init oprofile_arch_init(struct oprofile_operations *ops)
 {
-	struct op_sh_model *lmodel = NULL;
-	int ret;
-
-	/*
-	 * Always assign the backtrace op. If the counter initialization
-	 * fails, we fall back to the timer which will still make use of
-	 * this.
-	 */
-	ops->backtrace = sh_backtrace;
-
-	/*
-	 * XXX
-	 *
-	 * All of the SH7750/SH-4A counters have been converted to perf,
-	 * this infrastructure hook is left for other users until they've
-	 * had a chance to convert over, at which point all of this
-	 * will be deleted.
-	 */
-
-	if (!lmodel)
-		return -ENODEV;
-	if (!(current_cpu_data.flags & CPU_HAS_PERF_COUNTER))
-		return -ENODEV;
-
-	ret = lmodel->init();
-	if (unlikely(ret != 0))
-		return ret;
-
-	model = lmodel;
-
-	ops->setup		= op_sh_setup;
-	ops->create_files	= op_sh_create_files;
-	ops->start		= op_sh_start;
-	ops->stop		= op_sh_stop;
-	ops->cpu_type		= lmodel->cpu_type;
-
-	printk(KERN_INFO "oprofile: using %s performance monitoring.\n",
-	       lmodel->cpu_type);
-
-	return 0;
-}
-
-void oprofile_arch_exit(void)
-{
-	if (model && model->exit)
-		model->exit();
+	pr_info("oprofile: hardware counters not available\n");
+	return -ENODEV;
 }
+void __exit oprofile_arch_exit(void) {}
+#endif /* CONFIG_HW_PERF_EVENTS */
diff --git a/arch/sh/oprofile/op_impl.h b/arch/sh/oprofile/op_impl.h
deleted file mode 100644
index 1244479ceb2..00000000000
--- a/arch/sh/oprofile/op_impl.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef __OP_IMPL_H
-#define __OP_IMPL_H
-
-/* Per-counter configuration as set via oprofilefs.  */
-struct op_counter_config {
-	unsigned long enabled;
-	unsigned long event;
-
-	unsigned long count;
-
-	/* Dummy values for userspace tool compliance */
-	unsigned long kernel;
-	unsigned long user;
-	unsigned long unit_mask;
-};
-
-/* Per-architecture configury and hooks.  */
-struct op_sh_model {
-	void (*reg_setup)(struct op_counter_config *);
-	int (*create_files)(struct super_block *sb, struct dentry *dir);
-	void (*cpu_setup)(void *dummy);
-	int (*init)(void);
-	void (*exit)(void);
-	void (*cpu_start)(void *args);
-	void (*cpu_stop)(void *args);
-	char *cpu_type;
-	unsigned char num_counters;
-};
-
-/* arch/sh/oprofile/common.c */
-extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth);
-
-#endif /* __OP_IMPL_H */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 491e9d6de19..3e9d31401fb 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,10 +26,12 @@ config SPARC
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select RTC_CLASS
 	select RTC_DRV_M48T59
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_API_DEBUG
+	select HAVE_ARCH_JUMP_LABEL
 
 config SPARC32
 	def_bool !64BIT
@@ -53,6 +55,7 @@ config SPARC64
 	select RTC_DRV_BQ4802
 	select RTC_DRV_SUN4V
 	select RTC_DRV_STARFIRE
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 
diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h
new file mode 100644
index 00000000000..62e66d7b2fb
--- /dev/null
+++ b/arch/sparc/include/asm/jump_label.h
@@ -0,0 +1,32 @@
+#ifndef _ASM_SPARC_JUMP_LABEL_H
+#define _ASM_SPARC_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm/system.h>
+
+#define JUMP_LABEL_NOP_SIZE 4
+
+#define JUMP_LABEL(key, label)					\
+	do {							\
+		asm goto("1:\n\t"				\
+			 "nop\n\t"				\
+			 "nop\n\t"				\
+			 ".pushsection __jump_table,  \"a\"\n\t"\
+			 ".word 1b, %l[" #label "], %c0\n\t"	\
+			 ".popsection \n\t"			\
+			 : :  "i" (key) :  : label);\
+	} while (0)
+
+#endif /* __KERNEL__ */
+
+typedef u32 jump_label_t;
+
+struct jump_entry {
+	jump_label_t code;
+	jump_label_t target;
+	jump_label_t key;
+};
+
+#endif
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h
index 727af70646c..6e8bfa1786d 100644
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -1,10 +1,6 @@
 #ifndef __ASM_SPARC_PERF_EVENT_H
 #define __ASM_SPARC_PERF_EVENT_H
 
-extern void set_perf_event_pending(void);
-
-#define	PERF_EVENT_INDEX_OFFSET	0
-
 #ifdef CONFIG_PERF_EVENTS
 #include <asm/ptrace.h>
 
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 0c2dc1f24a9..599398fbbc7 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -119,3 +119,5 @@ obj-$(CONFIG_COMPAT)    += $(audit--y)
 
 pc--$(CONFIG_PERF_EVENTS) := perf_event.o
 obj-$(CONFIG_SPARC64)	+= $(pc--y)
+
+obj-$(CONFIG_SPARC64)	+= jump_label.o
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
new file mode 100644
index 00000000000..ea2dafc93d7
--- /dev/null
+++ b/arch/sparc/kernel/jump_label.c
@@ -0,0 +1,47 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+void arch_jump_label_transform(struct jump_entry *entry,
+			       enum jump_label_type type)
+{
+	u32 val;
+	u32 *insn = (u32 *) (unsigned long) entry->code;
+
+	if (type == JUMP_LABEL_ENABLE) {
+		s32 off = (s32)entry->target - (s32)entry->code;
+
+#ifdef CONFIG_SPARC64
+		/* ba,pt %xcc, . + (off << 2) */
+		val = 0x10680000 | ((u32) off >> 2);
+#else
+		/* ba . + (off << 2) */
+		val = 0x10800000 | ((u32) off >> 2);
+#endif
+	} else {
+		val = 0x01000000;
+	}
+
+	get_online_cpus();
+	mutex_lock(&text_mutex);
+	*insn = val;
+	flushi(insn);
+	mutex_unlock(&text_mutex);
+	put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+	u32 *insn_p = (u32 *) (unsigned long) addr;
+
+	*insn_p = 0x01000000;
+	flushi(insn_p);
+}
+
+#endif
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index f848aadf54d..ee3c7dde8d9 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -18,6 +18,9 @@
 #include <asm/spitfire.h>
 
 #ifdef CONFIG_SPARC64
+
+#include <linux/jump_label.h>
+
 static void *module_map(unsigned long size)
 {
 	struct vm_struct *area;
@@ -227,6 +230,9 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    const Elf_Shdr *sechdrs,
 		    struct module *me)
 {
+	/* make jump label nops */
+	jump_label_apply_nops(me);
+
 	/* Cheetah's I-cache is fully coherent.  */
 	if (tlb_type == spitfire) {
 		unsigned long va;
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index c4a6a50b484..b87873c0e8e 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
 #include <linux/init.h>
 #include <linux/irq.h>
 
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <linux/ftrace.h>
 
 #include <asm/pil.h>
@@ -43,14 +43,14 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs)
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
-#ifdef CONFIG_PERF_EVENTS
-	perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+	irq_work_run();
 #endif
 	irq_exit();
 	set_irq_regs(old_regs);
 }
 
-void set_perf_event_pending(void)
+void arch_irq_work_raise(void)
 {
 	set_softint(1 << PIL_DEFERRED_PCR_WORK);
 }
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6318e622cfb..0d6deb55a2a 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -658,13 +658,16 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
 
 		enc = perf_event_get_enc(cpuc->events[i]);
 		pcr &= ~mask_for_index(idx);
-		pcr |= event_encoding(enc, idx);
+		if (hwc->state & PERF_HES_STOPPED)
+			pcr |= nop_for_index(idx);
+		else
+			pcr |= event_encoding(enc, idx);
 	}
 out:
 	return pcr;
 }
 
-void hw_perf_enable(void)
+static void sparc_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	u64 pcr;
@@ -691,7 +694,7 @@ void hw_perf_enable(void)
 	pcr_ops->write(cpuc->pcr);
 }
 
-void hw_perf_disable(void)
+static void sparc_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	u64 val;
@@ -710,19 +713,65 @@ void hw_perf_disable(void)
 	pcr_ops->write(cpuc->pcr);
 }
 
-static void sparc_pmu_disable(struct perf_event *event)
+static int active_event_index(struct cpu_hw_events *cpuc,
+			      struct perf_event *event)
+{
+	int i;
+
+	for (i = 0; i < cpuc->n_events; i++) {
+		if (cpuc->event[i] == event)
+			break;
+	}
+	BUG_ON(i == cpuc->n_events);
+	return cpuc->current_idx[i];
+}
+
+static void sparc_pmu_start(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx = active_event_index(cpuc, event);
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+		sparc_perf_event_set_period(event, &event->hw, idx);
+	}
+
+	event->hw.state = 0;
+
+	sparc_pmu_enable_event(cpuc, &event->hw, idx);
+}
+
+static void sparc_pmu_stop(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx = active_event_index(cpuc, event);
+
+	if (!(event->hw.state & PERF_HES_STOPPED)) {
+		sparc_pmu_disable_event(cpuc, &event->hw, idx);
+		event->hw.state |= PERF_HES_STOPPED;
+	}
+
+	if (!(event->hw.state & PERF_HES_UPTODATE) && (flags & PERF_EF_UPDATE)) {
+		sparc_perf_event_update(event, &event->hw, idx);
+		event->hw.state |= PERF_HES_UPTODATE;
+	}
+}
+
+static void sparc_pmu_del(struct perf_event *event, int _flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct hw_perf_event *hwc = &event->hw;
 	unsigned long flags;
 	int i;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	for (i = 0; i < cpuc->n_events; i++) {
 		if (event == cpuc->event[i]) {
-			int idx = cpuc->current_idx[i];
+			/* Absorb the final count and turn off the
+			 * event.
+			 */
+			sparc_pmu_stop(event, PERF_EF_UPDATE);
 
 			/* Shift remaining entries down into
 			 * the existing slot.
@@ -734,13 +783,6 @@ static void sparc_pmu_disable(struct perf_event *event)
 					cpuc->current_idx[i];
 			}
 
-			/* Absorb the final count and turn off the
-			 * event.
-			 */
-			sparc_pmu_disable_event(cpuc, hwc, idx);
-			barrier();
-			sparc_perf_event_update(event, hwc, idx);
-
 			perf_event_update_userpage(event);
 
 			cpuc->n_events--;
@@ -748,23 +790,10 @@ static void sparc_pmu_disable(struct perf_event *event)
 		}
 	}
 
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
-static int active_event_index(struct cpu_hw_events *cpuc,
-			      struct perf_event *event)
-{
-	int i;
-
-	for (i = 0; i < cpuc->n_events; i++) {
-		if (cpuc->event[i] == event)
-			break;
-	}
-	BUG_ON(i == cpuc->n_events);
-	return cpuc->current_idx[i];
-}
-
 static void sparc_pmu_read(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -774,15 +803,6 @@ static void sparc_pmu_read(struct perf_event *event)
 	sparc_perf_event_update(event, hwc, idx);
 }
 
-static void sparc_pmu_unthrottle(struct perf_event *event)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	int idx = active_event_index(cpuc, event);
-	struct hw_perf_event *hwc = &event->hw;
-
-	sparc_pmu_enable_event(cpuc, hwc, idx);
-}
-
 static atomic_t active_events = ATOMIC_INIT(0);
 static DEFINE_MUTEX(pmc_grab_mutex);
 
@@ -877,7 +897,7 @@ static int sparc_check_constraints(struct perf_event **evts,
 	if (!n_ev)
 		return 0;
 
-	if (n_ev > perf_max_events)
+	if (n_ev > MAX_HWEVENTS)
 		return -1;
 
 	msk0 = perf_event_get_msk(events[0]);
@@ -984,23 +1004,27 @@ static int collect_events(struct perf_event *group, int max_count,
 	return n;
 }
 
-static int sparc_pmu_enable(struct perf_event *event)
+static int sparc_pmu_add(struct perf_event *event, int ef_flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int n0, ret = -EAGAIN;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	n0 = cpuc->n_events;
-	if (n0 >= perf_max_events)
+	if (n0 >= MAX_HWEVENTS)
 		goto out;
 
 	cpuc->event[n0] = event;
 	cpuc->events[n0] = event->hw.event_base;
 	cpuc->current_idx[n0] = PIC_NO_INDEX;
 
+	event->hw.state = PERF_HES_UPTODATE;
+	if (!(ef_flags & PERF_EF_START))
+		event->hw.state |= PERF_HES_STOPPED;
+
 	/*
 	 * If group events scheduling transaction was started,
 	 * skip the schedulability test here, it will be peformed
@@ -1020,12 +1044,12 @@ nocheck:
 
 	ret = 0;
 out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 	return ret;
 }
 
-static int __hw_perf_event_init(struct perf_event *event)
+static int sparc_pmu_event_init(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
 	struct perf_event *evts[MAX_HWEVENTS];
@@ -1038,22 +1062,33 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if (atomic_read(&nmi_active) < 0)
 		return -ENODEV;
 
-	pmap = NULL;
-	if (attr->type == PERF_TYPE_HARDWARE) {
+	switch (attr->type) {
+	case PERF_TYPE_HARDWARE:
 		if (attr->config >= sparc_pmu->max_events)
 			return -EINVAL;
 		pmap = sparc_pmu->event_map(attr->config);
-	} else if (attr->type == PERF_TYPE_HW_CACHE) {
+		break;
+
+	case PERF_TYPE_HW_CACHE:
 		pmap = sparc_map_cache_event(attr->config);
 		if (IS_ERR(pmap))
 			return PTR_ERR(pmap);
-	} else if (attr->type != PERF_TYPE_RAW)
-		return -EOPNOTSUPP;
+		break;
+
+	case PERF_TYPE_RAW:
+		pmap = NULL;
+		break;
+
+	default:
+		return -ENOENT;
+
+	}
 
 	if (pmap) {
 		hwc->event_base = perf_event_encode(pmap);
 	} else {
-		/* User gives us "(encoding << 16) | pic_mask" for
+		/*
+		 * User gives us "(encoding << 16) | pic_mask" for
 		 * PERF_TYPE_RAW events.
 		 */
 		hwc->event_base = attr->config;
@@ -1071,7 +1106,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	n = 0;
 	if (event->group_leader != event) {
 		n = collect_events(event->group_leader,
-				   perf_max_events - 1,
+				   MAX_HWEVENTS - 1,
 				   evts, events, current_idx_dmy);
 		if (n < 0)
 			return -EINVAL;
@@ -1107,10 +1142,11 @@ static int __hw_perf_event_init(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  */
-static void sparc_pmu_start_txn(const struct pmu *pmu)
+static void sparc_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
+	perf_pmu_disable(pmu);
 	cpuhw->group_flag |= PERF_EVENT_TXN;
 }
 
@@ -1119,11 +1155,12 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
  * Clear the flag and pmu::enable() will perform the
  * schedulability test.
  */
-static void sparc_pmu_cancel_txn(const struct pmu *pmu)
+static void sparc_pmu_cancel_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
+	perf_pmu_enable(pmu);
 }
 
 /*
@@ -1131,7 +1168,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
  * Perform the group schedulability test as a whole
  * Return 0 if success
  */
-static int sparc_pmu_commit_txn(const struct pmu *pmu)
+static int sparc_pmu_commit_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int n;
@@ -1147,28 +1184,24 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
 		return -EAGAIN;
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN;
+	perf_pmu_enable(pmu);
 	return 0;
 }
 
-static const struct pmu pmu = {
-	.enable		= sparc_pmu_enable,
-	.disable	= sparc_pmu_disable,
+static struct pmu pmu = {
+	.pmu_enable	= sparc_pmu_enable,
+	.pmu_disable	= sparc_pmu_disable,
+	.event_init	= sparc_pmu_event_init,
+	.add		= sparc_pmu_add,
+	.del		= sparc_pmu_del,
+	.start		= sparc_pmu_start,
+	.stop		= sparc_pmu_stop,
 	.read		= sparc_pmu_read,
-	.unthrottle	= sparc_pmu_unthrottle,
 	.start_txn	= sparc_pmu_start_txn,
 	.cancel_txn	= sparc_pmu_cancel_txn,
 	.commit_txn	= sparc_pmu_commit_txn,
 };
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
-{
-	int err = __hw_perf_event_init(event);
-
-	if (err)
-		return ERR_PTR(err);
-	return &pmu;
-}
-
 void perf_event_print_debug(void)
 {
 	unsigned long flags;
@@ -1244,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 			continue;
 
 		if (perf_event_overflow(event, 1, &data, regs))
-			sparc_pmu_disable_event(cpuc, hwc, idx);
+			sparc_pmu_stop(event, 0);
 	}
 
 	return NOTIFY_STOP;
@@ -1285,28 +1318,21 @@ void __init init_hw_perf_events(void)
 
 	pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
 
-	/* All sparc64 PMUs currently have 2 events.  */
-	perf_max_events = 2;
-
+	perf_pmu_register(&pmu);
 	register_die_notifier(&perf_event_nmi_notifier);
 }
 
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-	if (entry->nr < PERF_MAX_STACK_DEPTH)
-		entry->ip[entry->nr++] = ip;
-}
-
-static void perf_callchain_kernel(struct pt_regs *regs,
-				  struct perf_callchain_entry *entry)
+void perf_callchain_kernel(struct perf_callchain_entry *entry,
+			   struct pt_regs *regs)
 {
 	unsigned long ksp, fp;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	int graph = 0;
 #endif
 
-	callchain_store(entry, PERF_CONTEXT_KERNEL);
-	callchain_store(entry, regs->tpc);
+	stack_trace_flush();
+
+	perf_callchain_store(entry, regs->tpc);
 
 	ksp = regs->u_regs[UREG_I6];
 	fp = ksp + STACK_BIAS;
@@ -1330,13 +1356,13 @@ static void perf_callchain_kernel(struct pt_regs *regs,
 			pc = sf->callers_pc;
 			fp = (unsigned long)sf->fp + STACK_BIAS;
 		}
-		callchain_store(entry, pc);
+		perf_callchain_store(entry, pc);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 		if ((pc + 8UL) == (unsigned long) &return_to_handler) {
 			int index = current->curr_ret_stack;
 			if (current->ret_stack && index >= graph) {
 				pc = current->ret_stack[index - graph].ret;
-				callchain_store(entry, pc);
+				perf_callchain_store(entry, pc);
 				graph++;
 			}
 		}
@@ -1344,13 +1370,12 @@ static void perf_callchain_kernel(struct pt_regs *regs,
 	} while (entry->nr < PERF_MAX_STACK_DEPTH);
 }
 
-static void perf_callchain_user_64(struct pt_regs *regs,
-				   struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+				   struct pt_regs *regs)
 {
 	unsigned long ufp;
 
-	callchain_store(entry, PERF_CONTEXT_USER);
-	callchain_store(entry, regs->tpc);
+	perf_callchain_store(entry, regs->tpc);
 
 	ufp = regs->u_regs[UREG_I6] + STACK_BIAS;
 	do {
@@ -1363,17 +1388,16 @@ static void perf_callchain_user_64(struct pt_regs *regs,
 
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
-		callchain_store(entry, pc);
+		perf_callchain_store(entry, pc);
 	} while (entry->nr < PERF_MAX_STACK_DEPTH);
 }
 
-static void perf_callchain_user_32(struct pt_regs *regs,
-				   struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+				   struct pt_regs *regs)
 {
 	unsigned long ufp;
 
-	callchain_store(entry, PERF_CONTEXT_USER);
-	callchain_store(entry, regs->tpc);
+	perf_callchain_store(entry, regs->tpc);
 
 	ufp = regs->u_regs[UREG_I6] & 0xffffffffUL;
 	do {
@@ -1386,34 +1410,16 @@ static void perf_callchain_user_32(struct pt_regs *regs,
 
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp;
-		callchain_store(entry, pc);
+		perf_callchain_store(entry, pc);
 	} while (entry->nr < PERF_MAX_STACK_DEPTH);
 }
 
-/* Like powerpc we can't get PMU interrupts within the PMU handler,
- * so no need for separate NMI and IRQ chains as on x86.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
-	entry->nr = 0;
-	if (!user_mode(regs)) {
-		stack_trace_flush();
-		perf_callchain_kernel(regs, entry);
-		if (current->mm)
-			regs = task_pt_regs(current);
-		else
-			regs = NULL;
-	}
-	if (regs) {
-		flushw_user();
-		if (test_thread_flag(TIF_32BIT))
-			perf_callchain_user_32(regs, entry);
-		else
-			perf_callchain_user_64(regs, entry);
-	}
-	return entry;
+	flushw_user();
+	if (test_thread_flag(TIF_32BIT))
+		perf_callchain_user_32(entry, regs);
+	else
+		perf_callchain_user_64(entry, regs);
 }
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cea0cd9a316..8c9e609a175 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -25,6 +25,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_PERF_EVENTS if (!M386 && !M486)
+	select HAVE_IRQ_WORK
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -33,6 +34,7 @@ config X86
 	select HAVE_KRETPROBES
 	select HAVE_OPTPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_C_RECORDMCOUNT
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
@@ -59,6 +61,8 @@ config X86
 	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
+	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_TEXT_POKE_SMP
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -517,25 +521,6 @@ if PARAVIRT_GUEST
 
 source "arch/x86/xen/Kconfig"
 
-config VMI
-	bool "VMI Guest support (DEPRECATED)"
-	select PARAVIRT
-	depends on X86_32
-	---help---
-	  VMI provides a paravirtualized interface to the VMware ESX server
-	  (it could be used by other hypervisors in theory too, but is not
-	  at the moment), by linking the kernel to a GPL-ed ROM module
-	  provided by the hypervisor.
-
-	  As of September 2009, VMware has started a phased retirement
-	  of this feature from VMware's products. Please see
-	  feature-removal-schedule.txt for details.  If you are
-	  planning to enable this option, please note that you cannot
-	  live migrate a VMI enabled VM to a future VMware product,
-	  which doesn't support VMI. So if you expect your kernel to
-	  seamlessly migrate to newer VMware products, keep this
-	  disabled.
-
 config KVM_CLOCK
 	bool "KVM paravirtualized clock"
 	select PARAVIRT
@@ -670,7 +655,7 @@ config GART_IOMMU
 	bool "GART IOMMU support" if EMBEDDED
 	default y
 	select SWIOTLB
-	depends on X86_64 && PCI && K8_NB
+	depends on X86_64 && PCI && AMD_NB
 	---help---
 	  Support for full DMA access of devices with 32bit memory access only
 	  on systems with more than 3GB. This is usually needed for USB,
@@ -795,6 +780,17 @@ config SCHED_MC
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
+config IRQ_TIME_ACCOUNTING
+	bool "Fine granularity task level IRQ time accounting"
+	default n
+	---help---
+	  Select this option to enable fine granularity task irq time
+	  accounting. This is done by reading a timestamp on each
+	  transitions between softirq and hardirq state, so there can be a
+	  small performance impact.
+
+	  If in doubt, say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
@@ -1148,6 +1144,9 @@ config X86_PAE
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool X86_64 || X86_PAE
 
+config ARCH_DMA_ADDR_T_64BIT
+	def_bool X86_64 || HIGHMEM64G
+
 config DIRECT_GBPAGES
 	bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
 	default y
@@ -1326,25 +1325,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
 	  Set whether the default state of memory_corruption_check is
 	  on or off.
 
-config X86_RESERVE_LOW_64K
-	bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
-	default y
+config X86_RESERVE_LOW
+	int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+	default 64
+	range 4 640
 	---help---
-	  Reserve the first 64K of physical RAM on BIOSes that are known
-	  to potentially corrupt that memory range. A numbers of BIOSes are
-	  known to utilize this area during suspend/resume, so it must not
-	  be used by the kernel.
+	  Specify the amount of low memory to reserve for the BIOS.
+
+	  The first page contains BIOS data structures that the kernel
+	  must not use, so that page must always be reserved.
+
+	  By default we reserve the first 64K of physical RAM, as a
+	  number of BIOSes are known to corrupt that memory range
+	  during events such as suspend/resume or monitor cable
+	  insertion, so it must not be used by the kernel.
 
-	  Set this to N if you are absolutely sure that you trust the BIOS
-	  to get all its memory reservations and usages right.
+	  You can set this to 4 if you are absolutely sure that you
+	  trust the BIOS to get all its memory reservations and usages
+	  right.  If you know your BIOS have problems beyond the
+	  default 64K area, you can set this to 640 to avoid using the
+	  entire low memory range.
 
-	  If you have doubts about the BIOS (e.g. suspend/resume does not
-	  work or there's kernel crashes after certain hardware hotplug
-	  events) and it's not AMI or Phoenix, then you might want to enable
-	  X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
-	  corruption patterns.
+	  If you have doubts about the BIOS (e.g. suspend/resume does
+	  not work or there's kernel crashes after certain hardware
+	  hotplug events) then you might want to enable
+	  X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+	  typical corruption patterns.
 
-	  Say Y if unsure.
+	  Leave this to the default value of 64 if you are unsure.
 
 config MATH_EMULATION
 	bool
@@ -1900,7 +1908,7 @@ config PCI_GODIRECT
 	bool "Direct"
 
 config PCI_GOOLPC
-	bool "OLPC"
+	bool "OLPC XO-1"
 	depends on OLPC
 
 config PCI_GOANY
@@ -2061,14 +2069,21 @@ config SCx200HR_TIMER
 config OLPC
 	bool "One Laptop Per Child support"
 	select GPIOLIB
+	select OLPC_OPENFIRMWARE
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
 
+config OLPC_XO1
+	tristate "OLPC XO-1 support"
+	depends on OLPC && PCI
+	---help---
+	  Add support for non-essential features of the OLPC XO-1 laptop.
+
 config OLPC_OPENFIRMWARE
 	bool "Support for OLPC's Open Firmware"
 	depends on !X86_64 && !X86_PAE
-	default y if OLPC
+	default n
 	help
 	  This option adds support for the implementation of Open Firmware
 	  that is used on the OLPC XO-1 Children's Machine.
@@ -2076,7 +2091,7 @@ config OLPC_OPENFIRMWARE
 
 endif # X86_32
 
-config K8_NB
+config AMD_NB
 	def_bool y
 	depends on CPU_SUP_AMD && PCI
 
@@ -2125,6 +2140,10 @@ config HAVE_ATOMIC_IOMAP
 	def_bool y
 	depends on X86_32
 
+config HAVE_TEXT_POKE_SMP
+	bool
+	select STOP_MACHINE if SMP
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 75085080b63..e5bb96b10f1 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,10 @@ config EARLY_PRINTK
 	  with klogd/syslogd or the X server. You should normally N here,
 	  unless you want to debug such a crash.
 
+config EARLY_PRINTK_MRST
+	bool "Early printk for MRST platform support"
+	depends on EARLY_PRINTK && X86_MRST
+
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
 	depends on EARLY_PRINTK && PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e8c8881351b..b02e509072a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
 # is .cfi_signal_frame supported too?
 cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
 cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+
+# does binutils support specific instructions?
+asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bc6abb7bc7e..76561d20ea2 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/stringify.h>
+#include <linux/jump_label.h>
 #include <asm/asm.h>
 
 /*
@@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
 #define __parainstructions_end	NULL
 #endif
 
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+
 /*
  * Clear and restore the kernel write-protection flag on the local CPU.
  * Allows the kernel to edit read-only pages.
@@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
 
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+#define IDEAL_NOP_SIZE_5 5
+extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+extern void arch_init_ideal_nop5(void);
+#else
+static inline void arch_init_ideal_nop5(void) {}
+#endif
+
 #endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 5af2982133b..f16a2caca1e 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index cb030374b90..916bc8111a0 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify it
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 08616180dea..e3509fc303b 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
@@ -416,13 +416,22 @@ struct amd_iommu {
 	struct dma_ops_domain *default_dom;
 
 	/*
-	 * This array is required to work around a potential BIOS bug.
-	 * The BIOS may miss to restore parts of the PCI configuration
-	 * space when the system resumes from S3. The result is that the
-	 * IOMMU does not execute commands anymore which leads to system
-	 * failure.
+	 * We can't rely on the BIOS to restore all values on reinit, so we
+	 * need to stash them
 	 */
-	u32 cache_cfg[4];
+
+	/* The iommu BAR */
+	u32 stored_addr_lo;
+	u32 stored_addr_hi;
+
+	/*
+	 * Each iommu has 6 l1s, each of which is documented as having 0x12
+	 * registers
+	 */
+	u32 stored_l1[6][0x12];
+
+	/* The l2 indirect registers */
+	u32 stored_l2[0x83];
 };
 
 /*
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h
index af00bd1d208..c8517f81b21 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_K8_H
-#define _ASM_X86_K8_H
+#ifndef _ASM_X86_AMD_NB_H
+#define _ASM_X86_AMD_NB_H
 
 #include <linux/pci.h>
 
@@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[];
 struct bootnode;
 
 extern int early_is_k8_nb(u32 value);
-extern struct pci_dev **k8_northbridges;
-extern int num_k8_northbridges;
 extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
 extern int k8_get_nodes(struct bootnode *nodes);
 extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
 extern int k8_scan_nodes(void);
 
-#ifdef CONFIG_K8_NB
-extern int num_k8_northbridges;
+struct k8_northbridge_info {
+	u16 num;
+	u8 gart_supported;
+	struct pci_dev **nb_misc;
+};
+extern struct k8_northbridge_info k8_northbridges;
+
+#ifdef CONFIG_AMD_NB
 
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
-	return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+	return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
 }
 
 #else
-#define num_k8_northbridges 0
 
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
@@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node)
 #endif
 
 
-#endif /* _ASM_X86_K8_H */
+#endif /* _ASM_X86_AMD_NB_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index a69b1ac9eaf..2fefa501d3b 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event;
 extern unsigned long apbt_quick_calibrate(void);
 extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
 extern void apbt_setup_secondary_clock(void);
-extern unsigned int boot_cpu_id;
 
 extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
 extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091bf19..4fab24de26b 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int);
 
 DECLARE_PER_CPU(int, cpu_state);
 
-extern unsigned int boot_cpu_id;
 
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3f76523589a..220e2ea08e8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -152,10 +152,14 @@
 #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
 #define X86_FEATURE_OSVW	(6*32+ 9) /* OS Visible Workaround */
 #define X86_FEATURE_IBS		(6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_SSE5	(6*32+11) /* SSE-5 */
+#define X86_FEATURE_XOP		(6*32+11) /* extended AVX instructions */
 #define X86_FEATURE_SKINIT	(6*32+12) /* SKINIT/STGI instructions */
 #define X86_FEATURE_WDT		(6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP		(6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4	(6*32+16) /* 4 operands MAC instructions */
 #define X86_FEATURE_NODEID_MSR	(6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM		(6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT	(6*32+22) /* topology extensions CPUID leafs */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -180,6 +184,13 @@
 #define X86_FEATURE_LBRV	(8*32+ 6) /* AMD LBR Virtualization support */
 #define X86_FEATURE_SVML	(8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
 #define X86_FEATURE_NRIPS	(8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR  (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN   (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 733f7e91e7a..32609919931 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
 	CFI_ADJUST_CFA_OFFSET -8
 	.endm
 
+	.macro pushfq_cfi
+	pushfq
+	CFI_ADJUST_CFA_OFFSET 8
+	.endm
+
+	.macro popfq_cfi
+	popfq
+	CFI_ADJUST_CFA_OFFSET -8
+	.endm
+
 	.macro movq_cfi reg offset=0
 	movq %\reg, \offset(%rsp)
 	CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
 	CFI_ADJUST_CFA_OFFSET -4
 	.endm
 
+	.macro pushfl_cfi
+	pushfl
+	CFI_ADJUST_CFA_OFFSET 4
+	.endm
+
+	.macro popfl_cfi
+	popfl
+	CFI_ADJUST_CFA_OFFSET -4
+	.endm
+
 	.macro movl_cfi reg offset=0
 	movl %\reg, \offset(%esp)
 	CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 8e8ec663a98..b8e96a18676 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
 #endif
 
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d07b44f7d1d..4d293dced62 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
 	BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
 	return __virt_to_fix(vaddr);
 }
+
+/* Return an pointer with offset calculated */
+static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
+				phys_addr_t phys, pgprot_t flags)
+{
+	__set_fixmap(idx, phys, flags);
+	return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
+}
+
+#define set_fixmap_offset(idx, phys)			\
+	__set_fixmap_offset(idx, phys, PAGE_KERNEL)
+
+#define set_fixmap_offset_nocache(idx, phys)			\
+	__set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f33fc..bf357f9b25f 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
 #define GARTEN		(1<<0)
 #define DISGARTCPU	(1<<4)
 #define DISGARTIO	(1<<5)
+#define DISTLBWALKPRB	(1<<6)
 
 /* GART cache control register bits. */
 #define INVGART		(1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
 #define AMD64_GARTAPERTUREBASE	0x94
 #define AMD64_GARTTABLEBASE	0x98
 #define AMD64_GARTCACHECTL	0x9c
-#define AMD64_GARTEN		(1<<0)
 
 #ifdef CONFIG_GART_IOMMU
 extern int gart_iommu_aperture;
@@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void)
 
 extern int agp_amd64_init(void);
 
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+	u32 ctl;
+
+	/*
+	 * Don't enable translation but enable GART IO and CPU accesses.
+	 * Also, set DISTLBWALKPRB since GART tables memory is UC.
+	 */
+	ctl = DISTLBWALKPRB | order << 1;
+
+	pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
 static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
 {
 	u32 tmp, ctl;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index aeab29aee61..55e4de613f0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
-	unsigned int apic_pending_irqs;
+	unsigned int apic_irq_work_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 46c0fe05f23..3a54a1ca1a0 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
 extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);
 
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a73a8d5a5e6..4aa2bb3b242 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
 extern int restore_i387_xstate_ia32(void __user *buf);
 #endif
 
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
+#else
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
+#endif
+
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
 
 static __always_inline __pure bool use_xsaveopt(void)
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void)
 	return static_cpu_has(X86_FEATURE_XSAVE);
 }
 
+static __always_inline __pure bool use_fxsr(void)
+{
+        return static_cpu_has(X86_FEATURE_FXSR);
+}
+
 extern void __sanitize_i387_state(struct task_struct *);
 
 static inline void sanitize_i387_state(struct task_struct *tsk)
@@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_X86_64
-
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
-{
-	asm volatile("1: fwait\n"
-		     "2:\n"
-		     _ASM_EXTABLE(1b, 2b));
-}
-
 static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 {
 	int err;
 
+	/* See comment in fxsave() below. */
 	asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
 		     "2:\n"
 		     ".section .fixup,\"ax\"\n"
@@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 		     ".previous\n"
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err)
-#if 0 /* See comment in fxsave() below. */
-		     : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
-		     : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
+		     : [fx] "R" (fx), "m" (*fx), "0" (0));
 	return err;
 }
 
-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
-   is pending. Clear the x87 state here by setting it to fixed
-   values. The kernel data segment can be sometimes 0 and sometimes
-   new user value. Both should be ok.
-   Use the PDA as safe address because it should be already in L1. */
-static inline void fpu_clear(struct fpu *fpu)
-{
-	struct xsave_struct *xstate = &fpu->state->xsave;
-	struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
-	/*
-	 * xsave header may indicate the init state of the FP.
-	 */
-	if (use_xsave() &&
-	    !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
-		return;
-
-	if (unlikely(fx->swd & X87_FSW_ES))
-		asm volatile("fnclex");
-	alternative_input(ASM_NOP8 ASM_NOP2,
-			  "    emms\n"		/* clear stack tags */
-			  "    fildl %%gs:0",	/* load to clear state */
-			  X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline void clear_fpu_state(struct task_struct *tsk)
-{
-	fpu_clear(&tsk->thread.fpu);
-}
-
 static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 {
 	int err;
@@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 	if (unlikely(err))
 		return -EFAULT;
 
+	/* See comment in fxsave() below. */
 	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
 		     "2:\n"
 		     ".section .fixup,\"ax\"\n"
@@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 		     ".previous\n"
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in fxsave() below. */
-		     : [fx] "r" (fx), "0" (0));
-#else
-		     : [fx] "cdaSDb" (fx), "0" (0));
-#endif
+		     : [fx] "R" (fx), "0" (0));
 	if (unlikely(err) &&
 	    __clear_user(fx, sizeof(struct i387_fxsave_struct)))
 		err = -EFAULT;
@@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
 	   uses any extended registers for addressing, a second REX prefix
 	   will be generated (to the assembler, rex64 followed by semicolon
 	   is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
+
+#ifdef CONFIG_AS_FXSAVEQ
 	/* Using "fxsaveq %0" would be the ideal choice, but is only supported
 	   starting with gas 2.16. */
 	__asm__ __volatile__("fxsaveq %0"
 			     : "=m" (fpu->state->fxsave));
-#elif 0
+#else
 	/* Using, as a workaround, the properly prefixed form below isn't
 	   accepted by any binutils version so far released, complaining that
 	   the same type of prefix is used twice if an extended register is
-	   needed for addressing (fix submitted to mainline 2005-11-21). */
-	__asm__ __volatile__("rex64/fxsave %0"
-			     : "=m" (fpu->state->fxsave));
-#else
-	/* This, however, we can work around by forcing the compiler to select
+	   needed for addressing (fix submitted to mainline 2005-11-21).
+	asm volatile("rex64/fxsave %0"
+		     : "=m" (fpu->state->fxsave));
+	   This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
-	__asm__ __volatile__("rex64/fxsave (%1)"
-			     : "=m" (fpu->state->fxsave)
-			     : "cdaSDb" (&fpu->state->fxsave));
+	asm volatile("rex64/fxsave (%[fx])"
+		     : "=m" (fpu->state->fxsave)
+		     : [fx] "R" (&fpu->state->fxsave));
 #endif
 }
 
-static inline void fpu_save_init(struct fpu *fpu)
-{
-	if (use_xsave())
-		fpu_xsave(fpu);
-	else
-		fpu_fxsave(fpu);
-
-	fpu_clear(fpu);
-}
-
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
-	fpu_save_init(&tsk->thread.fpu);
-	task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
 #else  /* CONFIG_X86_32 */
 
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-static inline void tolerant_fwait(void)
-{
-	asm volatile("fnclex ; fwait");
-}
-
 /* perform fxrstor iff the processor has extended states, otherwise frstor */
 static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 {
@@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 	return 0;
 }
 
+static inline void fpu_fxsave(struct fpu *fpu)
+{
+	asm volatile("fxsave %[fx]"
+		     : [fx] "=m" (fpu->state->fxsave));
+}
+
+#endif	/* CONFIG_X86_64 */
+
 /* We need a safe address that is cheap to find and that is already
    in L1 during context switch. The best choices are unfortunately
    different for UP and SMP */
@@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 static inline void fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave()) {
-		struct xsave_struct *xstate = &fpu->state->xsave;
-		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
 		fpu_xsave(fpu);
 
 		/*
 		 * xsave header may indicate the init state of the FP.
 		 */
-		if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
-			goto end;
-
-		if (unlikely(fx->swd & X87_FSW_ES))
-			asm volatile("fnclex");
-
-		/*
-		 * we can do a simple return here or be paranoid :)
-		 */
-		goto clear_state;
+		if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
+			return;
+	} else if (use_fxsr()) {
+		fpu_fxsave(fpu);
+	} else {
+		asm volatile("fsave %[fx]; fwait"
+			     : [fx] "=m" (fpu->state->fsave));
+		return;
 	}
 
-	/* Use more nops than strictly needed in case the compiler
-	   varies code */
-	alternative_input(
-		"fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
-		"fxsave %[fx]\n"
-		"bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
-		X86_FEATURE_FXSR,
-		[fx] "m" (fpu->state->fxsave),
-		[fsw] "m" (fpu->state->fxsave.swd) : "memory");
-clear_state:
+	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
+		asm volatile("fnclex");
+
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
 	   is pending.  Clear the x87 state here by setting it to fixed
 	   values. safe_address is a random variable that should be in L1 */
 	alternative_input(
-		GENERIC_NOP8 GENERIC_NOP2,
+		ASM_NOP8 ASM_NOP2,
 		"emms\n\t"	  	/* clear stack tags */
-		"fildl %[addr]", 	/* set F?P to defined value */
+		"fildl %P[addr]",	/* set F?P to defined value */
 		X86_FEATURE_FXSAVE_LEAK,
 		[addr] "m" (safe_address));
-end:
-	;
 }
 
 static inline void __save_init_fpu(struct task_struct *tsk)
@@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
-
-#endif	/* CONFIG_X86_64 */
-
 static inline int fpu_fxrstor_checking(struct fpu *fpu)
 {
 	return fxrstor_checking(&fpu->state->fxsave);
@@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
 static inline void __clear_fpu(struct task_struct *tsk)
 {
 	if (task_thread_info(tsk)->status & TS_USEDFPU) {
-		tolerant_fwait();
+		/* Ignore delayed exceptions from user space */
+		asm volatile("1: fwait\n"
+			     "2:\n"
+			     _ASM_EXTABLE(1b, 2b));
 		task_thread_info(tsk)->status &= ~TS_USEDFPU;
 		stts();
 	}
@@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state)
 		stts();
 }
 
-#ifdef CONFIG_X86_64
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
-	__save_init_fpu(tsk);
-	stts();
-}
-
-#define unlazy_fpu	__unlazy_fpu
-#define clear_fpu	__clear_fpu
-
-#else  /* CONFIG_X86_32 */
-
 /*
  * These disable preemption on their own and are safe
  */
@@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk)
 	preempt_enable();
 }
 
-#endif	/* CONFIG_X86_64 */
-
 /*
  * i387 state interaction
  */
@@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu);
 
 #endif /* __ASSEMBLY__ */
 
-#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
-#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
-
 #endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 30a3e977612..6a45ec41ec2 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 
 extern void iounmap(volatile void __iomem *addr);
 
+extern void set_iounmap_nonlazy(void);
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f275e224450..8d841505344 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,4 +3,31 @@
 
 #define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
 
+#ifdef CONFIG_INTR_REMAP
+static inline void prepare_irte(struct irte *irte, int vector,
+			        unsigned int dest)
+{
+	memset(irte, 0, sizeof(*irte));
+
+	irte->present = 1;
+	irte->dst_mode = apic->irq_dest_mode;
+	/*
+	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+	 * actual level or edge trigger will be setup in the IO-APIC
+	 * RTE. This will help simplify level triggered irq migration.
+	 * For more details, see the comments (in io_apic.c) explainig IO-APIC
+	 * irq migration in the presence of interrupt-remapping.
+	*/
+	irte->trigger_mode = 0;
+	irte->dlvry_mode = apic->irq_delivery_mode;
+	irte->vector = vector;
+	irte->dest_id = IRTE_DEST(dest);
+	irte->redir_hint = 1;
+}
+#else
+static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+{
+}
+#endif
+
 #endif	/* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index e2ca3009255..6af0894dafb 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -114,9 +114,9 @@
 #define X86_PLATFORM_IPI_VECTOR		0xed
 
 /*
- * Performance monitoring pending work vector:
+ * IRQ work vector:
  */
-#define LOCAL_PENDING_VECTOR		0xec
+#define IRQ_WORK_VECTOR			0xec
 
 #define UV_BAU_MESSAGE			0xea
 
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
new file mode 100644
index 00000000000..f52d42e8058
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_JUMP_LABEL_H
+#define _ASM_X86_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm/nops.h>
+
+#define JUMP_LABEL_NOP_SIZE 5
+
+# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+
+# define JUMP_LABEL(key, label)					\
+	do {							\
+		asm goto("1:"					\
+			JUMP_LABEL_INITIAL_NOP			\
+			".pushsection __jump_table,  \"a\" \n\t"\
+			_ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
+			".popsection \n\t"			\
+			: :  "i" (key) :  : label);		\
+	} while (0)
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+typedef u64 jump_label_t;
+#else
+typedef u32 jump_label_t;
+#endif
+
+struct jump_entry {
+	jump_label_t code;
+	jump_label_t target;
+	jump_label_t key;
+};
+
+#endif
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 16350740edf..4a711a684b1 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,6 +10,9 @@
  */
 #ifndef _ASM_X86_MRST_H
 #define _ASM_X86_MRST_H
+
+#include <linux/sfi.h>
+
 extern int pci_mrst_init(void);
 int __init sfi_parse_mrtc(struct sfi_table_header *table);
 
@@ -26,7 +29,7 @@ enum mrst_cpu_type {
 };
 
 extern enum mrst_cpu_type __mrst_cpu_chip;
-static enum mrst_cpu_type mrst_identify_cpu(void)
+static inline enum mrst_cpu_type mrst_identify_cpu(void)
 {
 	return __mrst_cpu_chip;
 }
@@ -42,4 +45,9 @@ extern enum mrst_timer_options mrst_timer_options;
 #define SFI_MTMR_MAX_NUM 8
 #define SFI_MRTC_MAX	8
 
+extern struct console early_mrst_console;
+extern void mrst_early_console_init(void);
+
+extern struct console early_hsu_console;
+extern void hsu_early_console_init(void);
 #endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 00000000000..bcdff997668
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_MWAIT_H
+#define _ASM_X86_MWAIT_H
+
+#define MWAIT_SUBSTATE_MASK		0xf
+#define MWAIT_CSTATE_MASK		0xf
+#define MWAIT_SUBSTATE_SIZE		4
+#define MWAIT_MAX_NUM_CSTATES		8
+
+#define CPUID_MWAIT_LEAF		5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK	0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK	0x1
+
+#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 08fde475cb3..2a8478140bb 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void);
 /* install OFW's pde permanently into the kernel's pgtable */
 extern void setup_olpc_ofw_pgd(void);
 
+/* check if OFW was detected during boot */
+extern bool olpc_ofw_present(void);
+
 #else /* !CONFIG_OLPC_OPENFIRMWARE */
 
 static inline void olpc_ofw_detect(void) { }
 static inline void setup_olpc_ofw_pgd(void) { }
+static inline bool olpc_ofw_present(void) { return false; }
 
 #endif /* !CONFIG_OLPC_OPENFIRMWARE */
 
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c725..1df66211fd1 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
 #define PAGE_SIZE	(_AC(1,UL) << PAGE_SHIFT)
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 
-#define __PHYSICAL_MASK		((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK		((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
 #define __VIRTUAL_MASK		((1UL << __VIRTUAL_MASK_SHIFT) - 1)
 
 /* Cast PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e..edecb4ed221 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
 	PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
 }
 
-static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
-					    unsigned long start, unsigned long count)
-{
-	PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
-}
 static inline void paravirt_release_pmd(unsigned long pfn)
 {
 	PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef553234..b82bac97525 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
 	 */
 	void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
 	void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
-	void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
 	void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
 	void (*release_pte)(unsigned long pfn);
 	void (*release_pmd)(unsigned long pfn);
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index def500776b1..a70cd216be5 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -36,19 +36,6 @@
 #define P4_ESCR_EMASK(v)	((v) << P4_ESCR_EVENTMASK_SHIFT)
 #define P4_ESCR_TAG(v)		((v) << P4_ESCR_TAG_SHIFT)
 
-/* Non HT mask */
-#define P4_ESCR_MASK			\
-	(P4_ESCR_EVENT_MASK	|	\
-	P4_ESCR_EVENTMASK_MASK	|	\
-	P4_ESCR_TAG_MASK	|	\
-	P4_ESCR_TAG_ENABLE	|	\
-	P4_ESCR_T0_OS		|	\
-	P4_ESCR_T0_USR)
-
-/* HT mask */
-#define P4_ESCR_MASK_HT			\
-	(P4_ESCR_MASK |	P4_ESCR_T1_OS | P4_ESCR_T1_USR)
-
 #define P4_CCCR_OVF			0x80000000U
 #define P4_CCCR_CASCADE			0x40000000U
 #define P4_CCCR_OVF_PMI_T0		0x04000000U
@@ -70,23 +57,6 @@
 #define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
 #define P4_CCCR_ESEL(v)			((v) << P4_CCCR_ESCR_SELECT_SHIFT)
 
-/* Non HT mask */
-#define P4_CCCR_MASK				\
-	(P4_CCCR_OVF			|	\
-	P4_CCCR_CASCADE			|	\
-	P4_CCCR_OVF_PMI_T0		|	\
-	P4_CCCR_FORCE_OVF		|	\
-	P4_CCCR_EDGE			|	\
-	P4_CCCR_THRESHOLD_MASK		|	\
-	P4_CCCR_COMPLEMENT		|	\
-	P4_CCCR_COMPARE			|	\
-	P4_CCCR_ESCR_SELECT_MASK	|	\
-	P4_CCCR_ENABLE)
-
-/* HT mask */
-#define P4_CCCR_MASK_HT				\
-	(P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
-
 #define P4_GEN_ESCR_EMASK(class, name, bit)	\
 	class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
 #define P4_ESCR_EMASK_BIT(class, name)		class##__##name
@@ -127,6 +97,28 @@
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
+/*
+ * The bits we allow to pass for RAW events
+ */
+#define P4_CONFIG_MASK_ESCR		\
+	P4_ESCR_EVENT_MASK	|	\
+	P4_ESCR_EVENTMASK_MASK	|	\
+	P4_ESCR_TAG_MASK	|	\
+	P4_ESCR_TAG_ENABLE
+
+#define P4_CONFIG_MASK_CCCR		\
+	P4_CCCR_EDGE		|	\
+	P4_CCCR_THRESHOLD_MASK	|	\
+	P4_CCCR_COMPLEMENT	|	\
+	P4_CCCR_COMPARE		|	\
+	P4_CCCR_THREAD_ANY	|	\
+	P4_CCCR_RESERVED
+
+/* some dangerous bits are reserved for kernel internals */
+#define P4_CONFIG_MASK				  	  \
+	(p4_config_pack_escr(P4_CONFIG_MASK_ESCR))	| \
+	(p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+
 static inline bool p4_is_event_cascaded(u64 config)
 {
 	u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a6..ada823a13c7 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 	pte_update(mm, addr, ptep);
 }
 
+#define flush_tlb_fix_spurious_fault(vma, address)
+
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 076052cd62b..f96ac9bedf7 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
 	native_set_pgd(pgd, native_make_pgd(0));
 }
 
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
 /*
  * Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bdbeba..cae9c3cb95c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,8 @@ struct cpuinfo_x86 {
 	u16			phys_proc_id;
 	/* Core id: */
 	u16			cpu_core_id;
+	/* Compute unit id */
+	u8			compute_unit_id;
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 #endif
@@ -602,7 +604,7 @@ extern unsigned long		mmu_cr4_features;
 
 static inline void set_in_cr4(unsigned long mask)
 {
-	unsigned cr4;
+	unsigned long cr4;
 
 	mmu_cr4_features |= mask;
 	cr4 = read_cr4();
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask)
 
 static inline void clear_in_cr4(unsigned long mask)
 {
-	unsigned cr4;
+	unsigned long cr4;
 
 	mmu_cr4_features &= ~mask;
 	cr4 = read_cr4();
@@ -764,29 +766,6 @@ extern unsigned long		idle_halt;
 extern unsigned long		idle_nomwait;
 extern bool			c1e_detected;
 
-/*
- * on systems with caches, caches must be flashed as the absolute
- * last instruction before going into a suspended halt.  Otherwise,
- * dirty data can linger in the cache and become stale on resume,
- * leading to strange errors.
- *
- * perform a variety of operations to guarantee that the compiler
- * will not reorder instructions.  wbinvd itself is serializing
- * so the processor will not reorder.
- *
- * Systems without cache can just go into halt.
- */
-static inline void wbinvd_halt(void)
-{
-	mb();
-	/* check for clflush to determine if wbinvd is legal */
-	if (cpu_has_clflush)
-		asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
-	else
-		while (1)
-			halt();
-}
-
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ef292c792d7..d6763b139a8 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
 			: : "i" (sz));					\
 	}
 
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries)		\
+	type *name;					\
+	RESERVE_BRK(name, sizeof(type) * entries)
+
 #ifdef __i386__
 
 void __init i386_start_kernel(void);
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a290..00000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * VMI interface definition
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Maintained by: Zachary Amsden zach@vmware.com
- *
- */
-#include <linux/types.h>
-
-/*
- *---------------------------------------------------------------------
- *
- *  VMI Option ROM API
- *
- *---------------------------------------------------------------------
- */
-#define VMI_SIGNATURE 0x696d5663   /* "cVmi" */
-
-#define PCI_VENDOR_ID_VMWARE            0x15AD
-#define PCI_DEVICE_ID_VMWARE_VMI        0x0801
-
-/*
- * We use two version numbers for compatibility, with the major
- * number signifying interface breakages, and the minor number
- * interface extensions.
- */
-#define VMI_API_REV_MAJOR       3
-#define VMI_API_REV_MINOR       0
-
-#define VMI_CALL_CPUID			0
-#define VMI_CALL_WRMSR			1
-#define VMI_CALL_RDMSR			2
-#define VMI_CALL_SetGDT			3
-#define VMI_CALL_SetLDT			4
-#define VMI_CALL_SetIDT			5
-#define VMI_CALL_SetTR			6
-#define VMI_CALL_GetGDT			7
-#define VMI_CALL_GetLDT			8
-#define VMI_CALL_GetIDT			9
-#define VMI_CALL_GetTR			10
-#define VMI_CALL_WriteGDTEntry		11
-#define VMI_CALL_WriteLDTEntry		12
-#define VMI_CALL_WriteIDTEntry		13
-#define VMI_CALL_UpdateKernelStack	14
-#define VMI_CALL_SetCR0			15
-#define VMI_CALL_SetCR2			16
-#define VMI_CALL_SetCR3			17
-#define VMI_CALL_SetCR4			18
-#define VMI_CALL_GetCR0			19
-#define VMI_CALL_GetCR2			20
-#define VMI_CALL_GetCR3			21
-#define VMI_CALL_GetCR4			22
-#define VMI_CALL_WBINVD			23
-#define VMI_CALL_SetDR			24
-#define VMI_CALL_GetDR			25
-#define VMI_CALL_RDPMC			26
-#define VMI_CALL_RDTSC			27
-#define VMI_CALL_CLTS			28
-#define VMI_CALL_EnableInterrupts	29
-#define VMI_CALL_DisableInterrupts	30
-#define VMI_CALL_GetInterruptMask	31
-#define VMI_CALL_SetInterruptMask	32
-#define VMI_CALL_IRET			33
-#define VMI_CALL_SYSEXIT		34
-#define VMI_CALL_Halt			35
-#define VMI_CALL_Reboot			36
-#define VMI_CALL_Shutdown		37
-#define VMI_CALL_SetPxE			38
-#define VMI_CALL_SetPxELong		39
-#define VMI_CALL_UpdatePxE		40
-#define VMI_CALL_UpdatePxELong		41
-#define VMI_CALL_MachineToPhysical	42
-#define VMI_CALL_PhysicalToMachine	43
-#define VMI_CALL_AllocatePage		44
-#define VMI_CALL_ReleasePage		45
-#define VMI_CALL_InvalPage		46
-#define VMI_CALL_FlushTLB		47
-#define VMI_CALL_SetLinearMapping	48
-
-#define VMI_CALL_SetIOPLMask		61
-#define VMI_CALL_SetInitialAPState	62
-#define VMI_CALL_APICWrite		63
-#define VMI_CALL_APICRead		64
-#define VMI_CALL_IODelay		65
-#define VMI_CALL_SetLazyMode		73
-
-/*
- *---------------------------------------------------------------------
- *
- * MMU operation flags
- *
- *---------------------------------------------------------------------
- */
-
-/* Flags used by VMI_{Allocate|Release}Page call */
-#define VMI_PAGE_PAE             0x10  /* Allocate PAE shadow */
-#define VMI_PAGE_CLONE           0x20  /* Clone from another shadow */
-#define VMI_PAGE_ZEROED          0x40  /* Page is pre-zeroed */
-
-
-/* Flags shared by Allocate|Release Page and PTE updates */
-#define VMI_PAGE_PT              0x01
-#define VMI_PAGE_PD              0x02
-#define VMI_PAGE_PDP             0x04
-#define VMI_PAGE_PML4            0x08
-
-#define VMI_PAGE_NORMAL          0x00 /* for debugging */
-
-/* Flags used by PTE updates */
-#define VMI_PAGE_CURRENT_AS      0x10 /* implies VMI_PAGE_VA_MASK is valid */
-#define VMI_PAGE_DEFER           0x20 /* may queue update until TLB inval */
-#define VMI_PAGE_VA_MASK         0xfffff000
-
-#ifdef CONFIG_X86_PAE
-#define VMI_PAGE_L1		(VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2		(VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#else
-#define VMI_PAGE_L1		(VMI_PAGE_PT | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2		(VMI_PAGE_PD | VMI_PAGE_ZEROED)
-#endif
-
-/* Flags used by VMI_FlushTLB call */
-#define VMI_FLUSH_TLB            0x01
-#define VMI_FLUSH_GLOBAL         0x02
-
-/*
- *---------------------------------------------------------------------
- *
- *  VMI relocation definitions for ROM call get_reloc
- *
- *---------------------------------------------------------------------
- */
-
-/* VMI Relocation types */
-#define VMI_RELOCATION_NONE     0
-#define VMI_RELOCATION_CALL_REL 1
-#define VMI_RELOCATION_JUMP_REL 2
-#define VMI_RELOCATION_NOP	3
-
-#ifndef __ASSEMBLY__
-struct vmi_relocation_info {
-	unsigned char           *eip;
-	unsigned char           type;
-	unsigned char           reserved[3];
-};
-#endif
-
-
-/*
- *---------------------------------------------------------------------
- *
- *  Generic ROM structures and definitions
- *
- *---------------------------------------------------------------------
- */
-
-#ifndef __ASSEMBLY__
-
-struct vrom_header {
-	u16     rom_signature;  /* option ROM signature */
-	u8      rom_length;     /* ROM length in 512 byte chunks */
-	u8      rom_entry[4];   /* 16-bit code entry point */
-	u8      rom_pad0;       /* 4-byte align pad */
-	u32     vrom_signature; /* VROM identification signature */
-	u8      api_version_min;/* Minor version of API */
-	u8      api_version_maj;/* Major version of API */
-	u8      jump_slots;     /* Number of jump slots */
-	u8      reserved1;      /* Reserved for expansion */
-	u32     virtual_top;    /* Hypervisor virtual address start */
-	u16     reserved2;      /* Reserved for expansion */
-	u16	license_offs;	/* Offset to License string */
-	u16     pci_header_offs;/* Offset to PCI OPROM header */
-	u16     pnp_header_offs;/* Offset to PnP OPROM header */
-	u32     rom_pad3;       /* PnP reserverd / VMI reserved */
-	u8      reserved[96];   /* Reserved for headers */
-	char    vmi_init[8];    /* VMI_Init jump point */
-	char    get_reloc[8];   /* VMI_GetRelocationInfo jump point */
-} __attribute__((packed));
-
-struct pnp_header {
-	char sig[4];
-	char rev;
-	char size;
-	short next;
-	short res;
-	long devID;
-	unsigned short manufacturer_offset;
-	unsigned short product_offset;
-} __attribute__((packed));
-
-struct pci_header {
-	char sig[4];
-	short vendorID;
-	short deviceID;
-	short vpdData;
-	short size;
-	char rev;
-	char class;
-	char subclass;
-	char interface;
-	short chunks;
-	char rom_version_min;
-	char rom_version_maj;
-	char codetype;
-	char lastRom;
-	short reserved;
-} __attribute__((packed));
-
-/* Function prototypes for bootstrapping */
-#ifdef CONFIG_VMI
-extern void vmi_init(void);
-extern void vmi_activate(void);
-extern void vmi_bringup(void);
-#else
-static inline void vmi_init(void) {}
-static inline void vmi_activate(void) {}
-static inline void vmi_bringup(void) {}
-#endif
-
-/* State needed to start an application processor in an SMP system. */
-struct vmi_ap_state {
-	u32 cr0;
-	u32 cr2;
-	u32 cr3;
-	u32 cr4;
-
-	u64 efer;
-
-	u32 eip;
-	u32 eflags;
-	u32 eax;
-	u32 ebx;
-	u32 ecx;
-	u32 edx;
-	u32 esp;
-	u32 ebp;
-	u32 esi;
-	u32 edi;
-	u16 cs;
-	u16 ss;
-	u16 ds;
-	u16 es;
-	u16 fs;
-	u16 gs;
-	u16 ldtr;
-
-	u16 gdtr_limit;
-	u32 gdtr_base;
-	u32 idtr_base;
-	u16 idtr_limit;
-};
-
-#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3..00000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * VMI Time wrappers
- *
- * Copyright (C) 2006, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-#ifndef _ASM_X86_VMI_TIME_H
-#define _ASM_X86_VMI_TIME_H
-
-/*
- * Raw VMI call indices for timer functions
- */
-#define VMI_CALL_GetCycleFrequency	66
-#define VMI_CALL_GetCycleCounter	67
-#define VMI_CALL_SetAlarm		68
-#define VMI_CALL_CancelAlarm		69
-#define VMI_CALL_GetWallclockTime	70
-#define VMI_CALL_WallclockUpdated	71
-
-/* Cached VMI timer operations */
-extern struct vmi_timer_ops {
-	u64 (*get_cycle_frequency)(void);
-	u64 (*get_cycle_counter)(int);
-	u64 (*get_wallclock)(void);
-	int (*wallclock_updated)(void);
-	void (*set_alarm)(u32 flags, u64 expiry, u64 period);
-	void (*cancel_alarm)(u32 flags);
-} vmi_timer_ops;
-
-/* Prototypes */
-extern void __init vmi_time_init(void);
-extern unsigned long vmi_get_wallclock(void);
-extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
-extern unsigned long vmi_tsc_khz(void);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern void __devinit vmi_time_bsp_init(void);
-extern void __devinit vmi_time_ap_init(void);
-#endif
-
-/*
- * When run under a hypervisor, a vcpu is always in one of three states:
- * running, halted, or ready.  The vcpu is in the 'running' state if it
- * is executing.  When the vcpu executes the halt interface, the vcpu
- * enters the 'halted' state and remains halted until there is some work
- * pending for the vcpu (e.g. an alarm expires, host I/O completes on
- * behalf of virtual I/O).  At this point, the vcpu enters the 'ready'
- * state (waiting for the hypervisor to reschedule it).  Finally, at any
- * time when the vcpu is not in the 'running' state nor the 'halted'
- * state, it is in the 'ready' state.
- *
- * Real time is advances while the vcpu is 'running', 'ready', or
- * 'halted'.  Stolen time is the time in which the vcpu is in the
- * 'ready' state.  Available time is the remaining time -- the vcpu is
- * either 'running' or 'halted'.
- *
- * All three views of time are accessible through the VMI cycle
- * counters.
- */
-
-/* The cycle counters. */
-#define VMI_CYCLES_REAL         0
-#define VMI_CYCLES_AVAILABLE    1
-#define VMI_CYCLES_STOLEN       2
-
-/* The alarm interface 'flags' bits */
-#define VMI_ALARM_COUNTERS      2
-
-#define VMI_ALARM_COUNTER_MASK  0x000000ff
-
-#define VMI_ALARM_WIRED_IRQ0    0x00000000
-#define VMI_ALARM_WIRED_LVTT    0x00010000
-
-#define VMI_ALARM_IS_ONESHOT    0x00000000
-#define VMI_ALARM_IS_PERIODIC   0x00000100
-
-#define CONFIG_VMI_ALARM_HZ	100
-
-#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index fedf32a8c3e..80a93dc9907 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -34,7 +34,8 @@ GCOV_PROFILE_paravirt.o		:= n
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
-obj-y			+= setup.o x86_init.o i8259.o irqinit.o
+obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
@@ -85,15 +86,15 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
 
-obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_AMD_NB)		+= amd_nb.o
 obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 
-obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
@@ -106,6 +107,7 @@ obj-$(CONFIG_SCx200)		+= scx200.o
 scx200-y			+= scx200_32.o
 
 obj-$(CONFIG_OLPC)		+= olpc.o
+obj-$(CONFIG_OLPC_XO1)		+= olpc-xo1.o
 obj-$(CONFIG_OLPC_OPENFIRMWARE)	+= olpc_ofw.o
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 
@@ -122,7 +124,6 @@ obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
 	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
-	obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
 	obj-$(CONFIG_AUDIT)		+= audit_64.o
 
 	obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59b..5812404a0d4 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
 
 #include <acpi/processor.h>
 #include <asm/acpi.h>
+#include <asm/mwait.h>
 
 /*
  * Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry;	/* per CPU ptr */
 
 static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
 
-#define MWAIT_SUBSTATE_MASK	(0xf)
-#define MWAIT_CSTATE_MASK	(0xf)
-#define MWAIT_SUBSTATE_SIZE	(4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK	(0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK	(0x1)
-
 #define NATIVE_CSTATE_BEYOND_HALT	(2)
 
 static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f65ab8b014c..a36bb90aef5 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
-static void *text_poke_early(void *addr, const void *opcode, size_t len);
+void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
    This runs before SMP is initialized to avoid SMP problems with
@@ -522,7 +522,7 @@ void __init alternative_instructions(void)
  * instructions. And on the local CPU you need to be protected again NMI or MCE
  * handlers seeing an inconsistent instruction while you patch.
  */
-static void *__init_or_module text_poke_early(void *addr, const void *opcode,
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
 					      size_t len)
 {
 	unsigned long flags;
@@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
 	tpp.len = len;
 	atomic_set(&stop_machine_first, 1);
 	wrote_text = 0;
-	stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+	/* Use __stop_machine() because the caller already got online_cpus. */
+	__stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
 	return addr;
 }
 
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+
+unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+
+void __init arch_init_ideal_nop5(void)
+{
+	extern const unsigned char ftrace_test_p6nop[];
+	extern const unsigned char ftrace_test_nop5[];
+	extern const unsigned char ftrace_test_jmp[];
+	int faulted = 0;
+
+	/*
+	 * There is no good nop for all x86 archs.
+	 * We will default to using the P6_NOP5, but first we
+	 * will test to make sure that the nop will actually
+	 * work on this CPU. If it faults, we will then
+	 * go to a lesser efficient 5 byte nop. If that fails
+	 * we then just use a jmp as our nop. This isn't the most
+	 * efficient nop, but we can not use a multi part nop
+	 * since we would then risk being preempted in the middle
+	 * of that nop, and if we enabled tracing then, it might
+	 * cause a system crash.
+	 *
+	 * TODO: check the cpuid to determine the best nop.
+	 */
+	asm volatile (
+		"ftrace_test_jmp:"
+		"jmp ftrace_test_p6nop\n"
+		"nop\n"
+		"nop\n"
+		"nop\n"  /* 2 byte jmp + 3 bytes */
+		"ftrace_test_p6nop:"
+		P6_NOP5
+		"jmp 1f\n"
+		"ftrace_test_nop5:"
+		".byte 0x66,0x66,0x66,0x66,0x90\n"
+		"1:"
+		".section .fixup, \"ax\"\n"
+		"2:	movl $1, %0\n"
+		"	jmp ftrace_test_nop5\n"
+		"3:	movl $2, %0\n"
+		"	jmp 1b\n"
+		".previous\n"
+		_ASM_EXTABLE(ftrace_test_p6nop, 2b)
+		_ASM_EXTABLE(ftrace_test_nop5, 3b)
+		: "=r"(faulted) : "0" (faulted));
+
+	switch (faulted) {
+	case 0:
+		pr_info("converting mcount calls to 0f 1f 44 00 00\n");
+		memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5);
+		break;
+	case 1:
+		pr_info("converting mcount calls to 66 66 66 66 90\n");
+		memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5);
+		break;
+	case 2:
+		pr_info("converting mcount calls to jmp . + 5\n");
+		memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5);
+		break;
+	}
+
+}
+#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382..d2fdb0826df 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed..3cb482e123d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
 	return 1UL << shift;
 }
 
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+	u32 val;
+
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+	pci_read_config_dword(iommu->dev, 0xfc, &val);
+	return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+	pci_write_config_dword(iommu->dev, 0xfc, val);
+	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+	u32 val;
+
+	pci_write_config_dword(iommu->dev, 0xf0, address);
+	pci_read_config_dword(iommu->dev, 0xf4, &val);
+	return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+	pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+	pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
 /****************************************************************************
  *
  * AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 {
 	int cap_ptr = iommu->cap_ptr;
 	u32 range, misc;
+	int i, j;
 
 	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
 			      &iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 					MMIO_GET_LD(range));
 	iommu->evt_msi_num = MMIO_MSI_NUM(misc);
 
-	if (is_rd890_iommu(iommu->dev)) {
-		pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
-		pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
-		pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
-		pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
-	}
+	if (!is_rd890_iommu(iommu->dev))
+		return;
+
+	/*
+	 * Some rd890 systems may not be fully reconfigured by the BIOS, so
+	 * it's necessary for us to store this information so it can be
+	 * reprogrammed on resume
+	 */
+
+	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			      &iommu->stored_addr_lo);
+	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+			      &iommu->stored_addr_hi);
+
+	/* Low bit locks writes to configuration space */
+	iommu->stored_addr_lo &= ~1;
+
+	for (i = 0; i < 6; i++)
+		for (j = 0; j < 0x12; j++)
+			iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+	for (i = 0; i < 0x83; i++)
+		iommu->stored_l2[i] = iommu_read_l2(iommu, i);
 }
 
 /*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
 	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
 }
 
-static void iommu_apply_quirks(struct amd_iommu *iommu)
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
 {
-	if (is_rd890_iommu(iommu->dev)) {
-		pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
-		pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
-		pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
-		pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
-	}
+	int i, j;
+	u32 ioc_feature_control;
+	struct pci_dev *pdev = NULL;
+
+	/* RD890 BIOSes may not have completely reconfigured the iommu */
+	if (!is_rd890_iommu(iommu->dev))
+		return;
+
+	/*
+	 * First, we need to ensure that the iommu is enabled. This is
+	 * controlled by a register in the northbridge
+	 */
+	pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+	if (!pdev)
+		return;
+
+	/* Select Northbridge indirect register 0x75 and enable writing */
+	pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+	pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+	/* Enable the iommu */
+	if (!(ioc_feature_control & 0x1))
+		pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+	pci_dev_put(pdev);
+
+	/* Restore the iommu BAR */
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			       iommu->stored_addr_lo);
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+			       iommu->stored_addr_hi);
+
+	/* Restore the l1 indirect regs for each of the 6 l1s */
+	for (i = 0; i < 6; i++)
+		for (j = 0; j < 0x12; j++)
+			iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+	/* Restore the l2 indirect regs */
+	for (i = 0; i < 0x83; i++)
+		iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+	/* Lock PCI setup registers */
+	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+			       iommu->stored_addr_lo | 1);
 }
 
 /*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
 
 	for_each_iommu(iommu) {
 		iommu_disable(iommu);
-		iommu_apply_quirks(iommu);
 		iommu_init_flags(iommu);
 		iommu_set_device_table(iommu);
 		iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
 
 static int amd_iommu_resume(struct sys_device *dev)
 {
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		iommu_apply_resume_quirks(iommu);
+
 	/* re-load the hardware */
 	enable_iommus();
 
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c
index 0f7bc20cfcd..8f6463d8ed0 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -8,21 +8,19 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <asm/k8.h>
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
+#include <asm/amd_nb.h>
 
 static u32 *flush_words;
 
 struct pci_device_id k8_nb_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
 	{}
 };
 EXPORT_SYMBOL(k8_nb_ids);
 
-struct pci_dev **k8_northbridges;
+struct k8_northbridge_info k8_northbridges;
 EXPORT_SYMBOL(k8_northbridges);
 
 static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
@@ -40,36 +38,45 @@ int cache_k8_northbridges(void)
 	int i;
 	struct pci_dev *dev;
 
-	if (num_k8_northbridges)
+	if (k8_northbridges.num)
 		return 0;
 
 	dev = NULL;
 	while ((dev = next_k8_northbridge(dev)) != NULL)
-		num_k8_northbridges++;
+		k8_northbridges.num++;
+
+	/* some CPU families (e.g. family 0x11) do not support GART */
+	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+	    boot_cpu_data.x86 == 0x15)
+		k8_northbridges.gart_supported = 1;
 
-	k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
-				  GFP_KERNEL);
-	if (!k8_northbridges)
+	k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
+					  sizeof(void *), GFP_KERNEL);
+	if (!k8_northbridges.nb_misc)
 		return -ENOMEM;
 
-	if (!num_k8_northbridges) {
-		k8_northbridges[0] = NULL;
+	if (!k8_northbridges.num) {
+		k8_northbridges.nb_misc[0] = NULL;
 		return 0;
 	}
 
-	flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
-	if (!flush_words) {
-		kfree(k8_northbridges);
-		return -ENOMEM;
+	if (k8_northbridges.gart_supported) {
+		flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
+				      GFP_KERNEL);
+		if (!flush_words) {
+			kfree(k8_northbridges.nb_misc);
+			return -ENOMEM;
+		}
 	}
 
 	dev = NULL;
 	i = 0;
 	while ((dev = next_k8_northbridge(dev)) != NULL) {
-		k8_northbridges[i] = dev;
-		pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
+		k8_northbridges.nb_misc[i] = dev;
+		if (k8_northbridges.gart_supported)
+			pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
 	}
-	k8_northbridges[i] = NULL;
+	k8_northbridges.nb_misc[i] = NULL;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(cache_k8_northbridges);
@@ -93,22 +100,25 @@ void k8_flush_garts(void)
 	unsigned long flags;
 	static DEFINE_SPINLOCK(gart_lock);
 
+	if (!k8_northbridges.gart_supported)
+		return;
+
 	/* Avoid races between AGP and IOMMU. In theory it's not needed
 	   but I'm not sure if the hardware won't lose flush requests
 	   when another is pending. This whole thing is so expensive anyways
 	   that it doesn't matter to serialize more. -AK */
 	spin_lock_irqsave(&gart_lock, flags);
 	flushed = 0;
-	for (i = 0; i < num_k8_northbridges; i++) {
-		pci_write_config_dword(k8_northbridges[i], 0x9c,
+	for (i = 0; i < k8_northbridges.num; i++) {
+		pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
 				       flush_words[i]|1);
 		flushed++;
 	}
-	for (i = 0; i < num_k8_northbridges; i++) {
+	for (i = 0; i < k8_northbridges.num; i++) {
 		u32 w;
 		/* Make sure the hardware actually executed the flush*/
 		for (;;) {
-			pci_read_config_dword(k8_northbridges[i],
+			pci_read_config_dword(k8_northbridges.nb_misc[i],
 					      0x9c, &w);
 			if (!(w & 1))
 				break;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 8dd77800ff5..6fe2b5cb4f3 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -343,7 +343,7 @@ void apbt_setup_secondary_clock(void)
 
 	/* Don't register boot CPU clockevent */
 	cpu = smp_processor_id();
-	if (cpu == boot_cpu_id)
+	if (!cpu)
 		return;
 	/*
 	 * We need to calculate the scaled math multiplication factor for
@@ -398,7 +398,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 		}
 		break;
 	default:
-		pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+		pr_debug("APBT notified %lu, no action\n", action);
 	}
 	return NOTIFY_OK;
 }
@@ -552,7 +552,7 @@ bad_count:
 		pr_debug("APB CS going back %lx:%lx:%lx ",
 			 t2, last_read, t2 - last_read);
 bad_count_x3:
-		pr_debug(KERN_INFO "tripple check enforced\n");
+		pr_debug("triple check enforced\n");
 		t0 = apbt_readl(phy_cs_timer_id,
 				APBTMR_N_CURRENT_VALUE);
 		udelay(1);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e1..377f5db3b8b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,7 +27,7 @@
 #include <asm/gart.h>
 #include <asm/pci-direct.h>
 #include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 #include <asm/x86_init.h>
 
 int gart_iommu_aperture;
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
 				continue;
 
 			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-			aper_enabled = ctl & AMD64_GARTEN;
+			aper_enabled = ctl & GARTEN;
 			aper_order = (ctl >> 1) & 7;
 			aper_size = (32 * 1024 * 1024) << aper_order;
 			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
 				continue;
 
 			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-			ctl &= ~AMD64_GARTEN;
+			ctl &= ~GARTEN;
 			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
 		}
 	}
@@ -505,8 +505,13 @@ out:
 
 	/* Fix up the north bridges */
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
-		int bus;
-		int dev_base, dev_limit;
+		int bus, dev_base, dev_limit;
+
+		/*
+		 * Don't enable translation yet but enable GART IO and CPU
+		 * accesses and set DISTLBWALKPRB since GART table memory is UC.
+		 */
+		u32 ctl = DISTLBWALKPRB | aper_order << 1;
 
 		bus = bus_dev_ranges[i].bus;
 		dev_base = bus_dev_ranges[i].dev_base;
@@ -515,10 +520,7 @@ out:
 			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
 				continue;
 
-			/* Don't enable translation yet. That is done later.
-			   Assume this BIOS didn't initialise the GART so
-			   just overwrite all previous bits */
-			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
 			write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
 		}
 	}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e3b534cda49..8cf86fb3b4e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1665,10 +1665,7 @@ int __init APIC_init_uniprocessor(void)
 	}
 #endif
 
-#ifndef CONFIG_SMP
-	enable_IR_x2apic();
 	default_setup_apic_routing();
-#endif
 
 	verify_local_APIC();
 	connect_bsp_APIC();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c5b8f3dddb..9508811e844 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -162,7 +162,7 @@ int __init arch_early_irq_init(void)
 
 	cfg = irq_cfgx;
 	count = ARRAY_SIZE(irq_cfgx);
-	node= cpu_to_node(boot_cpu_id);
+	node = cpu_to_node(0);
 
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
@@ -1382,21 +1382,7 @@ int setup_ioapic_entry(int apic_id, int irq,
 		if (index < 0)
 			panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
 
-		memset(&irte, 0, sizeof(irte));
-
-		irte.present = 1;
-		irte.dst_mode = apic->irq_dest_mode;
-		/*
-		 * Trigger mode in the IRTE will always be edge, and the
-		 * actual level or edge trigger will be setup in the IO-APIC
-		 * RTE. This will help simplify level triggered irq migration.
-		 * For more details, see the comments above explainig IO-APIC
-		 * irq migration in the presence of interrupt-remapping.
-		 */
-		irte.trigger_mode = 0;
-		irte.dlvry_mode = apic->irq_delivery_mode;
-		irte.vector = vector;
-		irte.dest_id = IRTE_DEST(destination);
+		prepare_irte(&irte, vector, destination);
 
 		/* Set source-id of interrupt request */
 		set_ioapic_sid(&irte, apic_id);
@@ -1488,7 +1474,7 @@ static void __init setup_IO_APIC_irqs(void)
 	int notcon = 0;
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
-	int node = cpu_to_node(boot_cpu_id);
+	int node = cpu_to_node(0);
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1553,7 +1539,7 @@ static void __init setup_IO_APIC_irqs(void)
 void setup_IO_APIC_irq_extra(u32 gsi)
 {
 	int apic_id = 0, pin, idx, irq;
-	int node = cpu_to_node(boot_cpu_id);
+	int node = cpu_to_node(0);
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
 
@@ -2932,7 +2918,7 @@ static inline void __init check_timer(void)
 {
 	struct irq_desc *desc = irq_to_desc(0);
 	struct irq_cfg *cfg = desc->chip_data;
-	int node = cpu_to_node(boot_cpu_id);
+	int node = cpu_to_node(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	int no_pin1 = 0;
@@ -3286,7 +3272,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
 
 int create_irq(void)
 {
-	int node = cpu_to_node(boot_cpu_id);
+	int node = cpu_to_node(0);
 	unsigned int irq_want;
 	int irq;
 
@@ -3340,14 +3326,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 		ir_index = map_irq_to_irte_handle(irq, &sub_handle);
 		BUG_ON(ir_index == -1);
 
-		memset (&irte, 0, sizeof(irte));
-
-		irte.present = 1;
-		irte.dst_mode = apic->irq_dest_mode;
-		irte.trigger_mode = 0; /* edge */
-		irte.dlvry_mode = apic->irq_delivery_mode;
-		irte.vector = cfg->vector;
-		irte.dest_id = IRTE_DEST(dest);
+		prepare_irte(&irte, cfg->vector, dest);
 
 		/* Set source-id of interrupt request */
 		if (pdev)
@@ -3908,7 +3887,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
 	if (dev)
 		node = dev_to_node(dev);
 	else
-		node = cpu_to_node(boot_cpu_id);
+		node = cpu_to_node(0);
 
 	desc = irq_to_desc_alloc_node(irq, node);
 	if (!desc) {
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e..f9e4e6a5407 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
  */
 void __init default_setup_apic_routing(void)
 {
+
+	enable_IR_x2apic();
+
 #ifdef CONFIG_X86_X2APIC
 	if (x2apic_mode
 #ifdef CONFIG_X86_UV
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ba5f62f45f0..9e093f8fe78 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	/* calling is from identify_secondary_cpu() ? */
-	if (c->cpu_index == boot_cpu_id)
+	if (!c->cpu_index)
 		return;
 
 	/*
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid)
 #endif
 
 /*
- * Fixup core topology information for AMD multi-node processors.
- * Assumption: Number of cores in each internal node is the same.
+ * Fixup core topology information for
+ * (1) AMD multi-node processors
+ *     Assumption: Number of cores in each internal node is the same.
+ * (2) AMD processors supporting compute units
  */
 #ifdef CONFIG_X86_HT
-static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
 {
-	unsigned long long value;
-	u32 nodes, cores_per_node;
+	u32 nodes;
+	u8 node_id;
 	int cpu = smp_processor_id();
 
-	if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
-		return;
+	/* get information required for multi-node processors */
+	if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+		u32 eax, ebx, ecx, edx;
 
-	/* fixup topology information only once for a core */
-	if (cpu_has(c, X86_FEATURE_AMD_DCM))
-		return;
+		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+		nodes = ((ecx >> 8) & 7) + 1;
+		node_id = ecx & 7;
 
-	rdmsrl(MSR_FAM10H_NODE_ID, value);
+		/* get compute unit information */
+		smp_num_siblings = ((ebx >> 8) & 3) + 1;
+		c->compute_unit_id = ebx & 0xff;
+	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+		u64 value;
 
-	nodes = ((value >> 3) & 7) + 1;
-	if (nodes == 1)
+		rdmsrl(MSR_FAM10H_NODE_ID, value);
+		nodes = ((value >> 3) & 7) + 1;
+		node_id = value & 7;
+	} else
 		return;
 
-	set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-	cores_per_node = c->x86_max_cores / nodes;
+	/* fixup multi-node processor information */
+	if (nodes > 1) {
+		u32 cores_per_node;
+
+		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+		cores_per_node = c->x86_max_cores / nodes;
 
-	/* store NodeID, use llc_shared_map to store sibling info */
-	per_cpu(cpu_llc_id, cpu) = value & 7;
+		/* store NodeID, use llc_shared_map to store sibling info */
+		per_cpu(cpu_llc_id, cpu) = node_id;
 
-	/* fixup core id to be in range from 0 to (cores_per_node - 1) */
-	c->cpu_core_id = c->cpu_core_id % cores_per_node;
+		/* core id to be in range from 0 to (cores_per_node - 1) */
+		c->cpu_core_id = c->cpu_core_id % cores_per_node;
+	}
 }
 #endif
 
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
 	c->phys_proc_id = c->initial_apicid >> bits;
 	/* use socket ID also for last level cache */
 	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-	/* fixup topology information on multi-node processors */
-	if ((c->x86 == 0x10) && (c->x86_model == 9))
-		amd_fixup_dcm(c);
+	amd_get_topology(c);
 #endif
 }
 
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
 	}
 #endif
+
+	/* We need to do the following only once */
+	if (c != &boot_cpu_data)
+		return;
+
+	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+
+		if (c->x86 > 0x10 ||
+		    (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+			u64 val;
+
+			rdmsrl(MSR_K7_HWCR, val);
+			if (!(val & BIT(24)))
+				printk(KERN_WARNING FW_BUG "TSC doesn't count "
+					"with P0 frequency!\n");
+		}
+	}
 }
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 #endif
 
 	if (c->extended_cpuid_level >= 0x80000006) {
-		if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
+		if (cpuid_edx(0x80000006) & 0xf000)
 			num_cache_leaves = 4;
 		else
 			num_cache_leaves = 3;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2f9ac7da25..4b68bda3093 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 		this_cpu->c_early_init(c);
 
 #ifdef CONFIG_SMP
-	c->cpu_index = boot_cpu_id;
+	c->cpu_index = 0;
 #endif
 	filter_cpuid_features(c, false);
 }
@@ -704,16 +704,21 @@ void __init early_cpu_init(void)
 }
 
 /*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect.  In the latter case it doesn't even *fail*
- * reliably, so probing for it doesn't even work.  Disable it completely
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
  * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
  */
 static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
 {
+#ifdef CONFIG_X86_32
 	clear_cpu_cap(c, X86_FEATURE_NOPL);
+#else
+	set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
 }
 
 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	/*
-	 * Force FPU initialization:
-	 */
-	current_thread_info()->status = 0;
-	clear_used_math();
-	mxcsr_feature_mask_init();
-
 	fpu_init();
 	xsave_init();
 }
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d4..e765633f210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
 extern const struct cpu_dev *const __x86_cpu_dev_start[],
 			    *const __x86_cpu_dev_end[];
 
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
 extern void get_cpu_cap(struct cpuinfo_x86 *c);
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b4389441efb..695f17731e2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -170,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	/* calling is from identify_secondary_cpu() ? */
-	if (c->cpu_index == boot_cpu_id)
+	if (!c->cpu_index)
 		return;
 
 	/*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 898c2f4eab8..12cd823c8d0 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
 
 #include <asm/processor.h>
 #include <linux/smp.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 #include <asm/smp.h>
 
 #define LVL_1_INST	1
@@ -306,7 +306,7 @@ struct _cache_attr {
 	ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
 };
 
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
 
 /*
  * L3 cache descriptors
@@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
 			return;
 
 	/* not in virtualized environments */
-	if (num_k8_northbridges == 0)
+	if (k8_northbridges.num == 0)
 		return;
 
 	/*
@@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
 	 * never freed but this is done only on shutdown so it doesn't matter.
 	 */
 	if (!l3_caches) {
-		int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+		int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
 
 		l3_caches = kzalloc(size, GFP_ATOMIC);
 		if (!l3_caches)
@@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
 static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
 		show_cache_disable_1, store_cache_disable_1);
 
-#else	/* CONFIG_CPU_SUP_AMD */
+#else	/* CONFIG_AMD_NB */
 static void __cpuinit
 amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
 {
 };
-#endif /* CONFIG_CPU_SUP_AMD */
+#endif /* CONFIG_AMD_NB */
 
 static int
 __cpuinit cpuid4_cache_lookup_regs(int index,
@@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = {
 
 static struct attribute *default_l3_attrs[] = {
 	DEFAULT_SYSFS_CACHE_ATTRS,
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
 	&cache_disable_0.attr,
 	&cache_disable_1.attr,
 #endif
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f..4b683267eca 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -350,7 +350,7 @@ static void intel_thermal_interrupt(void)
 
 static void unexpected_thermal_interrupt(void)
 {
-	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
+	printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
 			smp_processor_id());
 	add_taint(TAINT_MACHINE_CHECK);
 }
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d07142..ac140c7be39 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
 
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		return 0;
-	if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+	if (boot_cpu_data.x86 < 0xf)
 		return 0;
 	/* In case some hypervisor doesn't pass SYSCFG through: */
 	if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d0388..9f27228ceff 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
 	}
 }
 
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+	u64 size;
+
+	mask >>= PAGE_SHIFT;
+	mask |= size_or_mask;
+	size = -mask;
+	size <<= PAGE_SHIFT;
+	return size;
+}
+
 /*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+ * Check and return the effective type for MTRR-MTRR type overlap.
+ * Returns 1 if the effective type is UNCACHEABLE, else returns 0
  */
-u8 mtrr_type_lookup(u64 start, u64 end)
+static int check_type_overlap(u8 *prev, u8 *curr)
+{
+	if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
+		*prev = MTRR_TYPE_UNCACHABLE;
+		*curr = MTRR_TYPE_UNCACHABLE;
+		return 1;
+	}
+
+	if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
+	    (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
+		*prev = MTRR_TYPE_WRTHROUGH;
+		*curr = MTRR_TYPE_WRTHROUGH;
+	}
+
+	if (*prev != *curr) {
+		*prev = MTRR_TYPE_UNCACHABLE;
+		*curr = MTRR_TYPE_UNCACHABLE;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Error/Semi-error returns:
+ * 0xFF - when MTRR is not enabled
+ * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
+ *		corresponds only to [start:*partial_end].
+ *		Caller has to lookup again for [*partial_end:end].
+ */
+static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
 {
 	int i;
 	u64 base, mask;
 	u8 prev_match, curr_match;
 
+	*repeat = 0;
 	if (!mtrr_state_set)
 		return 0xFF;
 
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 
 		start_state = ((start & mask) == (base & mask));
 		end_state = ((end & mask) == (base & mask));
-		if (start_state != end_state)
-			return 0xFE;
+
+		if (start_state != end_state) {
+			/*
+			 * We have start:end spanning across an MTRR.
+			 * We split the region into
+			 * either
+			 * (start:mtrr_end) (mtrr_end:end)
+			 * or
+			 * (start:mtrr_start) (mtrr_start:end)
+			 * depending on kind of overlap.
+			 * Return the type for first region and a pointer to
+			 * the start of second region so that caller will
+			 * lookup again on the second region.
+			 * Note: This way we handle multiple overlaps as well.
+			 */
+			if (start_state)
+				*partial_end = base + get_mtrr_size(mask);
+			else
+				*partial_end = base;
+
+			if (unlikely(*partial_end <= start)) {
+				WARN_ON(1);
+				*partial_end = start + PAGE_SIZE;
+			}
+
+			end = *partial_end - 1; /* end is inclusive */
+			*repeat = 1;
+		}
 
 		if ((start & mask) != (base & mask))
 			continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 			continue;
 		}
 
-		if (prev_match == MTRR_TYPE_UNCACHABLE ||
-		    curr_match == MTRR_TYPE_UNCACHABLE) {
-			return MTRR_TYPE_UNCACHABLE;
-		}
-
-		if ((prev_match == MTRR_TYPE_WRBACK &&
-		     curr_match == MTRR_TYPE_WRTHROUGH) ||
-		    (prev_match == MTRR_TYPE_WRTHROUGH &&
-		     curr_match == MTRR_TYPE_WRBACK)) {
-			prev_match = MTRR_TYPE_WRTHROUGH;
-			curr_match = MTRR_TYPE_WRTHROUGH;
-		}
-
-		if (prev_match != curr_match)
-			return MTRR_TYPE_UNCACHABLE;
+		if (check_type_overlap(&prev_match, &curr_match))
+			return curr_match;
 	}
 
 	if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 	return mtrr_state.def_type;
 }
 
+/*
+ * Returns the effective MTRR type for the region
+ * Error return:
+ * 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+	u8 type, prev_type;
+	int repeat;
+	u64 partial_end;
+
+	type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+	/*
+	 * Common path is with repeat = 0.
+	 * However, we can have cases where [start:end] spans across some
+	 * MTRR range. Do repeated lookups for that case here.
+	 */
+	while (repeat) {
+		prev_type = type;
+		start = partial_end;
+		type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+		if (check_type_overlap(&prev_type, &type))
+			return type;
+	}
+
+	return type;
+}
+
 /* Get the MSR pair relating to a var range */
 static void
 get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 03a5b0385ad..fe73c1844a9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -531,7 +531,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
 /*
  * Setup the hardware configuration for a given attr_type
  */
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
 {
 	int err;
 
@@ -584,7 +584,7 @@ static void x86_pmu_disable_all(void)
 	}
 }
 
-void hw_perf_disable(void)
+static void x86_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -619,7 +619,7 @@ static void x86_pmu_enable_all(int added)
 	}
 }
 
-static const struct pmu pmu;
+static struct pmu pmu;
 
 static inline int is_x86_event(struct perf_event *event)
 {
@@ -801,10 +801,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
 		hwc->last_tag == cpuc->tags[i];
 }
 
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
+static void x86_pmu_start(struct perf_event *event, int flags);
+static void x86_pmu_stop(struct perf_event *event, int flags);
 
-void hw_perf_enable(void)
+static void x86_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct perf_event *event;
@@ -840,7 +840,14 @@ void hw_perf_enable(void)
 			    match_prev_assignment(hwc, cpuc, i))
 				continue;
 
-			x86_pmu_stop(event);
+			/*
+			 * Ensure we don't accidentally enable a stopped
+			 * counter simply because we rescheduled.
+			 */
+			if (hwc->state & PERF_HES_STOPPED)
+				hwc->state |= PERF_HES_ARCH;
+
+			x86_pmu_stop(event, PERF_EF_UPDATE);
 		}
 
 		for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +859,10 @@ void hw_perf_enable(void)
 			else if (i < n_running)
 				continue;
 
-			x86_pmu_start(event);
+			if (hwc->state & PERF_HES_ARCH)
+				continue;
+
+			x86_pmu_start(event, PERF_EF_RELOAD);
 		}
 		cpuc->n_added = 0;
 		perf_events_lapic_init();
@@ -953,15 +963,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
 }
 
 /*
- * activate a single event
+ * Add a single event to the PMU.
  *
  * The event is added to the group of enabled events
  * but only if it can be scehduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
  */
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc;
@@ -970,58 +977,67 @@ static int x86_pmu_enable(struct perf_event *event)
 
 	hwc = &event->hw;
 
+	perf_pmu_disable(event->pmu);
 	n0 = cpuc->n_events;
-	n = collect_events(cpuc, event, false);
-	if (n < 0)
-		return n;
+	ret = n = collect_events(cpuc, event, false);
+	if (ret < 0)
+		goto out;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
 
 	/*
 	 * If group events scheduling transaction was started,
 	 * skip the schedulability test here, it will be peformed
-	 * at commit time(->commit_txn) as a whole
+	 * at commit time (->commit_txn) as a whole
 	 */
 	if (cpuc->group_flag & PERF_EVENT_TXN)
-		goto out;
+		goto done_collect;
 
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
-		return ret;
+		goto out;
 	/*
 	 * copy new assignment, now we know it is possible
 	 * will be used by hw_perf_enable()
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
-out:
+done_collect:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
 	cpuc->n_txn += n - n0;
 
-	return 0;
+	ret = 0;
+out:
+	perf_pmu_enable(event->pmu);
+	return ret;
 }
 
-static int x86_pmu_start(struct perf_event *event)
+static void x86_pmu_start(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx = event->hw.idx;
 
-	if (idx == -1)
-		return -EAGAIN;
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	if (WARN_ON_ONCE(idx == -1))
+		return;
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+		x86_perf_event_set_period(event);
+	}
+
+	event->hw.state = 0;
 
-	x86_perf_event_set_period(event);
 	cpuc->events[idx] = event;
 	__set_bit(idx, cpuc->active_mask);
 	__set_bit(idx, cpuc->running);
 	x86_pmu.enable(event);
 	perf_event_update_userpage(event);
-
-	return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
-	int ret = x86_pmu_start(event);
-	WARN_ON_ONCE(ret);
 }
 
 void perf_event_print_debug(void)
@@ -1078,27 +1094,29 @@ void perf_event_print_debug(void)
 	local_irq_restore(flags);
 }
 
-static void x86_pmu_stop(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
 
-	if (!__test_and_clear_bit(idx, cpuc->active_mask))
-		return;
-
-	x86_pmu.disable(event);
-
-	/*
-	 * Drain the remaining delta count out of a event
-	 * that we are disabling:
-	 */
-	x86_perf_event_update(event);
+	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+		x86_pmu.disable(event);
+		cpuc->events[hwc->idx] = NULL;
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+	}
 
-	cpuc->events[idx] = NULL;
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of a event
+		 * that we are disabling:
+		 */
+		x86_perf_event_update(event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
 }
 
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int i;
@@ -1111,7 +1129,7 @@ static void x86_pmu_disable(struct perf_event *event)
 	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
-	x86_pmu_stop(event);
+	x86_pmu_stop(event, PERF_EF_UPDATE);
 
 	for (i = 0; i < cpuc->n_events; i++) {
 		if (event == cpuc->event_list[i]) {
@@ -1134,7 +1152,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 	struct perf_sample_data data;
 	struct cpu_hw_events *cpuc;
 	struct perf_event *event;
-	struct hw_perf_event *hwc;
 	int idx, handled = 0;
 	u64 val;
 
@@ -1155,7 +1172,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 		}
 
 		event = cpuc->events[idx];
-		hwc = &event->hw;
 
 		val = x86_perf_event_update(event);
 		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1187,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 			continue;
 
 		if (perf_event_overflow(event, 1, &data, regs))
-			x86_pmu_stop(event);
+			x86_pmu_stop(event, 0);
 	}
 
 	if (handled)
@@ -1180,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 	return handled;
 }
 
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	ack_APIC_irq();
-	inc_irq_stat(apic_pending_irqs);
-	perf_event_do_pending();
-	irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (!x86_pmu.apic || !x86_pmu_initialized())
-		return;
-
-	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
 void perf_events_lapic_init(void)
 {
 	if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1388,7 +1385,6 @@ void __init init_hw_perf_events(void)
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 	}
 	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-	perf_max_events = x86_pmu.num_counters;
 
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,6 +1420,7 @@ void __init init_hw_perf_events(void)
 	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
 	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
+	perf_pmu_register(&pmu);
 	perf_cpu_notifier(x86_pmu_notifier);
 }
 
@@ -1437,10 +1434,11 @@ static inline void x86_pmu_read(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  */
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+	perf_pmu_disable(pmu);
 	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
@@ -1450,7 +1448,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
  * Clear the flag and pmu::enable() will perform the
  * schedulability test.
  */
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -1460,6 +1458,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 	 */
 	cpuc->n_added -= cpuc->n_txn;
 	cpuc->n_events -= cpuc->n_txn;
+	perf_pmu_enable(pmu);
 }
 
 /*
@@ -1467,7 +1466,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
  * Perform the group schedulability test as a whole
  * Return 0 if success
  */
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1488,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN;
-
+	perf_pmu_enable(pmu);
 	return 0;
 }
 
-static const struct pmu pmu = {
-	.enable		= x86_pmu_enable,
-	.disable	= x86_pmu_disable,
-	.start		= x86_pmu_start,
-	.stop		= x86_pmu_stop,
-	.read		= x86_pmu_read,
-	.unthrottle	= x86_pmu_unthrottle,
-	.start_txn	= x86_pmu_start_txn,
-	.cancel_txn	= x86_pmu_cancel_txn,
-	.commit_txn	= x86_pmu_commit_txn,
-};
-
 /*
  * validate that we can schedule this event
  */
@@ -1579,12 +1566,22 @@ out:
 	return ret;
 }
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
 {
-	const struct pmu *tmp;
+	struct pmu *tmp;
 	int err;
 
-	err = __hw_perf_event_init(event);
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
+	err = __x86_pmu_event_init(event);
 	if (!err) {
 		/*
 		 * we temporarily connect event to its pmu
@@ -1604,26 +1601,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	if (err) {
 		if (event->destroy)
 			event->destroy(event);
-		return ERR_PTR(err);
 	}
 
-	return &pmu;
+	return err;
 }
 
-/*
- * callchain support
- */
+static struct pmu pmu = {
+	.pmu_enable	= x86_pmu_enable,
+	.pmu_disable	= x86_pmu_disable,
 
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-	if (entry->nr < PERF_MAX_STACK_DEPTH)
-		entry->ip[entry->nr++] = ip;
-}
+	.event_init	= x86_pmu_event_init,
 
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+	.add		= x86_pmu_add,
+	.del		= x86_pmu_del,
+	.start		= x86_pmu_start,
+	.stop		= x86_pmu_stop,
+	.read		= x86_pmu_read,
 
+	.start_txn	= x86_pmu_start_txn,
+	.cancel_txn	= x86_pmu_cancel_txn,
+	.commit_txn	= x86_pmu_commit_txn,
+};
+
+/*
+ * callchain support
+ */
 
 static void
 backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
@@ -1645,7 +1647,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	callchain_store(entry, addr);
+	perf_callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops backtrace_ops = {
@@ -1656,11 +1658,15 @@ static const struct stacktrace_ops backtrace_ops = {
 	.walk_stack		= print_context_stack_bp,
 };
 
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	callchain_store(entry, PERF_CONTEXT_KERNEL);
-	callchain_store(entry, regs->ip);
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* TODO: We don't support guest os callchain now */
+		return;
+	}
+
+	perf_callchain_store(entry, regs->ip);
 
 	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
 }
@@ -1689,7 +1695,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		if (fp < compat_ptr(regs->sp))
 			break;
 
-		callchain_store(entry, frame.return_address);
+		perf_callchain_store(entry, frame.return_address);
 		fp = compat_ptr(frame.next_frame);
 	}
 	return 1;
@@ -1702,19 +1708,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 }
 #endif
 
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
 	struct stack_frame frame;
 	const void __user *fp;
 
-	if (!user_mode(regs))
-		regs = task_pt_regs(current);
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* TODO: We don't support guest os callchain now */
+		return;
+	}
 
 	fp = (void __user *)regs->bp;
 
-	callchain_store(entry, PERF_CONTEXT_USER);
-	callchain_store(entry, regs->ip);
+	perf_callchain_store(entry, regs->ip);
 
 	if (perf_callchain_user32(regs, entry))
 		return;
@@ -1731,52 +1738,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		if ((unsigned long)fp < regs->sp)
 			break;
 
-		callchain_store(entry, frame.return_address);
+		perf_callchain_store(entry, frame.return_address);
 		fp = frame.next_frame;
 	}
 }
 
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-	int is_user;
-
-	if (!regs)
-		return;
-
-	is_user = user_mode(regs);
-
-	if (is_user && current->state != TASK_RUNNING)
-		return;
-
-	if (!is_user)
-		perf_callchain_kernel(regs, entry);
-
-	if (current->mm)
-		perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-	struct perf_callchain_entry *entry;
-
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		/* TODO: We don't support guest os callchain now */
-		return NULL;
-	}
-
-	if (in_nmi())
-		entry = &__get_cpu_var(pmc_nmi_entry);
-	else
-		entry = &__get_cpu_var(pmc_irq_entry);
-
-	entry->nr = 0;
-
-	perf_do_callchain(regs, entry);
-
-	return entry;
-}
-
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3..46d58448c3a 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */
+		[ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
-		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+		[ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d..c8f5c088cad 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	struct cpu_hw_events *cpuc;
 	int bit, loops;
 	u64 status;
-	int handled = 0;
+	int handled;
 
 	perf_sample_data_init(&data, 0);
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
 	intel_pmu_disable_all();
-	intel_pmu_drain_bts_buffer();
+	handled = intel_pmu_drain_bts_buffer();
 	status = intel_pmu_get_status();
 	if (!status) {
 		intel_pmu_enable_all(0);
-		return 0;
+		return handled;
 	}
 
 	loops = 0;
@@ -763,7 +763,7 @@ again:
 		data.period = event->hw.last_period;
 
 		if (perf_event_overflow(event, 1, &data, regs))
-			x86_pmu_stop(event);
+			x86_pmu_stop(event, 0);
 	}
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311c..4977f9c400e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void)
 	update_debugctlmsr(debugctlmsr);
 }
 
-static void intel_pmu_drain_bts_buffer(void)
+static int intel_pmu_drain_bts_buffer(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
@@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void)
 	struct pt_regs regs;
 
 	if (!event)
-		return;
+		return 0;
 
 	if (!ds)
-		return;
+		return 0;
 
 	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
 	top = (struct bts_record *)(unsigned long)ds->bts_index;
 
 	if (top <= at)
-		return;
+		return 0;
 
 	ds->bts_index = ds->bts_buffer_base;
 
@@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void)
 	perf_prepare_sample(&header, &data, event, &regs);
 
 	if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
-		return;
+		return 1;
 
 	for (; at < top; at++) {
 		data.ip		= at->from;
@@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void)
 	/* There's new data available. */
 	event->hw.interrupts++;
 	event->pending_kill = POLL_IN;
+	return 1;
 }
 
 /*
@@ -491,7 +492,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
 	if (perf_event_overflow(event, 1, &data, &regs))
-		x86_pmu_stop(event);
+		x86_pmu_stop(event, 0);
 }
 
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 24901517399..81400b93e69 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,8 @@
 struct p4_event_bind {
 	unsigned int opcode;			/* Event code and ESCR selector */
 	unsigned int escr_msr[2];		/* ESCR MSR for this event */
+	unsigned int escr_emask;		/* valid ESCR EventMask bits */
+	unsigned int shared;			/* event is shared across threads */
 	char cntr[2][P4_CNTR_LIMIT];		/* counter index (offset), -1 on abscence */
 };
 
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
 	[P4_EVENT_TC_DELIVER_MODE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
 		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+		.shared		= 1,
 		.cntr		= { {4, 5, -1}, {6, 7, -1} },
 	},
 	[P4_EVENT_BPU_FETCH_REQUEST] = {
 		.opcode		= P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
 		.escr_msr	= { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_ITLB_REFERENCE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
 		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_MEMORY_CANCEL] = {
 		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
 		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_MEMORY_COMPLETE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
 		.escr_msr	= { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_LOAD_PORT_REPLAY] = {
 		.opcode		= P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
 		.escr_msr	= { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_STORE_PORT_REPLAY] = {
 		.opcode		= P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
 		.escr_msr	= { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_MOB_LOAD_REPLAY] = {
 		.opcode		= P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
 		.escr_msr	= { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_PAGE_WALK_TYPE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
 		.escr_msr	= { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+		.shared		= 1,
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_BSQ_CACHE_REFERENCE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_IOQ_ALLOCATION] = {
 		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_IOQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
 		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
 		.escr_msr	= { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
 		.cntr		= { {2, -1, -1}, {3, -1, -1} },
 	},
 	[P4_EVENT_FSB_DATA_ACTIVITY] = {
 		.opcode		= P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+		.shared		= 1,
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_BSQ_ALLOCATION] = {		/* shared ESCR, broken CCCR1 */
 		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
 		.cntr		= { {0, -1, -1}, {1, -1, -1} },
 	},
 	[P4_EVENT_BSQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
 		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
 		.escr_msr	= { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
 		.cntr		= { {2, -1, -1}, {3, -1, -1} },
 	},
 	[P4_EVENT_SSE_INPUT_ASSIST] = {
 		.opcode		= P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_PACKED_SP_UOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_PACKED_DP_UOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_SCALAR_SP_UOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_SCALAR_DP_UOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_64BIT_MMX_UOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_128BIT_MMX_UOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_X87_FP_UOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_X87_FP_UOP),
 		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_TC_MISC] = {
 		.opcode		= P4_OPCODE(P4_EVENT_TC_MISC),
 		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
 		.cntr		= { {4, 5, -1}, {6, 7, -1} },
 	},
 	[P4_EVENT_GLOBAL_POWER_EVENTS] = {
 		.opcode		= P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_TC_MS_XFER] = {
 		.opcode		= P4_OPCODE(P4_EVENT_TC_MS_XFER),
 		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
 		.cntr		= { {4, 5, -1}, {6, 7, -1} },
 	},
 	[P4_EVENT_UOP_QUEUE_WRITES] = {
 		.opcode		= P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
 		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
 		.cntr		= { {4, 5, -1}, {6, 7, -1} },
 	},
 	[P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
 		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
 		.cntr		= { {4, 5, -1}, {6, 7, -1} },
 	},
 	[P4_EVENT_RETIRED_BRANCH_TYPE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
 		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
 		.cntr		= { {4, 5, -1}, {6, 7, -1} },
 	},
 	[P4_EVENT_RESOURCE_STALL] = {
 		.opcode		= P4_OPCODE(P4_EVENT_RESOURCE_STALL),
 		.escr_msr	= { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_WC_BUFFER] = {
 		.opcode		= P4_OPCODE(P4_EVENT_WC_BUFFER),
 		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+		.shared		= 1,
 		.cntr		= { {8, 9, -1}, {10, 11, -1} },
 	},
 	[P4_EVENT_B2B_CYCLES] = {
 		.opcode		= P4_OPCODE(P4_EVENT_B2B_CYCLES),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.escr_emask	= 0,
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_BNR] = {
 		.opcode		= P4_OPCODE(P4_EVENT_BNR),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.escr_emask	= 0,
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_SNOOP] = {
 		.opcode		= P4_OPCODE(P4_EVENT_SNOOP),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.escr_emask	= 0,
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_RESPONSE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_RESPONSE),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.escr_emask	= 0,
 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
 	},
 	[P4_EVENT_FRONT_END_EVENT] = {
 		.opcode		= P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
 		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_EXECUTION_EVENT] = {
 		.opcode		= P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
 		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_REPLAY_EVENT] = {
 		.opcode		= P4_OPCODE(P4_EVENT_REPLAY_EVENT),
 		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_INSTR_RETIRED] = {
 		.opcode		= P4_OPCODE(P4_EVENT_INSTR_RETIRED),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_UOPS_RETIRED] = {
 		.opcode		= P4_OPCODE(P4_EVENT_UOPS_RETIRED),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_UOP_TYPE] = {
 		.opcode		= P4_OPCODE(P4_EVENT_UOP_TYPE),
 		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_BRANCH_RETIRED] = {
 		.opcode		= P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
 		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_MISPRED_BRANCH_RETIRED] = {
 		.opcode		= P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.escr_emask	=
+		P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_X87_ASSIST] = {
 		.opcode		= P4_OPCODE(P4_EVENT_X87_ASSIST),
 		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)			|
+			P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_MACHINE_CLEAR] = {
 		.opcode		= P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
 		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_INSTR_COMPLETED] = {
 		.opcode		= P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.escr_emask	=
+			P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 };
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
 	return config;
 }
 
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+	/* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
+	if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+		if (boot_cpu_data.x86_model != 3 &&
+			boot_cpu_data.x86_model != 4 &&
+			boot_cpu_data.x86_model != 6)
+			return false;
+	}
+
+	/*
+	 * For info
+	 * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+	 */
+
+	return true;
+}
+
 static int p4_validate_raw_event(struct perf_event *event)
 {
-	unsigned int v;
+	unsigned int v, emask;
 
-	/* user data may have out-of-bound event index */
+	/* User data may have out-of-bound event index */
 	v = p4_config_unpack_event(event->attr.config);
-	if (v >= ARRAY_SIZE(p4_event_bind_map)) {
-		pr_warning("P4 PMU: Unknown event code: %d\n", v);
+	if (v >= ARRAY_SIZE(p4_event_bind_map))
+		return -EINVAL;
+
+	/* It may be unsupported: */
+	if (!p4_event_match_cpu_model(v))
 		return -EINVAL;
+
+	/*
+	 * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
+	 * in Architectural Performance Monitoring, it means not
+	 * on _which_ logical cpu to count but rather _when_, ie it
+	 * depends on logical cpu state -- count event if one cpu active,
+	 * none, both or any, so we just allow user to pass any value
+	 * desired.
+	 *
+	 * In turn we always set Tx_OS/Tx_USR bits bound to logical
+	 * cpu without their propagation to another cpu
+	 */
+
+	/*
+	 * if an event is shared accross the logical threads
+	 * the user needs special permissions to be able to use it
+	 */
+	if (p4_event_bind_map[v].shared) {
+		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
 	}
 
+	/* ESCR EventMask bits may be invalid */
+	emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+	if (emask & ~p4_event_bind_map[v].escr_emask)
+		return -EINVAL;
+
 	/*
-	 * it may have some screwed PEBS bits
+	 * it may have some invalid PEBS bits
 	 */
-	if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
-		pr_warning("P4 PMU: PEBS are not supported yet\n");
+	if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
 		return -EINVAL;
-	}
+
 	v = p4_config_unpack_metric(event->attr.config);
-	if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
-		pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+	if (v >= ARRAY_SIZE(p4_pebs_bind_map))
 		return -EINVAL;
-	}
 
 	return 0;
 }
@@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event)
 
 	if (event->attr.type == PERF_TYPE_RAW) {
 
+		/*
+		 * Clear bits we reserve to be managed by kernel itself
+		 * and never allowed from a user space
+		 */
+		 event->attr.config &= P4_CONFIG_MASK;
+
 		rc = p4_validate_raw_event(event);
 		if (rc)
 			goto out;
 
 		/*
-		 * We don't control raw events so it's up to the caller
-		 * to pass sane values (and we don't count the thread number
-		 * on HT machine but allow HT-compatible specifics to be
-		 * passed on)
-		 *
 		 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
 		 * bits since we keep additional info here (for cache events and etc)
-		 *
-		 * XXX: HT wide things should check perf_paranoid_cpu() &&
-		 *      CAP_SYS_ADMIN
 		 */
-		event->hw.config |= event->attr.config &
-			(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-			 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
-
-		event->hw.config &= ~P4_CCCR_FORCE_OVF;
+		event->hw.config |= event->attr.config;
 	}
 
 	rc = x86_setup_perfctr(event);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f849..d9f4ff8fcd6 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
-		if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
-		    boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
-			return;
-		wd_ops = &k7_wd_ops;
-		break;
+		if (boot_cpu_data.x86 == 6 ||
+		    (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
+			wd_ops = &k7_wd_ops;
+		return;
 	case X86_VENDOR_INTEL:
 		/* Work around where perfctr1 doesn't have a working enable
 		 * bit as described in the following errata:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d4907951512..c7f64e6f537 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },
 		{ X86_FEATURE_SVML,		CR_EDX, 2, 0x8000000a, 0 },
 		{ X86_FEATURE_NRIPS,		CR_EDX, 3, 0x8000000a, 0 },
+		{ X86_FEATURE_TSCRATEMSR,	CR_EDX, 4, 0x8000000a, 0 },
+		{ X86_FEATURE_VMCBCLEAN,	CR_EDX, 5, 0x8000000a, 0 },
+		{ X86_FEATURE_FLUSHBYASID,	CR_EDX, 6, 0x8000000a, 0 },
+		{ X86_FEATURE_DECODEASSISTS,	CR_EDX, 7, 0x8000000a, 0 },
+		{ X86_FEATURE_PAUSEFILTER,	CR_EDX,10, 0x8000000a, 0 },
+		{ X86_FEATURE_PFTHRESHOLD,	CR_EDX,12, 0x8000000a, 0 },
 		{ 0, 0, 0, 0, 0 }
 	};
 
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada6..994828899e0 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 	if (!csize)
 		return 0;
 
-	vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
 	if (!vaddr)
 		return -ENOMEM;
 
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 	} else
 		memcpy(buf, vaddr + offset, csize);
 
+	set_iounmap_nonlazy();
 	iounmap(vaddr);
 	return csize;
 }
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf268..76b8cd953de 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
 }
 
 #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
 static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
 {
 	u32 d;
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
 	d &= 0xff;
 	return d;
 }
-#endif
 
 static void __init ati_bugs(int num, int slot, int func)
 {
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ac..4572f25f932 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
 #include <xen/hvc-console.h>
 #include <asm/pci-direct.h>
 #include <asm/fixmap.h>
+#include <asm/mrst.h>
 #include <asm/pgtable.h>
 #include <linux/usb/ehci_def.h>
 
@@ -239,6 +240,18 @@ static int __init setup_early_printk(char *buf)
 		if (!strncmp(buf, "xen", 3))
 			early_console_register(&xenboot_console, keep);
 #endif
+#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+		if (!strncmp(buf, "mrst", 4)) {
+			mrst_early_console_init();
+			early_console_register(&early_mrst_console, keep);
+		}
+
+		if (!strncmp(buf, "hsu", 3)) {
+			hsu_early_console_init();
+			early_console_register(&early_hsu_console, keep);
+		}
+
+#endif
 		buf++;
 	}
 	return 0;
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c
new file mode 100644
index 00000000000..65df603622b
--- /dev/null
+++ b/arch/x86/kernel/early_printk_mrst.c
@@ -0,0 +1,319 @@
+/*
+ * early_printk_mrst.c - early consoles for Intel MID platforms
+ *
+ * Copyright (c) 2008-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * This file implements two early consoles named mrst and hsu.
+ * mrst is based on Maxim3110 spi-uart device, it exists in both
+ * Moorestown and Medfield platforms, while hsu is based on a High
+ * Speed UART device which only exists in the Medfield platform
+ */
+
+#include <linux/serial_reg.h>
+#include <linux/serial_mfd.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/mrst.h>
+
+#define MRST_SPI_TIMEOUT		0x200000
+#define MRST_REGBASE_SPI0		0xff128000
+#define MRST_REGBASE_SPI1		0xff128400
+#define MRST_CLK_SPI0_REG		0xff11d86c
+
+/* Bit fields in CTRLR0 */
+#define SPI_DFS_OFFSET			0
+
+#define SPI_FRF_OFFSET			4
+#define SPI_FRF_SPI			0x0
+#define SPI_FRF_SSP			0x1
+#define SPI_FRF_MICROWIRE		0x2
+#define SPI_FRF_RESV			0x3
+
+#define SPI_MODE_OFFSET			6
+#define SPI_SCPH_OFFSET			6
+#define SPI_SCOL_OFFSET			7
+#define SPI_TMOD_OFFSET			8
+#define	SPI_TMOD_TR			0x0		/* xmit & recv */
+#define SPI_TMOD_TO			0x1		/* xmit only */
+#define SPI_TMOD_RO			0x2		/* recv only */
+#define SPI_TMOD_EPROMREAD		0x3		/* eeprom read mode */
+
+#define SPI_SLVOE_OFFSET		10
+#define SPI_SRL_OFFSET			11
+#define SPI_CFS_OFFSET			12
+
+/* Bit fields in SR, 7 bits */
+#define SR_MASK				0x7f		/* cover 7 bits */
+#define SR_BUSY				(1 << 0)
+#define SR_TF_NOT_FULL			(1 << 1)
+#define SR_TF_EMPT			(1 << 2)
+#define SR_RF_NOT_EMPT			(1 << 3)
+#define SR_RF_FULL			(1 << 4)
+#define SR_TX_ERR			(1 << 5)
+#define SR_DCOL				(1 << 6)
+
+struct dw_spi_reg {
+	u32	ctrl0;
+	u32	ctrl1;
+	u32	ssienr;
+	u32	mwcr;
+	u32	ser;
+	u32	baudr;
+	u32	txfltr;
+	u32	rxfltr;
+	u32	txflr;
+	u32	rxflr;
+	u32	sr;
+	u32	imr;
+	u32	isr;
+	u32	risr;
+	u32	txoicr;
+	u32	rxoicr;
+	u32	rxuicr;
+	u32	msticr;
+	u32	icr;
+	u32	dmacr;
+	u32	dmatdlr;
+	u32	dmardlr;
+	u32	idr;
+	u32	version;
+
+	/* Currently operates as 32 bits, though only the low 16 bits matter */
+	u32	dr;
+} __packed;
+
+#define dw_readl(dw, name)		__raw_readl(&(dw)->name)
+#define dw_writel(dw, name, val)	__raw_writel((val), &(dw)->name)
+
+/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
+static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
+
+static u32 *pclk_spi0;
+/* Always contains an accessable address, start with 0 */
+static struct dw_spi_reg *pspi;
+
+static struct kmsg_dumper dw_dumper;
+static int dumper_registered;
+
+static void dw_kmsg_dump(struct kmsg_dumper *dumper,
+			enum kmsg_dump_reason reason,
+			const char *s1, unsigned long l1,
+			const char *s2, unsigned long l2)
+{
+	int i;
+
+	/* When run to this, we'd better re-init the HW */
+	mrst_early_console_init();
+
+	for (i = 0; i < l1; i++)
+		early_mrst_console.write(&early_mrst_console, s1 + i, 1);
+	for (i = 0; i < l2; i++)
+		early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+}
+
+/* Set the ratio rate to 115200, 8n1, IRQ disabled */
+static void max3110_write_config(void)
+{
+	u16 config;
+
+	config = 0xc001;
+	dw_writel(pspi, dr, config);
+}
+
+/* Translate char to a eligible word and send to max3110 */
+static void max3110_write_data(char c)
+{
+	u16 data;
+
+	data = 0x8000 | c;
+	dw_writel(pspi, dr, data);
+}
+
+void mrst_early_console_init(void)
+{
+	u32 ctrlr0 = 0;
+	u32 spi0_cdiv;
+	u32 freq; /* Freqency info only need be searched once */
+
+	/* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
+	pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+							MRST_CLK_SPI0_REG);
+	spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
+	freq = 100000000 / (spi0_cdiv + 1);
+
+	if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+		mrst_spi_paddr = MRST_REGBASE_SPI1;
+
+	pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+						mrst_spi_paddr);
+
+	/* Disable SPI controller */
+	dw_writel(pspi, ssienr, 0);
+
+	/* Set control param, 8 bits, transmit only mode */
+	ctrlr0 = dw_readl(pspi, ctrl0);
+
+	ctrlr0 &= 0xfcc0;
+	ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
+		      | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
+	dw_writel(pspi, ctrl0, ctrlr0);
+
+	/*
+	 * Change the spi0 clk to comply with 115200 bps, use 100000 to
+	 * calculate the clk dividor to make the clock a little slower
+	 * than real baud rate.
+	 */
+	dw_writel(pspi, baudr, freq/100000);
+
+	/* Disable all INT for early phase */
+	dw_writel(pspi, imr, 0x0);
+
+	/* Set the cs to spi-uart */
+	dw_writel(pspi, ser, 0x2);
+
+	/* Enable the HW, the last step for HW init */
+	dw_writel(pspi, ssienr, 0x1);
+
+	/* Set the default configuration */
+	max3110_write_config();
+
+	/* Register the kmsg dumper */
+	if (!dumper_registered) {
+		dw_dumper.dump = dw_kmsg_dump;
+		kmsg_dump_register(&dw_dumper);
+		dumper_registered = 1;
+	}
+}
+
+/* Slave select should be called in the read/write function */
+static void early_mrst_spi_putc(char c)
+{
+	unsigned int timeout;
+	u32 sr;
+
+	timeout = MRST_SPI_TIMEOUT;
+	/* Early putc needs to make sure the TX FIFO is not full */
+	while (--timeout) {
+		sr = dw_readl(pspi, sr);
+		if (!(sr & SR_TF_NOT_FULL))
+			cpu_relax();
+		else
+			break;
+	}
+
+	if (!timeout)
+		pr_warning("MRST earlycon: timed out\n");
+	else
+		max3110_write_data(c);
+}
+
+/* Early SPI only uses polling mode */
+static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+{
+	int i;
+
+	for (i = 0; i < n && *str; i++) {
+		if (*str == '\n')
+			early_mrst_spi_putc('\r');
+		early_mrst_spi_putc(*str);
+		str++;
+	}
+}
+
+struct console early_mrst_console = {
+	.name =		"earlymrst",
+	.write =	early_mrst_spi_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+
+/*
+ * Following is the early console based on Medfield HSU (High
+ * Speed UART) device.
+ */
+#define HSU_PORT2_PADDR		0xffa28180
+
+static void __iomem *phsu;
+
+void hsu_early_console_init(void)
+{
+	u8 lcr;
+
+	phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+							HSU_PORT2_PADDR);
+
+	/* Disable FIFO */
+	writeb(0x0, phsu + UART_FCR);
+
+	/* Set to default 115200 bps, 8n1 */
+	lcr = readb(phsu + UART_LCR);
+	writeb((0x80 | lcr), phsu + UART_LCR);
+	writeb(0x18, phsu + UART_DLL);
+	writeb(lcr,  phsu + UART_LCR);
+	writel(0x3600, phsu + UART_MUL*4);
+
+	writeb(0x8, phsu + UART_MCR);
+	writeb(0x7, phsu + UART_FCR);
+	writeb(0x3, phsu + UART_LCR);
+
+	/* Clear IRQ status */
+	readb(phsu + UART_LSR);
+	readb(phsu + UART_RX);
+	readb(phsu + UART_IIR);
+	readb(phsu + UART_MSR);
+
+	/* Enable FIFO */
+	writeb(0x7, phsu + UART_FCR);
+}
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+static void early_hsu_putc(char ch)
+{
+	unsigned int timeout = 10000; /* 10ms */
+	u8 status;
+
+	while (--timeout) {
+		status = readb(phsu + UART_LSR);
+		if (status & BOTH_EMPTY)
+			break;
+		udelay(1);
+	}
+
+	/* Only write the char when there was no timeout */
+	if (timeout)
+		writeb(ch, phsu + UART_TX);
+}
+
+static void early_hsu_write(struct console *con, const char *str, unsigned n)
+{
+	int i;
+
+	for (i = 0; i < n && *str; i++) {
+		if (*str == '\n')
+			early_hsu_putc('\r');
+		early_hsu_putc(*str);
+		str++;
+	}
+}
+
+struct console early_hsu_console = {
+	.name =		"earlyhsu",
+	.write =	early_hsu_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2..9fb188d7bc7 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -115,8 +115,7 @@
 
  /* unfortunately push/pop can't be no-op */
 .macro PUSH_GS
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
 .endm
 .macro POP_GS pop=0
 	addl $(4 + \pop), %esp
@@ -140,14 +139,12 @@
 #else	/* CONFIG_X86_32_LAZY_GS */
 
 .macro PUSH_GS
-	pushl %gs
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %gs
 	/*CFI_REL_OFFSET gs, 0*/
 .endm
 
 .macro POP_GS pop=0
-98:	popl %gs
-	CFI_ADJUST_CFA_OFFSET -4
+98:	popl_cfi %gs
 	/*CFI_RESTORE gs*/
   .if \pop <> 0
 	add $\pop, %esp
@@ -195,35 +192,25 @@
 .macro SAVE_ALL
 	cld
 	PUSH_GS
-	pushl %fs
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %fs
 	/*CFI_REL_OFFSET fs, 0;*/
-	pushl %es
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %es
 	/*CFI_REL_OFFSET es, 0;*/
-	pushl %ds
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ds
 	/*CFI_REL_OFFSET ds, 0;*/
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	CFI_REL_OFFSET eax, 0
-	pushl %ebp
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebp
 	CFI_REL_OFFSET ebp, 0
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx, 0
-	pushl %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl $(__USER_DS), %edx
 	movl %edx, %ds
@@ -234,39 +221,29 @@
 .endm
 
 .macro RESTORE_INT_REGS
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 	CFI_RESTORE ecx
-	popl %edx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edx
 	CFI_RESTORE edx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ebp
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebp
 	CFI_RESTORE ebp
-	popl %eax
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %eax
 	CFI_RESTORE eax
 .endm
 
 .macro RESTORE_REGS pop=0
 	RESTORE_INT_REGS
-1:	popl %ds
-	CFI_ADJUST_CFA_OFFSET -4
+1:	popl_cfi %ds
 	/*CFI_RESTORE ds;*/
-2:	popl %es
-	CFI_ADJUST_CFA_OFFSET -4
+2:	popl_cfi %es
 	/*CFI_RESTORE es;*/
-3:	popl %fs
-	CFI_ADJUST_CFA_OFFSET -4
+3:	popl_cfi %fs
 	/*CFI_RESTORE fs;*/
 	POP_GS \pop
 .pushsection .fixup, "ax"
@@ -320,16 +297,12 @@
 
 ENTRY(ret_from_fork)
 	CFI_STARTPROC
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	call schedule_tail
 	GET_THREAD_INFO(%ebp)
-	popl %eax
-	CFI_ADJUST_CFA_OFFSET -4
-	pushl $0x0202			# Reset kernel eflags
-	CFI_ADJUST_CFA_OFFSET 4
-	popfl
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %eax
+	pushl_cfi $0x0202		# Reset kernel eflags
+	popfl_cfi
 	jmp syscall_exit
 	CFI_ENDPROC
 END(ret_from_fork)
@@ -409,29 +382,23 @@ sysenter_past_esp:
 	 * enough kernel state to call TRACE_IRQS_OFF can be called - but
 	 * we immediately enable interrupts at that point anyway.
 	 */
-	pushl $(__USER_DS)
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $(__USER_DS)
 	/*CFI_REL_OFFSET ss, 0*/
-	pushl %ebp
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebp
 	CFI_REL_OFFSET esp, 0
-	pushfl
+	pushfl_cfi
 	orl $X86_EFLAGS_IF, (%esp)
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $(__USER_CS)
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $(__USER_CS)
 	/*CFI_REL_OFFSET cs, 0*/
 	/*
 	 * Push current_thread_info()->sysenter_return to the stack.
 	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
 	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
 	 */
-	pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	SAVE_ALL
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
@@ -486,8 +453,7 @@ sysenter_audit:
 	movl %eax,%edx			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
 	call audit_syscall_entry
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	movl PT_EAX(%esp),%eax		/* reload syscall number */
 	jmp sysenter_do_call
 
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target)
 	# system call handler stub
 ENTRY(system_call)
 	RING0_INT_FRAME			# can't unwind into user space anyway
-	pushl %eax			# save orig_eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax			# save orig_eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
@@ -566,7 +531,6 @@ restore_all_notrace:
 	je ldt_ss			# returning to user-space with LDT SS
 restore_nocheck:
 	RESTORE_REGS 4			# skip orig_eax/error_code
-	CFI_ADJUST_CFA_OFFSET -4
 irq_return:
 	INTERRUPT_RETURN
 .section .fixup,"ax"
@@ -619,10 +583,8 @@ ldt_ss:
 	shr $16, %edx
 	mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
 	mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
-	pushl $__ESPFIX_SS
-	CFI_ADJUST_CFA_OFFSET 4
-	push %eax			/* new kernel esp */
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $__ESPFIX_SS
+	pushl_cfi %eax			/* new kernel esp */
 	/* Disable interrupts, but do not irqtrace this section: we
 	 * will soon execute iret and the tracer was already set to
 	 * the irqstate after the iret */
@@ -666,11 +628,9 @@ work_notifysig:				# deal with pending signals and
 
 	ALIGN
 work_notifysig_v86:
-	pushl %ecx			# save ti_flags for do_notify_resume
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx			# save ti_flags for do_notify_resume
 	call save_v86_state		# %eax contains pt_regs pointer
-	popl %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 	movl %eax, %esp
 #else
 	movl %esp, %eax
@@ -750,14 +710,18 @@ ptregs_##name: \
 #define PTREGSCALL3(name) \
 	ALIGN; \
 ptregs_##name: \
+	CFI_STARTPROC; \
 	leal 4(%esp),%eax; \
-	pushl %eax; \
+	pushl_cfi %eax; \
 	movl PT_EDX(%eax),%ecx; \
 	movl PT_ECX(%eax),%edx; \
 	movl PT_EBX(%eax),%eax; \
 	call sys_##name; \
 	addl $4,%esp; \
-	ret
+	CFI_ADJUST_CFA_OFFSET -4; \
+	ret; \
+	CFI_ENDPROC; \
+ENDPROC(ptregs_##name)
 
 PTREGSCALL1(iopl)
 PTREGSCALL0(fork)
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old)
 /* Clone is an oddball.  The 4th arg is in %edi */
 	ALIGN;
 ptregs_clone:
+	CFI_STARTPROC
 	leal 4(%esp),%eax
-	pushl %eax
-	pushl PT_EDI(%eax)
+	pushl_cfi %eax
+	pushl_cfi PT_EDI(%eax)
 	movl PT_EDX(%eax),%ecx
 	movl PT_ECX(%eax),%edx
 	movl PT_EBX(%eax),%eax
 	call sys_clone
 	addl $8,%esp
+	CFI_ADJUST_CFA_OFFSET -8
 	ret
+	CFI_ENDPROC
+ENDPROC(ptregs_clone)
 
 .macro FIXUP_ESPFIX_STACK
 /*
@@ -795,10 +763,8 @@ ptregs_clone:
 	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
 	shl $16, %eax
 	addl %esp, %eax			/* the adjusted stack pointer */
-	pushl $__KERNEL_DS
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $__KERNEL_DS
+	pushl_cfi %eax
 	lss (%esp), %esp		/* switch to the normal stack segment */
 	CFI_ADJUST_CFA_OFFSET -8
 .endm
@@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .if vector <> FIRST_EXTERNAL_VECTOR
 	CFI_ADJUST_CFA_OFFSET -4
       .endif
-1:	pushl $(~vector+0x80)	/* Note: always in signed byte range */
-	CFI_ADJUST_CFA_OFFSET 4
+1:	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
       .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
 	jmp 2f
       .endif
@@ -876,8 +841,7 @@ ENDPROC(common_interrupt)
 #define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
 	RING0_INT_FRAME;		\
-	pushl $~(nr);			\
-	CFI_ADJUST_CFA_OFFSET 4;	\
+	pushl_cfi $~(nr);		\
 	SAVE_ALL;			\
 	TRACE_IRQS_OFF			\
 	movl %esp,%eax;			\
@@ -893,21 +857,18 @@ ENDPROC(name)
 
 ENTRY(coprocessor_error)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_coprocessor_error
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
+	pushl_cfi $do_coprocessor_error
 	jmp error_code
 	CFI_ENDPROC
 END(coprocessor_error)
 
 ENTRY(simd_coprocessor_error)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
 #ifdef CONFIG_X86_INVD_BUG
 	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661:	pushl $do_general_protection
+661:	pushl_cfi $do_general_protection
 662:
 .section .altinstructions,"a"
 	.balign 4
@@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error)
 664:
 .previous
 #else
-	pushl $do_simd_coprocessor_error
+	pushl_cfi $do_simd_coprocessor_error
 #endif
-	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
 END(simd_coprocessor_error)
 
 ENTRY(device_not_available)
 	RING0_INT_FRAME
-	pushl $-1			# mark this as an int
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_device_not_available
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $-1			# mark this as an int
+	pushl_cfi $do_device_not_available
 	jmp error_code
 	CFI_ENDPROC
 END(device_not_available)
@@ -956,82 +914,68 @@ END(native_irq_enable_sysexit)
 
 ENTRY(overflow)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_overflow
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
+	pushl_cfi $do_overflow
 	jmp error_code
 	CFI_ENDPROC
 END(overflow)
 
 ENTRY(bounds)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_bounds
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
+	pushl_cfi $do_bounds
 	jmp error_code
 	CFI_ENDPROC
 END(bounds)
 
 ENTRY(invalid_op)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_invalid_op
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
+	pushl_cfi $do_invalid_op
 	jmp error_code
 	CFI_ENDPROC
 END(invalid_op)
 
 ENTRY(coprocessor_segment_overrun)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_coprocessor_segment_overrun
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
+	pushl_cfi $do_coprocessor_segment_overrun
 	jmp error_code
 	CFI_ENDPROC
 END(coprocessor_segment_overrun)
 
 ENTRY(invalid_TSS)
 	RING0_EC_FRAME
-	pushl $do_invalid_TSS
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_invalid_TSS
 	jmp error_code
 	CFI_ENDPROC
 END(invalid_TSS)
 
 ENTRY(segment_not_present)
 	RING0_EC_FRAME
-	pushl $do_segment_not_present
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_segment_not_present
 	jmp error_code
 	CFI_ENDPROC
 END(segment_not_present)
 
 ENTRY(stack_segment)
 	RING0_EC_FRAME
-	pushl $do_stack_segment
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_stack_segment
 	jmp error_code
 	CFI_ENDPROC
 END(stack_segment)
 
 ENTRY(alignment_check)
 	RING0_EC_FRAME
-	pushl $do_alignment_check
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_alignment_check
 	jmp error_code
 	CFI_ENDPROC
 END(alignment_check)
 
 ENTRY(divide_error)
 	RING0_INT_FRAME
-	pushl $0			# no error code
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_divide_error
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0			# no error code
+	pushl_cfi $do_divide_error
 	jmp error_code
 	CFI_ENDPROC
 END(divide_error)
@@ -1039,10 +983,8 @@ END(divide_error)
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl machine_check_vector
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
+	pushl_cfi machine_check_vector
 	jmp error_code
 	CFI_ENDPROC
 END(machine_check)
@@ -1050,10 +992,8 @@ END(machine_check)
 
 ENTRY(spurious_interrupt_bug)
 	RING0_INT_FRAME
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_spurious_interrupt_bug
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
+	pushl_cfi $do_spurious_interrupt_bug
 	jmp error_code
 	CFI_ENDPROC
 END(spurious_interrupt_bug)
@@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target)
 
 ENTRY(xen_hypervisor_callback)
 	CFI_STARTPROC
-	pushl $0
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $0
 	SAVE_ALL
 	TRACE_IRQS_OFF
 
@@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback)
 # We distinguish between categories by maintaining a status value in EAX.
 ENTRY(xen_failsafe_callback)
 	CFI_STARTPROC
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	movl $1,%eax
 1:	mov 4(%esp),%ds
 2:	mov 8(%esp),%es
 3:	mov 12(%esp),%fs
 4:	mov 16(%esp),%gs
 	testl %eax,%eax
-	popl %eax
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %eax
 	lea 16(%esp),%esp
 	CFI_ADJUST_CFA_OFFSET -16
 	jz 5f
 	addl $16,%esp
 	jmp iret_exc		# EAX != 0 => Category 2 (Bad IRET)
-5:	pushl $0		# EAX == 0 => Category 1 (Bad segment)
-	CFI_ADJUST_CFA_OFFSET 4
+5:	pushl_cfi $0		# EAX == 0 => Category 1 (Bad segment)
 	SAVE_ALL
 	jmp ret_from_exception
 	CFI_ENDPROC
@@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table)
 
 ENTRY(page_fault)
 	RING0_EC_FRAME
-	pushl $do_page_fault
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_page_fault
 	ALIGN
 error_code:
 	/* the function address is in %gs's slot on the stack */
-	pushl %fs
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %fs
 	/*CFI_REL_OFFSET fs, 0*/
-	pushl %es
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %es
 	/*CFI_REL_OFFSET es, 0*/
-	pushl %ds
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ds
 	/*CFI_REL_OFFSET ds, 0*/
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	CFI_REL_OFFSET eax, 0
-	pushl %ebp
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebp
 	CFI_REL_OFFSET ebp, 0
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx, 0
-	pushl %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	cld
 	movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1287,9 @@ END(page_fault)
 	movl TSS_sysenter_sp0 + \offset(%esp), %esp
 	CFI_DEF_CFA esp, 0
 	CFI_UNDEFINED eip
-	pushfl
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $__KERNEL_CS
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $sysenter_past_esp
-	CFI_ADJUST_CFA_OFFSET 4
+	pushfl_cfi
+	pushl_cfi $__KERNEL_CS
+	pushl_cfi $sysenter_past_esp
 	CFI_REL_OFFSET eip, 0
 .endm
 
@@ -1377,8 +1299,7 @@ ENTRY(debug)
 	jne debug_stack_correct
 	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
 debug_stack_correct:
-	pushl $-1			# mark this as an int
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $-1			# mark this as an int
 	SAVE_ALL
 	TRACE_IRQS_OFF
 	xorl %edx,%edx			# error code 0
@@ -1398,32 +1319,27 @@ END(debug)
  */
 ENTRY(nmi)
 	RING0_INT_FRAME
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	movl %ss, %eax
 	cmpw $__ESPFIX_SS, %ax
-	popl %eax
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %eax
 	je nmi_espfix_stack
 	cmpl $ia32_sysenter_target,(%esp)
 	je nmi_stack_fixup
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	movl %esp,%eax
 	/* Do not access memory above the end of our stack page,
 	 * it might not exist.
 	 */
 	andl $(THREAD_SIZE-1),%eax
 	cmpl $(THREAD_SIZE-20),%eax
-	popl %eax
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %eax
 	jae nmi_stack_correct
 	cmpl $ia32_sysenter_target,12(%esp)
 	je nmi_debug_stack_check
 nmi_stack_correct:
 	/* We have a RING0_INT_FRAME here */
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	SAVE_ALL
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
@@ -1452,18 +1368,14 @@ nmi_espfix_stack:
 	 *
 	 * create the pointer to lss back
 	 */
-	pushl %ss
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl %esp
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ss
+	pushl_cfi %esp
 	addl $4, (%esp)
 	/* copy the iret frame of 12 bytes */
 	.rept 3
-	pushl 16(%esp)
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi 16(%esp)
 	.endr
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %eax
 	SAVE_ALL
 	FIXUP_ESPFIX_STACK		# %eax == %esp
 	xorl %edx,%edx			# zero error code
@@ -1477,8 +1389,7 @@ END(nmi)
 
 ENTRY(int3)
 	RING0_INT_FRAME
-	pushl $-1			# mark this as an int
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $-1			# mark this as an int
 	SAVE_ALL
 	TRACE_IRQS_OFF
 	xorl %edx,%edx		# zero error code
@@ -1490,8 +1401,7 @@ END(int3)
 
 ENTRY(general_protection)
 	RING0_EC_FRAME
-	pushl $do_general_protection
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_general_protection
 	jmp error_code
 	CFI_ENDPROC
 END(general_protection)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 17be5ec7cbb..a7ae7fd1010 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64)
 	.macro FAKE_STACK_FRAME child_rip
 	/* push in order ss, rsp, eflags, cs, rip */
 	xorl %eax, %eax
-	pushq $__KERNEL_DS /* ss */
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi $__KERNEL_DS /* ss */
 	/*CFI_REL_OFFSET	ss,0*/
-	pushq %rax /* rsp */
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi %rax /* rsp */
 	CFI_REL_OFFSET	rsp,0
-	pushq $X86_EFLAGS_IF /* eflags - interrupts on */
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
 	/*CFI_REL_OFFSET	rflags,0*/
-	pushq $__KERNEL_CS /* cs */
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi $__KERNEL_CS /* cs */
 	/*CFI_REL_OFFSET	cs,0*/
-	pushq \child_rip /* rip */
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi \child_rip /* rip */
 	CFI_REL_OFFSET	rip,0
-	pushq	%rax /* orig rax */
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi %rax /* orig rax */
 	.endm
 
 	.macro UNFAKE_STACK_FRAME
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork)
 
 	LOCK ; btr $TIF_FORK,TI_flags(%r8)
 
-	push kernel_eflags(%rip)
-	CFI_ADJUST_CFA_OFFSET 8
-	popf					# reset kernel eflags
-	CFI_ADJUST_CFA_OFFSET -8
+	pushq_cfi kernel_eflags(%rip)
+	popfq_cfi				# reset kernel eflags
 
 	call schedule_tail			# rdi: 'prev' task parameter
 
@@ -521,11 +513,9 @@ sysret_careful:
 	jnc sysret_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq %rdi
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rdi
 	call schedule
-	popq  %rdi
-	CFI_ADJUST_CFA_OFFSET -8
+	popq_cfi %rdi
 	jmp sysret_check
 
 	/* Handle a signal */
@@ -634,11 +624,9 @@ int_careful:
 	jnc  int_very_careful
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq %rdi
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rdi
 	call schedule
-	popq %rdi
-	CFI_ADJUST_CFA_OFFSET -8
+	popq_cfi %rdi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
@@ -652,12 +640,10 @@ int_check_syscall_exit_work:
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
-	pushq %rdi
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rdi
 	leaq 8(%rsp),%rdi	# &ptregs -> arg1
 	call syscall_trace_leave
-	popq %rdi
-	CFI_ADJUST_CFA_OFFSET -8
+	popq_cfi %rdi
 	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
 	jmp int_restore_rest
 
@@ -714,9 +700,8 @@ END(ptregscall_common)
 
 ENTRY(stub_execve)
 	CFI_STARTPROC
-	popq %r11
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_REGISTER rip, r11
+	addq $8, %rsp
+	PARTIAL_FRAME 0
 	SAVE_REST
 	FIXUP_TOP_OF_STACK %r11
 	movq %rsp, %rcx
@@ -735,7 +720,7 @@ END(stub_execve)
 ENTRY(stub_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
-	CFI_ADJUST_CFA_OFFSET	-8
+	PARTIAL_FRAME 0
 	SAVE_REST
 	movq %rsp,%rdi
 	FIXUP_TOP_OF_STACK %r11
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .if vector <> FIRST_EXTERNAL_VECTOR
 	CFI_ADJUST_CFA_OFFSET -8
       .endif
-1:	pushq $(~vector+0x80)	/* Note: always in signed byte range */
-	CFI_ADJUST_CFA_OFFSET 8
+1:	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
       .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
 	jmp 2f
       .endif
@@ -796,8 +780,8 @@ END(interrupt)
 
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
-	subq $10*8, %rsp
-	CFI_ADJUST_CFA_OFFSET 10*8
+	subq $ORIG_RAX-ARGOFFSET+8, %rsp
+	CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
 	call save_args
 	PARTIAL_FRAME 0
 	call \func
@@ -822,6 +806,7 @@ ret_from_intr:
 	TRACE_IRQS_OFF
 	decl PER_CPU_VAR(irq_count)
 	leaveq
+	CFI_RESTORE		rbp
 	CFI_DEF_CFA_REGISTER	rsp
 	CFI_ADJUST_CFA_OFFSET	-8
 exit_intr:
@@ -903,11 +888,9 @@ retint_careful:
 	jnc   retint_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq %rdi
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi %rdi
 	call  schedule
-	popq %rdi
-	CFI_ADJUST_CFA_OFFSET	-8
+	popq_cfi %rdi
 	GET_THREAD_INFO(%rcx)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -956,8 +939,7 @@ END(common_interrupt)
 .macro apicinterrupt num sym do_sym
 ENTRY(\sym)
 	INTR_FRAME
-	pushq $~(\num)
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi $~(\num)
 	interrupt \do_sym
 	jmp ret_from_intr
 	CFI_ENDPROC
@@ -1023,9 +1005,9 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
-	perf_pending_interrupt smp_perf_pending_interrupt
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+	irq_work_interrupt smp_irq_work_interrupt
 #endif
 
 /*
@@ -1036,8 +1018,8 @@ ENTRY(\sym)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
-	subq $15*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 15*8
+	subq $ORIG_RAX-R15, %rsp
+	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 	call error_entry
 	DEFAULT_FRAME 0
 	movq %rsp,%rdi		/* pt_regs pointer */
@@ -1052,9 +1034,9 @@ END(\sym)
 ENTRY(\sym)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq $-1		/* ORIG_RAX: no syscall to restart */
-	CFI_ADJUST_CFA_OFFSET 8
-	subq $15*8, %rsp
+	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
+	subq $ORIG_RAX-R15, %rsp
+	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 	call save_paranoid
 	TRACE_IRQS_OFF
 	movq %rsp,%rdi		/* pt_regs pointer */
@@ -1070,9 +1052,9 @@ END(\sym)
 ENTRY(\sym)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq $-1		/* ORIG_RAX: no syscall to restart */
-	CFI_ADJUST_CFA_OFFSET 8
-	subq $15*8, %rsp
+	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
+	subq $ORIG_RAX-R15, %rsp
+	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 	call save_paranoid
 	TRACE_IRQS_OFF
 	movq %rsp,%rdi		/* pt_regs pointer */
@@ -1089,8 +1071,8 @@ END(\sym)
 ENTRY(\sym)
 	XCPT_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	subq $15*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 15*8
+	subq $ORIG_RAX-R15, %rsp
+	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 	call error_entry
 	DEFAULT_FRAME 0
 	movq %rsp,%rdi			/* pt_regs pointer */
@@ -1107,8 +1089,8 @@ END(\sym)
 ENTRY(\sym)
 	XCPT_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	subq $15*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 15*8
+	subq $ORIG_RAX-R15, %rsp
+	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 	call save_paranoid
 	DEFAULT_FRAME 0
 	TRACE_IRQS_OFF
@@ -1139,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
 	/* edi:  new selector */
 ENTRY(native_load_gs_index)
 	CFI_STARTPROC
-	pushf
-	CFI_ADJUST_CFA_OFFSET 8
+	pushfq_cfi
 	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
 	SWAPGS
 gs_change:
 	movl %edi,%gs
 2:	mfence		/* workaround */
 	SWAPGS
-	popf
-	CFI_ADJUST_CFA_OFFSET -8
+	popfq_cfi
 	ret
 	CFI_ENDPROC
 END(native_load_gs_index)
@@ -1215,8 +1195,7 @@ END(kernel_execve)
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(call_softirq)
 	CFI_STARTPROC
-	push %rbp
-	CFI_ADJUST_CFA_OFFSET	8
+	pushq_cfi %rbp
 	CFI_REL_OFFSET rbp,0
 	mov  %rsp,%rbp
 	CFI_DEF_CFA_REGISTER rbp
@@ -1225,6 +1204,7 @@ ENTRY(call_softirq)
 	push  %rbp			# backlink for old unwinder
 	call __do_softirq
 	leaveq
+	CFI_RESTORE		rbp
 	CFI_DEF_CFA_REGISTER	rsp
 	CFI_ADJUST_CFA_OFFSET   -8
 	decl PER_CPU_VAR(irq_count)
@@ -1368,7 +1348,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
 
 	/* ebx:	no swapgs flag */
 ENTRY(paranoid_exit)
-	INTR_FRAME
+	DEFAULT_FRAME
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl %ebx,%ebx				/* swapgs needed? */
@@ -1445,7 +1425,6 @@ error_swapgs:
 error_sti:
 	TRACE_IRQS_OFF
 	ret
-	CFI_ENDPROC
 
 /*
  * There are two places in the kernel that can potentially fault with
@@ -1470,6 +1449,7 @@ bstep_iret:
 	/* Fix truncated RIP */
 	movq %rcx,RIP+8(%rsp)
 	jmp error_swapgs
+	CFI_ENDPROC
 END(error_entry)
 
 
@@ -1498,8 +1478,8 @@ ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq_cfi $-1
-	subq $15*8, %rsp
-	CFI_ADJUST_CFA_OFFSET 15*8
+	subq $ORIG_RAX-R15, %rsp
+	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 	call save_paranoid
 	DEFAULT_FRAME 0
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54e..3afb33f14d2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
 	return mod_code_status;
 }
 
-
-
-
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
-
 static unsigned char *ftrace_nop_replace(void)
 {
-	return ftrace_nop;
+	return ideal_nop5;
 }
 
 static int
@@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 
 int __init ftrace_dyn_arch_init(void *data)
 {
-	extern const unsigned char ftrace_test_p6nop[];
-	extern const unsigned char ftrace_test_nop5[];
-	extern const unsigned char ftrace_test_jmp[];
-	int faulted = 0;
-
-	/*
-	 * There is no good nop for all x86 archs.
-	 * We will default to using the P6_NOP5, but first we
-	 * will test to make sure that the nop will actually
-	 * work on this CPU. If it faults, we will then
-	 * go to a lesser efficient 5 byte nop. If that fails
-	 * we then just use a jmp as our nop. This isn't the most
-	 * efficient nop, but we can not use a multi part nop
-	 * since we would then risk being preempted in the middle
-	 * of that nop, and if we enabled tracing then, it might
-	 * cause a system crash.
-	 *
-	 * TODO: check the cpuid to determine the best nop.
-	 */
-	asm volatile (
-		"ftrace_test_jmp:"
-		"jmp ftrace_test_p6nop\n"
-		"nop\n"
-		"nop\n"
-		"nop\n"  /* 2 byte jmp + 3 bytes */
-		"ftrace_test_p6nop:"
-		P6_NOP5
-		"jmp 1f\n"
-		"ftrace_test_nop5:"
-		".byte 0x66,0x66,0x66,0x66,0x90\n"
-		"1:"
-		".section .fixup, \"ax\"\n"
-		"2:	movl $1, %0\n"
-		"	jmp ftrace_test_nop5\n"
-		"3:	movl $2, %0\n"
-		"	jmp 1b\n"
-		".previous\n"
-		_ASM_EXTABLE(ftrace_test_p6nop, 2b)
-		_ASM_EXTABLE(ftrace_test_nop5, 3b)
-		: "=r"(faulted) : "0" (faulted));
-
-	switch (faulted) {
-	case 0:
-		pr_info("converting mcount calls to 0f 1f 44 00 00\n");
-		memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
-		break;
-	case 1:
-		pr_info("converting mcount calls to 66 66 66 66 90\n");
-		memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
-		break;
-	case 2:
-		pr_info("converting mcount calls to jmp . + 5\n");
-		memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
-		break;
-	}
-
 	/* The return code is retured via data */
 	*(unsigned long *)data = 0;
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0..58bb239a2fd 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
 	 */
 
 	if (!HAVE_HWFP) {
+		/*
+		 * Disable xsave as we do not support it if i387
+		 * emulation is enabled.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
 		xstate_size = sizeof(struct i387_soft_struct);
 		return;
 	}
 
 	if (cpu_has_fxsr)
 		xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
 	else
 		xstate_size = sizeof(struct i387_fsave_struct);
-#endif
 }
 
-#ifdef CONFIG_X86_64
 /*
  * Called at bootup to set up the initial FPU state that is later cloned
  * into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
 
 void __cpuinit fpu_init(void)
 {
-	unsigned long oldcr0 = read_cr0();
-
-	set_in_cr4(X86_CR4_OSFXSR);
-	set_in_cr4(X86_CR4_OSXMMEXCPT);
+	unsigned long cr0;
+	unsigned long cr4_mask = 0;
 
-	write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
+	if (cpu_has_fxsr)
+		cr4_mask |= X86_CR4_OSFXSR;
+	if (cpu_has_xmm)
+		cr4_mask |= X86_CR4_OSXMMEXCPT;
+	if (cr4_mask)
+		set_in_cr4(cr4_mask);
+
+	cr0 = read_cr0();
+	cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+	if (!HAVE_HWFP)
+		cr0 |= X86_CR0_EM;
+	write_cr0(cr0);
 
 	if (!smp_processor_id())
 		init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
 	clear_used_math();
 }
 
-#else	/* CONFIG_X86_64 */
-
-void __cpuinit fpu_init(void)
-{
-	if (!smp_processor_id())
-		init_thread_xstate();
-}
-
-#endif	/* CONFIG_X86_32 */
-
 void fpu_finit(struct fpu *fpu)
 {
-#ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
 		finit_soft_fpu(&fpu->state->soft);
 		return;
 	}
-#endif
 
 	if (cpu_has_fxsr) {
 		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
 #ifdef CONFIG_X86_64
 	env->fip = fxsave->rip;
 	env->foo = fxsave->rdp;
+	/*
+	 * should be actually ds/cs at fpu exception time, but
+	 * that information is not available in 64bit mode.
+	 */
+	env->fcs = task_pt_regs(tsk)->cs;
 	if (tsk == current) {
-		/*
-		 * should be actually ds/cs at fpu exception time, but
-		 * that information is not available in 64bit mode.
-		 */
-		asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
-		asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
+		savesegment(ds, env->fos);
 	} else {
-		struct pt_regs *regs = task_pt_regs(tsk);
-
-		env->fos = 0xffff0000 | tsk->thread.ds;
-		env->fcs = regs->cs;
+		env->fos = tsk->thread.ds;
 	}
+	env->fos |= 0xffff0000;
 #else
 	env->fip = fxsave->fip;
 	env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18..44edb03fc9e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance monitoring interrupts\n");
-	seq_printf(p, "%*s: ", prec, "PND");
+	seq_printf(p, "%*s: ", prec, "IWI");
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
-	seq_printf(p, "  Performance pending work\n");
+		seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+	seq_printf(p, "  IRQ work interrupts\n");
 #endif
 	if (x86_platform_ipi_callback) {
 		seq_printf(p, "%*s: ", prec, "PLT");
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
 	sum += irq_stats(cpu)->apic_perf_irqs;
-	sum += irq_stats(cpu)->apic_pending_irqs;
+	sum += irq_stats(cpu)->apic_irq_work_irqs;
 #endif
 	if (x86_platform_ipi_callback)
 		sum += irq_stats(cpu)->x86_platform_ipis;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 00000000000..ca8f703a1e7
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_irq_work_irqs);
+	irq_work_run();
+	irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (!cpu_has_apic)
+		return;
+
+	apic->send_IPI_self(IRQ_WORK_VECTOR);
+	apic_wait_icr_idle();
+#endif
+}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 990ae7cfc57..713969b9266 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -224,9 +224,9 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
-	/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
-	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+	/* IRQ work interrupts: */
+# ifdef CONFIG_IRQ_WORK
+	alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
 # endif
 
 #endif
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 00000000000..961b6b30ba9
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,50 @@
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+union jump_code_union {
+	char code[JUMP_LABEL_NOP_SIZE];
+	struct {
+		char jump;
+		int offset;
+	} __attribute__((packed));
+};
+
+void arch_jump_label_transform(struct jump_entry *entry,
+			       enum jump_label_type type)
+{
+	union jump_code_union code;
+
+	if (type == JUMP_LABEL_ENABLE) {
+		code.jump = 0xe9;
+		code.offset = entry->target -
+				(entry->code + JUMP_LABEL_NOP_SIZE);
+	} else
+		memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+	get_online_cpus();
+	mutex_lock(&text_mutex);
+	text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+	mutex_unlock(&text_mutex);
+	put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+	text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+}
+
+#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 770ebfb349e..1cbd54c0df9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 	return 0;
 }
 
-/* Dummy buffers for kallsyms_lookup */
-static char __dummy_buf[KSYM_NAME_LEN];
-
 /* Check if paddr is at an instruction boundary */
 static int __kprobes can_probe(unsigned long paddr)
 {
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
 
-	if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+	if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
 		return 0;
 
 	/* Decode instructions */
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
 	*(unsigned long *)addr = val;
 }
 
-void __kprobes kprobes_optinsn_template_holder(void)
+static void __used __kprobes kprobes_optinsn_template_holder(void)
 {
 	asm volatile (
 			".global optprobe_template_entry\n"
@@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
 	}
 	/* Check whether the address range is reserved */
 	if (ftrace_text_reserved(src, src + len - 1) ||
-	    alternatives_text_reserved(src, src + len - 1))
+	    alternatives_text_reserved(src, src + len - 1) ||
+	    jump_label_text_reserved(src, src + len - 1))
 		return -EBUSY;
 
 	return len;
@@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr)
 	unsigned long addr, size = 0, offset = 0;
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
-	/* Dummy buffers for lookup_symbol_attrs */
-	static char __dummy_buf[KSYM_NAME_LEN];
 
 	/* Lookup symbol including addr */
-	if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
 		return 0;
 
 	/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c52918..b3ea9db39db 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
 		if (!page)
 			goto out;
 		pud = (pud_t *)page_address(page);
-		memset(pud, 0, PAGE_SIZE);
+		clear_page(pud);
 		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
 	}
 	pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
 		if (!page)
 			goto out;
 		pmd = (pmd_t *)page_address(page);
-		memset(pmd, 0, PAGE_SIZE);
+		clear_page(pmd);
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 	}
 	pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 1c355c55096..8f295609173 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr,
 		apply_paravirt(pseg, pseg + para->sh_size);
 	}
 
+	/* make jump label nops */
+	jump_label_apply_nops(me);
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c
new file mode 100644
index 00000000000..f5442c03abc
--- /dev/null
+++ b/arch/x86/kernel/olpc-xo1.c
@@ -0,0 +1,140 @@
+/*
+ * Support for features of the OLPC XO-1 laptop
+ *
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+
+#include <asm/io.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1"
+
+#define PMS_BAR		4
+#define ACPI_BAR	5
+
+/* PMC registers (PMS block) */
+#define PM_SCLK		0x10
+#define PM_IN_SLPCTL	0x20
+#define PM_WKXD		0x34
+#define PM_WKD		0x30
+#define PM_SSC		0x54
+
+/* PM registers (ACPI block) */
+#define PM1_CNT		0x08
+#define PM_GPE0_STS	0x18
+
+static unsigned long acpi_base;
+static unsigned long pms_base;
+
+static void xo1_power_off(void)
+{
+	printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
+
+	/* Enable all of these controls with 0 delay */
+	outl(0x40000000, pms_base + PM_SCLK);
+	outl(0x40000000, pms_base + PM_IN_SLPCTL);
+	outl(0x40000000, pms_base + PM_WKXD);
+	outl(0x40000000, pms_base + PM_WKD);
+
+	/* Clear status bits (possibly unnecessary) */
+	outl(0x0002ffff, pms_base  + PM_SSC);
+	outl(0xffffffff, acpi_base + PM_GPE0_STS);
+
+	/* Write SLP_EN bit to start the machinery */
+	outl(0x00002000, acpi_base + PM1_CNT);
+}
+
+/* Read the base addresses from the PCI BAR info */
+static int __devinit setup_bases(struct pci_dev *pdev)
+{
+	int r;
+
+	r = pci_enable_device_io(pdev);
+	if (r) {
+		dev_err(&pdev->dev, "can't enable device IO\n");
+		return r;
+	}
+
+	r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
+	if (r) {
+		dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
+		return r;
+	}
+
+	r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
+	if (r) {
+		dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
+		pci_release_region(pdev, ACPI_BAR);
+		return r;
+	}
+
+	acpi_base = pci_resource_start(pdev, ACPI_BAR);
+	pms_base = pci_resource_start(pdev, PMS_BAR);
+
+	return 0;
+}
+
+static int __devinit olpc_xo1_probe(struct platform_device *pdev)
+{
+	struct pci_dev *pcidev;
+	int r;
+
+	pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
+				NULL);
+	if (!pdev)
+		return -ENODEV;
+
+	r = setup_bases(pcidev);
+	if (r)
+		return r;
+
+	pm_power_off = xo1_power_off;
+
+	printk(KERN_INFO "OLPC XO-1 support registered\n");
+	return 0;
+}
+
+static int __devexit olpc_xo1_remove(struct platform_device *pdev)
+{
+	pm_power_off = NULL;
+	return 0;
+}
+
+static struct platform_driver olpc_xo1_driver = {
+	.driver = {
+		.name = DRV_NAME,
+		.owner = THIS_MODULE,
+	},
+	.probe = olpc_xo1_probe,
+	.remove = __devexit_p(olpc_xo1_remove),
+};
+
+static int __init olpc_xo1_init(void)
+{
+	return platform_driver_register(&olpc_xo1_driver);
+}
+
+static void __exit olpc_xo1_exit(void)
+{
+	platform_driver_unregister(&olpc_xo1_driver);
+}
+
+MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:olpc-xo1");
+
+module_init(olpc_xo1_init);
+module_exit(olpc_xo1_exit);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 0e0cdde519b..edaf3fe8dc5 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/io.h>
 #include <linux/string.h>
+#include <linux/platform_device.h>
 
 #include <asm/geode.h>
 #include <asm/setup.h>
@@ -114,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
 	unsigned long flags;
 	int ret = -EIO;
 	int i;
+	int restarts = 0;
 
 	spin_lock_irqsave(&ec_lock, flags);
 
@@ -169,7 +171,9 @@ restart:
 			if (wait_on_obf(0x6c, 1)) {
 				printk(KERN_ERR "olpc-ec:  timeout waiting for"
 						" EC to provide data!\n");
-				goto restart;
+				if (restarts++ < 10)
+					goto restart;
+				goto err;
 			}
 			outbuf[i] = inb(0x68);
 			pr_devel("olpc-ec:  received 0x%x\n", outbuf[i]);
@@ -183,8 +187,21 @@ err:
 }
 EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 
-#ifdef CONFIG_OLPC_OPENFIRMWARE
-static void __init platform_detect(void)
+static bool __init check_ofw_architecture(void)
+{
+	size_t propsize;
+	char olpc_arch[5];
+	const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
+	void *res[] = { &propsize };
+
+	if (olpc_ofw("getprop", args, res)) {
+		printk(KERN_ERR "ofw: getprop call failed!\n");
+		return false;
+	}
+	return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
+}
+
+static u32 __init get_board_revision(void)
 {
 	size_t propsize;
 	__be32 rev;
@@ -193,45 +210,43 @@ static void __init platform_detect(void)
 
 	if (olpc_ofw("getprop", args, res) || propsize != 4) {
 		printk(KERN_ERR "ofw: getprop call failed!\n");
-		rev = cpu_to_be32(0);
+		return cpu_to_be32(0);
 	}
-	olpc_platform_info.boardrev = be32_to_cpu(rev);
+	return be32_to_cpu(rev);
 }
-#else
-static void __init platform_detect(void)
+
+static bool __init platform_detect(void)
 {
-	/* stopgap until OFW support is added to the kernel */
-	olpc_platform_info.boardrev = olpc_board(0xc2);
+	if (!check_ofw_architecture())
+		return false;
+	olpc_platform_info.flags |= OLPC_F_PRESENT;
+	olpc_platform_info.boardrev = get_board_revision();
+	return true;
 }
-#endif
 
-static int __init olpc_init(void)
+static int __init add_xo1_platform_devices(void)
 {
-	unsigned char *romsig;
+	struct platform_device *pdev;
 
-	/* The ioremap check is dangerous; limit what we run it on */
-	if (!is_geode() || cs5535_has_vsa2())
-		return 0;
+	pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	spin_lock_init(&ec_lock);
+	pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	romsig = ioremap(0xffffffc0, 16);
-	if (!romsig)
-		return 0;
+	return 0;
+}
 
-	if (strncmp(romsig, "CL1   Q", 7))
-		goto unmap;
-	if (strncmp(romsig+6, romsig+13, 3)) {
-		printk(KERN_INFO "OLPC BIOS signature looks invalid.  "
-				"Assuming not OLPC\n");
-		goto unmap;
-	}
+static int __init olpc_init(void)
+{
+	int r = 0;
 
-	printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig);
-	olpc_platform_info.flags |= OLPC_F_PRESENT;
+	if (!olpc_ofw_present() || !platform_detect())
+		return 0;
 
-	/* get the platform revision */
-	platform_detect();
+	spin_lock_init(&ec_lock);
 
 	/* assume B1 and above models always have a DCON */
 	if (olpc_board_at_least(olpc_board(0xb1)))
@@ -242,8 +257,10 @@ static int __init olpc_init(void)
 			(unsigned char *) &olpc_platform_info.ecver, 1);
 
 #ifdef CONFIG_PCI_OLPC
-	/* If the VSA exists let it emulate PCI, if not emulate in kernel */
-	if (!cs5535_has_vsa2())
+	/* If the VSA exists let it emulate PCI, if not emulate in kernel.
+	 * XO-1 only. */
+	if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
+			!cs5535_has_vsa2())
 		x86_init.pci.arch_init = pci_olpc_init;
 #endif
 
@@ -252,8 +269,12 @@ static int __init olpc_init(void)
 			olpc_platform_info.boardrev >> 4,
 			olpc_platform_info.ecver);
 
-unmap:
-	iounmap(romsig);
+	if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
+		r = add_xo1_platform_devices();
+		if (r)
+			return r;
+	}
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
index 3218aa71ab5..78732046437 100644
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
 }
 EXPORT_SYMBOL_GPL(__olpc_ofw);
 
+bool olpc_ofw_present(void)
+{
+	return olpc_ofw_cif != NULL;
+}
+EXPORT_SYMBOL_GPL(olpc_ofw_present);
+
 /* OFW cif _should_ be above this address */
 #define OFW_MIN 0xff000000
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c0..c5b250011fd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
 
 	.alloc_pte = paravirt_nop,
 	.alloc_pmd = paravirt_nop,
-	.alloc_pmd_clone = paravirt_nop,
 	.alloc_pud = paravirt_nop,
 	.release_pte = paravirt_nop,
 	.release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 0f7f130caa6..c562207b1b3 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,7 +39,7 @@
 #include <asm/cacheflush.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 #include <asm/x86_init.h>
 
 static unsigned long iommu_bus_base;	/* GART remapping area (physical) */
@@ -560,8 +560,11 @@ static void enable_gart_translations(void)
 {
 	int i;
 
-	for (i = 0; i < num_k8_northbridges; i++) {
-		struct pci_dev *dev = k8_northbridges[i];
+	if (!k8_northbridges.gart_supported)
+		return;
+
+	for (i = 0; i < k8_northbridges.num; i++) {
+		struct pci_dev *dev = k8_northbridges.nb_misc[i];
 
 		enable_gart_translation(dev, __pa(agp_gatt_table));
 	}
@@ -592,16 +595,19 @@ static void gart_fixup_northbridges(struct sys_device *dev)
 	if (!fix_up_north_bridges)
 		return;
 
+	if (!k8_northbridges.gart_supported)
+		return;
+
 	pr_info("PCI-DMA: Restoring GART aperture settings\n");
 
-	for (i = 0; i < num_k8_northbridges; i++) {
-		struct pci_dev *dev = k8_northbridges[i];
+	for (i = 0; i < k8_northbridges.num; i++) {
+		struct pci_dev *dev = k8_northbridges.nb_misc[i];
 
 		/*
 		 * Don't enable translations just yet.  That is the next
 		 * step.  Restore the pre-suspend aperture settings.
 		 */
-		pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+		gart_set_size_and_enable(dev, aperture_order);
 		pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
 	}
 }
@@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 	aper_size = aper_base = info->aper_size = 0;
 	dev = NULL;
-	for (i = 0; i < num_k8_northbridges; i++) {
-		dev = k8_northbridges[i];
+	for (i = 0; i < k8_northbridges.num; i++) {
+		dev = k8_northbridges.nb_misc[i];
 		new_aper_base = read_aperture(dev, &new_aper_size);
 		if (!new_aper_base)
 			goto nommu;
@@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void)
 	if (!no_agp)
 		return;
 
-	for (i = 0; i < num_k8_northbridges; i++) {
+	if (!k8_northbridges.gart_supported)
+		return;
+
+	for (i = 0; i < k8_northbridges.num; i++) {
 		u32 ctl;
 
-		dev = k8_northbridges[i];
+		dev = k8_northbridges.nb_misc[i];
 		pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
 
 		ctl &= ~GARTEN;
@@ -739,7 +748,7 @@ int __init gart_iommu_init(void)
 	unsigned long scratch;
 	long i;
 
-	if (num_k8_northbridges == 0)
+	if (!k8_northbridges.gart_supported)
 		return 0;
 
 #ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f199..00000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-static inline u32 cyc2us(u32 cycles)
-{
-	/* The Power Management Timer ticks at 3.579545 ticks per microsecond.
-	 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
-	 *
-	 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
-	 * easily be multiplied with 286 (=0x11E) without having to fear
-	 * u32 overflows.
-	 */
-	cycles *= 286;
-	return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
-	u32 a, b;
-	for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
-	     a == b;
-	     b = inl(pmtmr_ioport) & ACPI_PM_MASK)
-		cpu_relax();
-	return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
-	u32 a, b;
-	a = pmtimer_wait_tick();
-	do {
-		b = inl(pmtmr_ioport);
-		cpu_relax();
-	} while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
-	pmtmr_ioport = 0;
-	return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd..b3d7a3a04f3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	load_TLS(next, cpu);
 
 	/* Must be after DS reload */
-	unlazy_fpu(prev_p);
+	__unlazy_fpu(prev_p);
 
 	/* Make sure cpu is ready for new context */
 	if (preload_fpu)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83..7a4cf14223b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -84,7 +84,7 @@ static int __init reboot_setup(char *str)
 			}
 				/* we will leave sorting out the final value
 				   when we are ready to reboot, since we might not
-				   have set up boot_cpu_id or smp_num_cpu */
+				   have detected BSP APIC ID or smp_num_cpu */
 			break;
 #endif /* CONFIG_SMP */
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c3a4fbb2b99..a59f6a6df5e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -83,7 +83,6 @@
 #include <asm/dmi.h>
 #include <asm/io_apic.h>
 #include <asm/ist.h>
-#include <asm/vmi.h>
 #include <asm/setup_arch.h>
 #include <asm/bios_ebda.h>
 #include <asm/cacheflush.h>
@@ -107,11 +106,12 @@
 #include <asm/percpu.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
 #include <asm/mce.h>
+#include <asm/alternative.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -125,7 +125,6 @@ unsigned long max_pfn_mapped;
 RESERVE_BRK(dmi_alloc, 65536);
 #endif
 
-unsigned int boot_cpu_id __read_mostly;
 
 static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
 unsigned long _brk_end = (unsigned long)__brk_base;
@@ -618,79 +617,7 @@ static __init void reserve_ibft_region(void)
 		reserve_early_overlap_ok(addr, addr + size, "ibft");
 }
 
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
-{
-	printk(KERN_NOTICE
-		"%s detected: BIOS may corrupt low RAM, working around it.\n",
-		d->ident);
-
-	e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
-	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-
-	return 0;
-}
-#endif
-
-/* List of systems that have known low memory corruption BIOS problems */
-static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-	{
-		.callback = dmi_low_memory_corruption,
-		.ident = "AMI BIOS",
-		.matches = {
-			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
-		},
-	},
-	{
-		.callback = dmi_low_memory_corruption,
-		.ident = "Phoenix BIOS",
-		.matches = {
-			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
-		},
-	},
-	{
-		.callback = dmi_low_memory_corruption,
-		.ident = "Phoenix/MSC BIOS",
-		.matches = {
-			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
-		},
-	},
-	/*
-	 * AMI BIOS with low memory corruption was found on Intel DG45ID and
-	 * DG45FC boards.
-	 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
-	 * match only DMI_BOARD_NAME and see if there is more bad products
-	 * with this vendor.
-	 */
-	{
-		.callback = dmi_low_memory_corruption,
-		.ident = "AMI BIOS",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
-		},
-	},
-	{
-		.callback = dmi_low_memory_corruption,
-		.ident = "AMI BIOS",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
-		},
-	},
-	/*
-	 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
-	 * match on the product name.
-	 */
-	{
-		.callback = dmi_low_memory_corruption,
-		.ident = "Phoenix BIOS",
-		.matches = {
-			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
-		},
-	},
-#endif
-	{}
-};
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
 
 static void __init trim_bios_range(void)
 {
@@ -698,8 +625,14 @@ static void __init trim_bios_range(void)
 	 * A special case is the first 4Kb of memory;
 	 * This is a BIOS owned area, not kernel ram, but generally
 	 * not listed as such in the E820 table.
+	 *
+	 * This typically reserves additional memory (64KiB by default)
+	 * since some BIOSes are known to corrupt low memory.  See the
+	 * Kconfig help text for X86_RESERVE_LOW.
 	 */
-	e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+	e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
+			  E820_RAM, E820_RESERVED);
+
 	/*
 	 * special case: Some BIOSen report the PC BIOS
 	 * area (640->1Mb) as ram even though it is not.
@@ -709,6 +642,28 @@ static void __init trim_bios_range(void)
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 }
 
+static int __init parse_reservelow(char *p)
+{
+	unsigned long long size;
+
+	if (!p)
+		return -EINVAL;
+
+	size = memparse(p, &p);
+
+	if (size < 4096)
+		size = 4096;
+
+	if (size > 640*1024)
+		size = 640*1024;
+
+	reserve_low = size;
+
+	return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -726,6 +681,7 @@ void __init setup_arch(char **cmdline_p)
 {
 	int acpi = 0;
 	int k8 = 0;
+	unsigned long flags;
 
 #ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -734,10 +690,10 @@ void __init setup_arch(char **cmdline_p)
 	printk(KERN_INFO "Command line: %s\n", boot_command_line);
 #endif
 
-	/* VMI may relocate the fixmap; do this before touching ioremap area */
-	vmi_init();
-
-	/* OFW also may relocate the fixmap */
+	/*
+	 * If we have OLPC OFW, we might end up relocating the fixmap due to
+	 * reserve_top(), so do this before touching the ioremap area.
+	 */
 	olpc_ofw_detect();
 
 	early_trap_init();
@@ -838,9 +794,6 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_report_nx();
 
-	/* Must be before kernel pagetables are setup */
-	vmi_activate();
-
 	/* after early param, so could get panic from serial */
 	reserve_early_setup_data();
 
@@ -863,8 +816,6 @@ void __init setup_arch(char **cmdline_p)
 
 	dmi_scan_machine();
 
-	dmi_check_system(bad_bios_dmi_table);
-
 	/*
 	 * VMware detection requires dmi to be available, so this
 	 * needs to be done after dmi_scan_machine, for the BP.
@@ -1071,6 +1022,10 @@ void __init setup_arch(char **cmdline_p)
 	x86_init.oem.banner();
 
 	mcheck_init();
+
+	local_irq_save(flags);
+	arch_init_ideal_nop5();
+	local_irq_restore(flags);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae645..2335c15c93a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void)
 		 * Up to this point, the boot CPU has been using .init.data
 		 * area.  Reload any changed state for the boot CPU.
 		 */
-		if (cpu == boot_cpu_id)
+		if (!cpu)
 			switch_to_new_gdt(cpu);
 	}
 
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index cb22acf3ed0..dd4c281ffe5 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -34,7 +34,7 @@
 #ifdef CONFIG_X86_LOCAL_APIC
 static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 
-void __init mp_sfi_register_lapic_address(unsigned long address)
+static void __init mp_sfi_register_lapic_address(unsigned long address)
 {
 	mp_lapic_addr = address;
 
@@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address)
 }
 
 /* All CPUs enumerated by SFI must be present and enabled */
-void __cpuinit mp_sfi_register_lapic(u8 id)
+static void __cpuinit mp_sfi_register_lapic(u8 id)
 {
 	if (MAX_APICS - id <= 0) {
 		pr_warning("Processor #%d invalid (max %d)\n",
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4dd70..2ced73ba048 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
-#include <asm/vmi.h>
+#include <asm/mwait.h>
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
@@ -311,7 +311,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	__flush_tlb_all();
 #endif
 
-	vmi_bringup();
 	cpu_init();
 	preempt_disable();
 	smp_callin();
@@ -397,6 +396,19 @@ void __cpuinit smp_store_cpu_info(int id)
 		identify_secondary_cpu(c);
 }
 
+static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+{
+	struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
+	struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
+
+	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
+	cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
+	cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
+	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
+	cpumask_set_cpu(cpu1, c2->llc_shared_map);
+	cpumask_set_cpu(cpu2, c1->llc_shared_map);
+}
+
 
 void __cpuinit set_cpu_sibling_map(int cpu)
 {
@@ -409,14 +421,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		for_each_cpu(i, cpu_sibling_setup_mask) {
 			struct cpuinfo_x86 *o = &cpu_data(i);
 
-			if (c->phys_proc_id == o->phys_proc_id &&
-			    c->cpu_core_id == o->cpu_core_id) {
-				cpumask_set_cpu(i, cpu_sibling_mask(cpu));
-				cpumask_set_cpu(cpu, cpu_sibling_mask(i));
-				cpumask_set_cpu(i, cpu_core_mask(cpu));
-				cpumask_set_cpu(cpu, cpu_core_mask(i));
-				cpumask_set_cpu(i, c->llc_shared_map);
-				cpumask_set_cpu(cpu, o->llc_shared_map);
+			if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+				if (c->phys_proc_id == o->phys_proc_id &&
+				    c->compute_unit_id == o->compute_unit_id)
+					link_thread_siblings(cpu, i);
+			} else if (c->phys_proc_id == o->phys_proc_id &&
+				   c->cpu_core_id == o->cpu_core_id) {
+				link_thread_siblings(cpu, i);
 			}
 		}
 	} else {
@@ -1109,8 +1120,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	}
 	set_cpu_sibling_map(0);
 
-	enable_IR_x2apic();
-	default_setup_apic_routing();
 
 	if (smp_sanity_check(max_cpus) < 0) {
 		printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1127,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		goto out;
 	}
 
+	default_setup_apic_routing();
+
 	preempt_disable();
 	if (read_apic_id() != boot_cpu_physical_apicid) {
 		panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1383,11 +1394,88 @@ void play_dead_common(void)
 	local_irq_disable();
 }
 
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	unsigned int highest_cstate = 0;
+	unsigned int highest_subcstate = 0;
+	int i;
+	void *mwait_ptr;
+
+	if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
+		return;
+	if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
+		return;
+	if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+		return;
+
+	eax = CPUID_MWAIT_LEAF;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	/*
+	 * eax will be 0 if EDX enumeration is not valid.
+	 * Initialized below to cstate, sub_cstate value when EDX is valid.
+	 */
+	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+		eax = 0;
+	} else {
+		edx >>= MWAIT_SUBSTATE_SIZE;
+		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+			if (edx & MWAIT_SUBSTATE_MASK) {
+				highest_cstate = i;
+				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+			}
+		}
+		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+			(highest_subcstate - 1);
+	}
+
+	/*
+	 * This should be a memory location in a cache line which is
+	 * unlikely to be touched by other processors.  The actual
+	 * content is immaterial as it is not actually modified in any way.
+	 */
+	mwait_ptr = &current_thread_info()->flags;
+
+	wbinvd();
+
+	while (1) {
+		/*
+		 * The CLFLUSH is a workaround for erratum AAI65 for
+		 * the Xeon 7400 series.  It's not clear it is actually
+		 * needed, but it should be harmless in either case.
+		 * The WBINVD is insufficient due to the spurious-wakeup
+		 * case where we return around the loop.
+		 */
+		clflush(mwait_ptr);
+		__monitor(mwait_ptr, 0, 0);
+		mb();
+		__mwait(eax, 0);
+	}
+}
+
+static inline void hlt_play_dead(void)
+{
+	if (current_cpu_data.x86 >= 4)
+		wbinvd();
+
+	while (1) {
+		native_halt();
+	}
+}
+
 void native_play_dead(void)
 {
 	play_dead_common();
 	tboot_shutdown(TB_SHUTDOWN_WFS);
-	wbinvd_halt();
+
+	mwait_play_dead();	/* Only returns on failure */
+	hlt_play_dead();
 }
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34..0b0cb5fede1 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
 		  const char *const envp[])
 {
 	long __res;
-	asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
+	asm volatile ("int $0x80"
 	: "=a" (__res)
-	: "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
+	: "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
 	return __res;
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8..d43968503dd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void)
 }
 EXPORT_SYMBOL_GPL(math_state_restore);
 
-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
-{
-	printk(KERN_EMERG
-		"math-emulation not enabled and no coprocessor found.\n");
-	printk(KERN_EMERG "killing %s.\n", current->comm);
-	force_sig(SIGFPE, current);
-	schedule();
-}
-#endif /* CONFIG_MATH_EMULATION */
-
 dotraplinkage void __kprobes
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_MATH_EMULATION
 	if (read_cr0() & X86_CR0_EM) {
 		struct math_emu_info info = { };
 
@@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 
 		info.regs = regs;
 		math_emulate(&info);
-	} else {
-		math_state_restore(); /* interrupts still off */
-		conditional_sti(regs);
+		return;
 	}
-#else
-	math_state_restore();
+#endif
+	math_state_restore(); /* interrupts still off */
+#ifdef CONFIG_X86_32
+	conditional_sti(regs);
 #endif
 }
 
@@ -881,18 +870,6 @@ void __init trap_init(void)
 #endif
 
 #ifdef CONFIG_X86_32
-	if (cpu_has_fxsr) {
-		printk(KERN_INFO "Enabling fast FPU save and restore... ");
-		set_in_cr4(X86_CR4_OSFXSR);
-		printk("done.\n");
-	}
-	if (cpu_has_xmm) {
-		printk(KERN_INFO
-			"Enabling unmasked SIMD FPU exception support... ");
-		set_in_cr4(X86_CR4_OSXMMEXCPT);
-		printk("done.\n");
-	}
-
 	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a..0c40d8b7241 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
 
 __setup("notsc", notsc_setup);
 
+static int no_sched_irq_time;
+
 static int __init tsc_setup(char *str)
 {
 	if (!strcmp(str, "reliable"))
 		tsc_clocksource_reliable = 1;
+	if (!strncmp(str, "noirqtime", 9))
+		no_sched_irq_time = 1;
 	return 1;
 }
 
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		sched_clock_stable = 0;
+		disable_sched_clock_irqtime();
 		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
 		if (clocksource_tsc.mult)
@@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void)
 	clocksource_register_khz(&clocksource_tsc, tsc_khz);
 }
 
-#ifdef CONFIG_X86_64
-/*
- * calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency
- */
-#define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
-{
-	int tsc_start, tsc_now;
-	int i, no_ctr_free;
-	unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
-	unsigned long flags;
-
-	for (i = 0; i < 4; i++)
-		if (avail_to_resrv_perfctr_nmi_bit(i))
-			break;
-	no_ctr_free = (i == 4);
-	if (no_ctr_free) {
-		WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
-		     "cpu_khz value may be incorrect.\n");
-		i = 3;
-		rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
-		wrmsrl(MSR_K7_EVNTSEL3, 0);
-		rdmsrl(MSR_K7_PERFCTR3, pmc3);
-	} else {
-		reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-	local_irq_save(flags);
-	/* start measuring cycles, incrementing from 0 */
-	wrmsrl(MSR_K7_PERFCTR0 + i, 0);
-	wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
-	rdtscl(tsc_start);
-	do {
-		rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
-		tsc_now = get_cycles();
-	} while ((tsc_now - tsc_start) < TICK_COUNT);
-
-	local_irq_restore(flags);
-	if (no_ctr_free) {
-		wrmsrl(MSR_K7_EVNTSEL3, 0);
-		wrmsrl(MSR_K7_PERFCTR3, pmc3);
-		wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
-	} else {
-		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-
-	return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-#else
-static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
-#endif
-
 void __init tsc_init(void)
 {
 	u64 lpj;
@@ -964,10 +915,6 @@ void __init tsc_init(void)
 		return;
 	}
 
-	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
-			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
-		cpu_khz = calibrate_cpu();
-
 	printk("Detected %lu.%03lu MHz processor.\n",
 			(unsigned long)cpu_khz / 1000,
 			(unsigned long)cpu_khz % 1000);
@@ -987,6 +934,9 @@ void __init tsc_init(void)
 	/* now allow native_sched_clock() to use rdtsc */
 	tsc_disabled = 0;
 
+	if (!no_sched_irq_time)
+		enable_sched_clock_irqtime();
+
 	lpj = ((u64)tsc_khz * 1000);
 	do_div(lpj, HZ);
 	lpj_fine = lpj;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb752..00000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/pgalloc.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-#include <asm/setup.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
-   (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
-   (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
-	void (*cpuid)(void /* non-c */);
-	void (*_set_ldt)(u32 selector);
-	void (*set_tr)(u32 selector);
-	void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
-	void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
-	void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
-	void (*set_kernel_stack)(u32 selector, u32 sp0);
-	void (*allocate_page)(u32, u32, u32, u32, u32);
-	void (*release_page)(u32, u32);
-	void (*set_pte)(pte_t, pte_t *, unsigned);
-	void (*update_pte)(pte_t *, unsigned);
-	void (*set_linear_mapping)(int, void *, u32, u32);
-	void (*_flush_tlb)(int);
-	void (*set_initial_ap_state)(int, int);
-	void (*halt)(void);
-  	void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP  0xe9
-#define MNEM_RET  0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE  5
-
-static inline void patch_offset(void *insnbuf,
-				unsigned long ip, unsigned long dest)
-{
-        *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
-			       unsigned long ip)
-{
-	u64 reloc;
-	struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
-	reloc = call_vrom_long_func(vmi_rom, get_reloc,	call);
-	switch(rel->type) {
-		case VMI_RELOCATION_CALL_REL:
-			BUG_ON(len < 5);
-			*(char *)insnbuf = MNEM_CALL;
-			patch_offset(insnbuf, ip, (unsigned long)rel->eip);
-			return 5;
-
-		case VMI_RELOCATION_JUMP_REL:
-			BUG_ON(len < 5);
-			*(char *)insnbuf = MNEM_JMP;
-			patch_offset(insnbuf, ip, (unsigned long)rel->eip);
-			return 5;
-
-		case VMI_RELOCATION_NOP:
-			/* obliterate the whole thing */
-			return 0;
-
-		case VMI_RELOCATION_NONE:
-			/* leave native code in place */
-			break;
-
-		default:
-			BUG();
-	}
-	return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence.  The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
-			  unsigned long ip, unsigned len)
-{
-	switch (type) {
-		case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
-			return patch_internal(VMI_CALL_DisableInterrupts, len,
-					      insns, ip);
-		case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
-			return patch_internal(VMI_CALL_EnableInterrupts, len,
-					      insns, ip);
-		case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
-			return patch_internal(VMI_CALL_SetInterruptMask, len,
-					      insns, ip);
-		case PARAVIRT_PATCH(pv_irq_ops.save_fl):
-			return patch_internal(VMI_CALL_GetInterruptMask, len,
-					      insns, ip);
-		case PARAVIRT_PATCH(pv_cpu_ops.iret):
-			return patch_internal(VMI_CALL_IRET, len, insns, ip);
-		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
-			return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
-		default:
-			break;
-	}
-	return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
-                               unsigned int *cx, unsigned int *dx)
-{
-	int override = 0;
-	if (*ax == 1)
-		override = 1;
-        asm volatile ("call *%6"
-                      : "=a" (*ax),
-                        "=b" (*bx),
-                        "=c" (*cx),
-                        "=d" (*dx)
-                      : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
-	if (override) {
-		if (disable_pse)
-			*dx &= ~X86_FEATURE_PSE;
-		if (disable_pge)
-			*dx &= ~X86_FEATURE_PGE;
-		if (disable_sep)
-			*dx &= ~X86_FEATURE_SEP;
-		if (disable_tsc)
-			*dx &= ~X86_FEATURE_TSC;
-		if (disable_mtrr)
-			*dx &= ~X86_FEATURE_MTRR;
-	}
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
-	if (gdt[nr].a != new->a || gdt[nr].b != new->b)
-		write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
-	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
-	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
-	unsigned cpu = smp_processor_id();
-	struct desc_struct desc;
-
-	pack_descriptor(&desc, (unsigned long)addr,
-			entries * sizeof(struct desc_struct) - 1,
-			DESC_LDT, 0);
-	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
-	vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
-	vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
-	u32 *idt_entry = (u32 *)g;
-	vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
-				const void *desc, int type)
-{
-	u32 *gdt_entry = (u32 *)desc;
-	vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
-				const void *desc)
-{
-	u32 *ldt_entry = (u32 *)desc;
-	vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
-				   struct thread_struct *thread)
-{
-	tss->x86_tss.sp0 = thread->sp0;
-
-	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-		tss->x86_tss.ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-	vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
-	vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
-	vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
-	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- 	/*
-	 * This call comes in very early, before mem_map is setup.
-	 * It is called only for swapper_pg_dir, which already has
-	 * data on it.
-	 */
-	vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
-	vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
-	vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
-	vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-	unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
-	vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags.  We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush).  We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs.  We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
-                                       (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user)                           \
-        ((level) | (is_current_as(mm, user) ?                           \
-                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user)                     \
-        ((level) | (is_current_as(mm, user) ?                           \
-                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
-	/* XXX because of set_pmd_pte, this can be called on PT or PD layers */
-	vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
-	const pte_t pte = { .pte = pmdval.pmd };
-#else
-	const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
-	vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
-	/*
-	 * XXX This is called from set_pmd_pte, but at both PT
-	 * and PD layers so the VMI_PAGE_PT flag is wrong.  But
-	 * it is only called for large page mapping changes,
-	 * the Xen backend, doesn't support large pages, and the
-	 * ESX backend doesn't depend on the flag.
-	 */
-	set_64bit((unsigned long long *)ptep,pte_val(pteval));
-	vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
-	/* Um, eww */
-	const pte_t pte = { .pte = pudval.pgd.pgd };
-	vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	const pte_t pte = { .pte = 0 };
-	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
-	const pte_t pte = { .pte = 0 };
-	vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
-		     unsigned long start_esp)
-{
-	struct vmi_ap_state ap;
-
-	/* Default everything to zero.  This is fine for most GPRs. */
-	memset(&ap, 0, sizeof(struct vmi_ap_state));
-
-	ap.gdtr_limit = GDT_SIZE - 1;
-	ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
-	ap.idtr_limit = IDT_ENTRIES * 8 - 1;
-	ap.idtr_base = (unsigned long) idt_table;
-
-	ap.ldtr = 0;
-
-	ap.cs = __KERNEL_CS;
-	ap.eip = (unsigned long) start_eip;
-	ap.ss = __KERNEL_DS;
-	ap.esp = (unsigned long) start_esp;
-
-	ap.ds = __USER_DS;
-	ap.es = __USER_DS;
-	ap.fs = __KERNEL_PERCPU;
-	ap.gs = __KERNEL_STACK_CANARY;
-
-	ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
-	/* efer should match BSP efer. */
-	if (cpu_has_nx) {
-		unsigned l, h;
-		rdmsr(MSR_EFER, l, h);
-		ap.efer = (unsigned long long) h << 32 | l;
-	}
-#endif
-
-	ap.cr3 = __pa(swapper_pg_dir);
-	/* Protected mode, paging, AM, WP, NE, MP. */
-	ap.cr0 = 0x80050023;
-	ap.cr4 = mmu_cr4_features;
-	vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
-	paravirt_start_context_switch(prev);
-	vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
-	vmi_ops.set_lazy_mode(0);
-	paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
-	paravirt_enter_lazy_mmu();
-	vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
-	vmi_ops.set_lazy_mode(0);
-	paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
-	struct pci_header *pci;
-	struct pnp_header *pnp;
-	const char *manufacturer = "UNKNOWN";
-	const char *product = "UNKNOWN";
-	const char *license = "unspecified";
-
-	if (rom->rom_signature != 0xaa55)
-		return 0;
-	if (rom->vrom_signature != VMI_SIGNATURE)
-		return 0;
-	if (rom->api_version_maj != VMI_API_REV_MAJOR ||
-	    rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
-		printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
-				rom->api_version_maj,
-				rom->api_version_min);
-		return 0;
-	}
-
-	/*
-	 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
-	 * the PCI header and device type to make sure this is really a
-	 * VMI device.
-	 */
-	if (!rom->pci_header_offs) {
-		printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
-		return 0;
-	}
-
-	pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
-	if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
-	    pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
-		/* Allow it to run... anyways, but warn */
-		printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
-	}
-
-	if (rom->pnp_header_offs) {
-		pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
-		if (pnp->manufacturer_offset)
-			manufacturer = (const char *)rom+pnp->manufacturer_offset;
-		if (pnp->product_offset)
-			product = (const char *)rom+pnp->product_offset;
-	}
-
-	if (rom->license_offs)
-		license = (char *)rom+rom->license_offs;
-
-	printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
-		manufacturer, product,
-		rom->api_version_maj, rom->api_version_min,
-		pci->rom_version_maj, pci->rom_version_min);
-
-	/* Don't allow BSD/MIT here for now because we don't want to end up
-	   with any binary only shim layers */
-	if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
-		printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
-			license);
-		return 0;
-	}
-
-	return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
-	unsigned long base;
-
-	/* VMI ROM is in option ROM area, check signature */
-	for (base = 0xC0000; base < 0xE0000; base += 2048) {
-		struct vrom_header *romstart;
-		romstart = (struct vrom_header *)isa_bus_to_virt(base);
-		if (check_vmi_rom(romstart)) {
-			vmi_rom = romstart;
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
- 	/* We must establish the lowmem mapping for MMU ops to work */
-	if (vmi_ops.set_linear_mapping)
-		vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
-	u64 reloc;
-	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-	reloc = call_vrom_long_func(vmi_rom, get_reloc,	vmicall);
-	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
-	if (rel->type == VMI_RELOCATION_CALL_REL)
-		return (void *)rel->eip;
-	else
-		return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall)				\
-do {								\
-	reloc = call_vrom_long_func(vmi_rom, get_reloc,		\
-				    VMI_CALL_##vmicall);	\
-	if (rel->type == VMI_RELOCATION_CALL_REL) 		\
-		opname = (void *)rel->eip;			\
-	else if (rel->type == VMI_RELOCATION_NOP) 		\
-		opname = (void *)vmi_nop;			\
-	else if (rel->type != VMI_RELOCATION_NONE)		\
-		printk(KERN_WARNING "VMI: Unknown relocation "	\
-				    "type %d for " #vmicall"\n",\
-					rel->type);		\
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a tranlation stub.  Ignore NOPs, since it is not clear
- * a NOP * VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall)		\
-do {								\
-	reloc = call_vrom_long_func(vmi_rom, get_reloc,		\
-				    VMI_CALL_##vmicall);	\
-	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);		\
-	if (rel->type == VMI_RELOCATION_CALL_REL) {		\
-		opname = wrapper;				\
-		vmi_ops.cache = (void *)rel->eip;		\
-	}							\
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
-	short kernel_cs;
-	u64 reloc;
-	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
-	/*
-	 * Prevent page tables from being allocated in highmem, even if
-	 * CONFIG_HIGHPTE is enabled.
-	 */
-	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
-	if (call_vrom_func(vmi_rom, vmi_init) != 0) {
-		printk(KERN_ERR "VMI ROM failed to initialize!");
-		return 0;
-	}
-	savesegment(cs, kernel_cs);
-
-	pv_info.paravirt_enabled = 1;
-	pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
-	pv_info.name = "vmi [deprecated]";
-
-	pv_init_ops.patch = vmi_patch;
-
-	/*
-	 * Many of these operations are ABI compatible with VMI.
-	 * This means we can fill in the paravirt-ops with direct
-	 * pointers into the VMI ROM.  If the calling convention for
-	 * these operations changes, this code needs to be updated.
-	 *
-	 * Exceptions
-	 *  CPUID paravirt-op uses pointers, not the native ISA
-	 *  halt has no VMI equivalent; all VMI halts are "safe"
-	 *  no MSR support yet - just trap and emulate.  VMI uses the
-	 *    same ABI as the native ISA, but Linux wants exceptions
-	 *    from bogus MSR read / write handled
-	 *  rdpmc is not yet used in Linux
-	 */
-
-	/* CPUID is special, so very special it gets wrapped like a present */
-	para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
-	para_fill(pv_cpu_ops.clts, CLTS);
-	para_fill(pv_cpu_ops.get_debugreg, GetDR);
-	para_fill(pv_cpu_ops.set_debugreg, SetDR);
-	para_fill(pv_cpu_ops.read_cr0, GetCR0);
-	para_fill(pv_mmu_ops.read_cr2, GetCR2);
-	para_fill(pv_mmu_ops.read_cr3, GetCR3);
-	para_fill(pv_cpu_ops.read_cr4, GetCR4);
-	para_fill(pv_cpu_ops.write_cr0, SetCR0);
-	para_fill(pv_mmu_ops.write_cr2, SetCR2);
-	para_fill(pv_mmu_ops.write_cr3, SetCR3);
-	para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
-	para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
-	para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
-	para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
-	para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
-	para_fill(pv_cpu_ops.wbinvd, WBINVD);
-	para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
-	/* The following we emulate with trap and emulate for now */
-	/* paravirt_ops.read_msr = vmi_rdmsr */
-	/* paravirt_ops.write_msr = vmi_wrmsr */
-	/* paravirt_ops.rdpmc = vmi_rdpmc */
-
-	/* TR interface doesn't pass TR value, wrap */
-	para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
-	/* LDT is special, too */
-	para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
-	para_fill(pv_cpu_ops.load_gdt, SetGDT);
-	para_fill(pv_cpu_ops.load_idt, SetIDT);
-	para_fill(pv_cpu_ops.store_gdt, GetGDT);
-	para_fill(pv_cpu_ops.store_idt, GetIDT);
-	para_fill(pv_cpu_ops.store_tr, GetTR);
-	pv_cpu_ops.load_tls = vmi_load_tls;
-	para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
-		  write_ldt_entry, WriteLDTEntry);
-	para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
-		  write_gdt_entry, WriteGDTEntry);
-	para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
-		  write_idt_entry, WriteIDTEntry);
-	para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
-	para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
-	para_fill(pv_cpu_ops.io_delay, IODelay);
-
-	para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
-		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
-		  set_lazy_mode, SetLazyMode);
-
-	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
-		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
-		  set_lazy_mode, SetLazyMode);
-
-	/* user and kernel flush are just handled with different flags to FlushTLB */
-	para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
-	para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
-	para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
-	/*
-	 * Until a standard flag format can be agreed on, we need to
-	 * implement these as wrappers in Linux.  Get the VMI ROM
-	 * function pointers for the two backend calls.
-	 */
-#ifdef CONFIG_X86_PAE
-	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
-	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
-	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
-	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
-	if (vmi_ops.set_pte) {
-		pv_mmu_ops.set_pte = vmi_set_pte;
-		pv_mmu_ops.set_pte_at = vmi_set_pte_at;
-		pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
-		pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
-		pv_mmu_ops.set_pud = vmi_set_pud;
-		pv_mmu_ops.pte_clear = vmi_pte_clear;
-		pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
-	}
-
-	if (vmi_ops.update_pte) {
-		pv_mmu_ops.pte_update = vmi_update_pte;
-		pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
-	}
-
-	vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
-	if (vmi_ops.allocate_page) {
-		pv_mmu_ops.alloc_pte = vmi_allocate_pte;
-		pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
-		pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
-	}
-
-	vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
-	if (vmi_ops.release_page) {
-		pv_mmu_ops.release_pte = vmi_release_pte;
-		pv_mmu_ops.release_pmd = vmi_release_pmd;
-		pv_mmu_ops.pgd_free = vmi_pgd_free;
-	}
-
-	/* Set linear is needed in all cases */
-	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-
-	/*
-	 * These MUST always be patched.  Don't support indirect jumps
-	 * through these operations, as the VMI interface may use either
-	 * a jump or a call to get to these operations, depending on
-	 * the backend.  They are performance critical anyway, so requiring
-	 * a patch is not a big problem.
-	 */
-	pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
-	pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
-	para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       para_fill(apic->read, APICRead);
-       para_fill(apic->write, APICWrite);
-#endif
-
-	/*
-	 * Check for VMI timer functionality by probing for a cycle frequency method
-	 */
-	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
-	if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
-		vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
-		vmi_timer_ops.get_cycle_counter =
-			vmi_get_function(VMI_CALL_GetCycleCounter);
-		vmi_timer_ops.get_wallclock =
-			vmi_get_function(VMI_CALL_GetWallclockTime);
-		vmi_timer_ops.wallclock_updated =
-			vmi_get_function(VMI_CALL_WallclockUpdated);
-		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
-		vmi_timer_ops.cancel_alarm =
-			 vmi_get_function(VMI_CALL_CancelAlarm);
-		x86_init.timers.timer_init = vmi_time_init;
-#ifdef CONFIG_X86_LOCAL_APIC
-		x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
-		x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
-#endif
-		pv_time_ops.sched_clock = vmi_sched_clock;
-		x86_platform.calibrate_tsc = vmi_tsc_khz;
-		x86_platform.get_wallclock = vmi_get_wallclock;
-		x86_platform.set_wallclock = vmi_set_wallclock;
-
-		/* We have true wallclock functions; disable CMOS clock sync */
-		no_sync_cmos_clock = 1;
-	} else {
-		disable_noidle = 1;
-		disable_vmi_timer = 1;
-	}
-
-	para_fill(pv_irq_ops.safe_halt, Halt);
-
-	/*
-	 * Alternative instruction rewriting doesn't happen soon enough
-	 * to convert VMI_IRET to a call instead of a jump; so we have
-	 * to do this before IRQs get reenabled.  Fortunately, it is
-	 * idempotent.
-	 */
-	apply_paravirt(__parainstructions, __parainstructions_end);
-
-	vmi_bringup();
-
-	return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
-	if (!vmi_rom)
-		probe_vmi_rom();
-	else
-		check_vmi_rom(vmi_rom);
-
-	/* In case probing for or validating the ROM failed, basil */
-	if (!vmi_rom)
-		return;
-
-	reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
-	/* This is virtual hardware; timer routing is wired correctly */
-	no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
-	unsigned long flags;
-
-	if (!vmi_rom)
-		return;
-
-	local_irq_save(flags);
-	activate_vmi();
-	local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (!strcmp(arg, "disable_pge")) {
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-		disable_pge = 1;
-	} else if (!strcmp(arg, "disable_pse")) {
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
-		disable_pse = 1;
-	} else if (!strcmp(arg, "disable_sep")) {
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
-		disable_sep = 1;
-	} else if (!strcmp(arg, "disable_tsc")) {
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
-		disable_tsc = 1;
-	} else if (!strcmp(arg, "disable_mtrr")) {
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
-		disable_mtrr = 1;
-	} else if (!strcmp(arg, "disable_timer")) {
-		disable_vmi_timer = 1;
-		disable_noidle = 1;
-	} else if (!strcmp(arg, "disable_noidle"))
-		disable_noidle = 1;
-	return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd7..00000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-#include <asm/irq_vectors.h>
-
-#define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
-	/* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
-	 * cycle counter. */
-	return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
-	unsigned long long wallclock;
-	wallclock = vmi_timer_ops.get_wallclock(); // nsec
-	(void)do_div(wallclock, 1000000000);       // sec
-
-	return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
-	return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
-	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* x86_platform.calibrate_tsc = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
-	unsigned long long khz;
-	khz = vmi_timer_ops.get_cycle_frequency();
-	(void)do_div(khz, 1000);
-	return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
-	return IRQ0_VECTOR;
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
-	unsigned long val = apic_read(APIC_LVTT);
-	apic_write(APIC_LVTT, vmi_get_timer_vector());
-
-	return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
-	unsigned long val = apic_read(APIC_LVTT);
-	apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
-	unsigned long val = apic_read(APIC_LVTT);
-	apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
-	ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
-	.name 		= "VMI-LOCAL",
-	.startup 	= startup_timer_irq,
-	.mask	 	= mask_timer_irq,
-	.unmask	 	= unmask_timer_irq,
-	.ack 		= ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0    0x00000000
-#define VMI_ALARM_WIRED_LVTT    0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
-	return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
-			       struct clock_event_device *evt)
-{
-	cycle_t now, cycles_per_hz;
-	BUG_ON(!irqs_disabled());
-
-	switch (mode) {
-	case CLOCK_EVT_MODE_ONESHOT:
-	case CLOCK_EVT_MODE_RESUME:
-		break;
-	case CLOCK_EVT_MODE_PERIODIC:
-		cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
-		(void)do_div(cycles_per_hz, HZ);
-		now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
-		vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		switch (evt->mode) {
-		case CLOCK_EVT_MODE_ONESHOT:
-			vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
-			break;
-		case CLOCK_EVT_MODE_PERIODIC:
-			vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
-			break;
-		default:
-			break;
-		}
-		break;
-	default:
-		break;
-	}
-}
-
-static int vmi_timer_next_event(unsigned long delta,
-				struct clock_event_device *evt)
-{
-	/* Unfortunately, set_next_event interface only passes relative
-	 * expiry, but we want absolute expiry.  It'd be better if were
-	 * were passed an absolute expiry, since a bunch of time may
-	 * have been stolen between the time the delta is computed and
-	 * when we set the alarm below. */
-	cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
-	BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
-	vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
-	return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
-	.name		= "vmi-timer",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.shift		= 22,
-	.set_mode	= vmi_timer_set_mode,
-	.set_next_event = vmi_timer_next_event,
-	.rating         = 1000,
-	.irq		= 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
-	struct clock_event_device *evt = &__get_cpu_var(local_events);
-	evt->event_handler(evt);
-	return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action  = {
-	.name 		= "vmi-timer",
-	.handler 	= vmi_timer_interrupt,
-	.flags 		= IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
-	cycle_t cycles_per_msec;
-	struct clock_event_device *evt;
-
-	int cpu = smp_processor_id();
-	evt = &__get_cpu_var(local_events);
-
-	/* Use cycles_per_msec since div_sc params are 32-bits. */
-	cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
-	(void)do_div(cycles_per_msec, 1000);
-
-	memcpy(evt, &vmi_clockevent, sizeof(*evt));
-	/* Must pick .shift such that .mult fits in 32-bits.  Choosing
-	 * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
-	 * before overflow. */
-	evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
-	/* Upper bound is clockevent's use of ulong for cycle deltas. */
-	evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
-	evt->min_delta_ns = clockevent_delta2ns(1, evt);
-	evt->cpumask = cpumask_of(cpu);
-
-	printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
-	       evt->name, evt->mult, evt->shift);
-	clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
-	unsigned int cpu;
-	/* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
-	outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
-	vmi_time_init_clockevent();
-	setup_irq(0, &vmi_clock_action);
-	for_each_possible_cpu(cpu)
-		per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
-	/*
-	 * On APIC systems, we want local timers to fire on each cpu.  We do
-	 * this by programming LVTT to deliver timer events to the IRQ handler
-	 * for IRQ-0, since we can't re-use the APIC local timer handler
-	 * without interfering with that code.
-	 */
-	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
-	local_irq_disable();
-#ifdef CONFIG_SMP
-	/*
-	 * XXX handle_percpu_irq only defined for SMP; we need to switch over
-	 * to using it, since this is a local interrupt, which each CPU must
-	 * handle individually without locking out or dropping simultaneous
-	 * local timers on other CPUs.  We also don't want to trigger the
-	 * quirk workaround code for interrupts which gets invoked from
-	 * handle_percpu_irq via eoi, so we use our own IRQ chip.
-	 */
-	set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
-	set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
-	vmi_wiring = VMI_ALARM_WIRED_LVTT;
-	apic_write(APIC_LVTT, vmi_get_timer_vector());
-	local_irq_enable();
-	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
-	vmi_time_init_clockevent();
-	apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
-	cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
-	return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
-	.name			= "vmi-timer",
-	.rating			= 450,
-	.read			= read_real_cycles,
-	.mask			= CLOCKSOURCE_MASK(64),
-	.mult			= 0, /* to be set */
-	.shift			= 22,
-	.flags			= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
-	cycle_t cycles_per_msec;
-
-	if (!vmi_timer_ops.get_cycle_frequency)
-		return 0;
-	/* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
-	cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
-	(void)do_div(cycles_per_msec, 1000);
-
-	/* Note that clocksource.{mult, shift} converts in the opposite direction
-	 * as clockevents.  */
-	clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
-						    clocksource_vmi.shift);
-
-	printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
-	return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 77d8c0f4817..22b06f7660f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.apic = apic;
 
-	apic->regs_page = alloc_page(GFP_KERNEL);
+	apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
 	if (apic->regs_page == NULL) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
 		goto nomem_free_apic;
 	}
 	apic->regs = page_address(apic->regs_page);
-	memset(apic->regs, 0, PAGE_SIZE);
 	apic->vcpu = vcpu;
 
 	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a09c625d52..6c2ecf0a806 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1991,13 +1991,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
+		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+		F(F16C);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
-		0 /* SKINIT */ | 0 /* WDT */;
+		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f5..b908a59eccf 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
 
 void *memmove(void *dest, const void *src, size_t n)
 {
-	int d0, d1, d2;
-
-	if (dest < src) {
-		memcpy(dest, src, n);
-	} else {
-		__asm__ __volatile__(
-			"std\n\t"
-			"rep\n\t"
-			"movsb\n\t"
-			"cld"
-			: "=&c" (d0), "=&S" (d1), "=&D" (d2)
-			:"0" (n),
-			 "1" (n-1+src),
-			 "2" (n-1+dest)
-			:"memory");
-	}
-	return dest;
+	int d0,d1,d2,d3,d4,d5;
+	char *ret = dest;
+
+	__asm__ __volatile__(
+		/* Handle more 16bytes in loop */
+		"cmp $0x10, %0\n\t"
+		"jb	1f\n\t"
+
+		/* Decide forward/backward copy mode */
+		"cmp %2, %1\n\t"
+		"jb	2f\n\t"
+
+		/*
+		 * movs instruction have many startup latency
+		 * so we handle small size by general register.
+		 */
+		"cmp  $680, %0\n\t"
+		"jb 3f\n\t"
+		/*
+		 * movs instruction is only good for aligned case.
+		 */
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 4f\n\t"
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16byts forward in each loop.
+		 */
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov 2*4(%1), %3\n\t"
+		"mov 3*4(%1), %4\n\t"
+		"mov  %3, 2*4(%2)\n\t"
+		"mov  %4, 3*4(%2)\n\t"
+		"lea  0x10(%1), %1\n\t"
+		"lea  0x10(%2), %2\n\t"
+		"jae 3b\n\t"
+		"add $0x10, %0\n\t"
+		"jmp 1f\n\t"
+
+		/*
+		 * Handle data forward by movs.
+		 */
+		".p2align 4\n\t"
+		"4:\n\t"
+		"mov -4(%1, %0), %3\n\t"
+		"lea -4(%2, %0), %4\n\t"
+		"shr $2, %0\n\t"
+		"rep movsl\n\t"
+		"mov %3, (%4)\n\t"
+		"jmp 11f\n\t"
+		/*
+		 * Handle data backward by movs.
+		 */
+		".p2align 4\n\t"
+		"6:\n\t"
+		"mov (%1), %3\n\t"
+		"mov %2, %4\n\t"
+		"lea -4(%1, %0), %1\n\t"
+		"lea -4(%2, %0), %2\n\t"
+		"shr $2, %0\n\t"
+		"std\n\t"
+		"rep movsl\n\t"
+		"mov %3,(%4)\n\t"
+		"cld\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Start to prepare for backward copy.
+		 */
+		".p2align 4\n\t"
+		"2:\n\t"
+		"cmp  $680, %0\n\t"
+		"jb 5f\n\t"
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 6b\n\t"
+
+		/*
+		 * Calculate copy position to tail.
+		 */
+		"5:\n\t"
+		"add %0, %1\n\t"
+		"add %0, %2\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16byts backward in each loop.
+		 */
+		"7:\n\t"
+		"sub $0x10, %0\n\t"
+
+		"mov -1*4(%1), %3\n\t"
+		"mov -2*4(%1), %4\n\t"
+		"mov  %3, -1*4(%2)\n\t"
+		"mov  %4, -2*4(%2)\n\t"
+		"mov -3*4(%1), %3\n\t"
+		"mov -4*4(%1), %4\n\t"
+		"mov  %3, -3*4(%2)\n\t"
+		"mov  %4, -4*4(%2)\n\t"
+		"lea  -0x10(%1), %1\n\t"
+		"lea  -0x10(%2), %2\n\t"
+		"jae 7b\n\t"
+		/*
+		 * Calculate copy position to head.
+		 */
+		"add $0x10, %0\n\t"
+		"sub %0, %1\n\t"
+		"sub %0, %2\n\t"
+
+		/*
+		 * Move data from 8 bytes to 15 bytes.
+		 */
+		".p2align 4\n\t"
+		"1:\n\t"
+		"cmp $8, %0\n\t"
+		"jb 8f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov -2*4(%1, %0), %5\n\t"
+		"mov -1*4(%1, %0), %1\n\t"
+
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov  %5, -2*4(%2, %0)\n\t"
+		"mov  %1, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 4 bytes to 7 bytes.
+		 */
+		".p2align 4\n\t"
+		"8:\n\t"
+		"cmp $4, %0\n\t"
+		"jb 9f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov -1*4(%1, %0), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 2 bytes to 3 bytes.
+		 */
+		".p2align 4\n\t"
+		"9:\n\t"
+		"cmp $2, %0\n\t"
+		"jb 10f\n\t"
+		"movw 0*2(%1), %%dx\n\t"
+		"movw -1*2(%1, %0), %%bx\n\t"
+		"movw %%dx, 0*2(%2)\n\t"
+		"movw %%bx, -1*2(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data for 1 byte.
+		 */
+		".p2align 4\n\t"
+		"10:\n\t"
+		"cmp $1, %0\n\t"
+		"jb 11f\n\t"
+		"movb (%1), %%cl\n\t"
+		"movb %%cl, (%2)\n\t"
+		".p2align 4\n\t"
+		"11:"
+		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
+		  "=r" (d3),"=r" (d4), "=r"(d5)
+		:"0" (n),
+		 "1" (src),
+		 "2" (dest)
+		:"memory");
+
+	return ret;
+
 }
 EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d..75ef61e35e3 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl   $6, %ecx
-	jz .Lhandle_tail
+	cmp  $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependece could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp  %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20,	%rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi),		%r11
-	movq 1*8(%rsi),		%r8
-	movq %r11,		0*8(%rdi)
-	movq %r8,		1*8(%rdi)
-
-	movq 2*8(%rsi),		%r9
-	movq 3*8(%rsi),		%r10
-	movq %r9,		2*8(%rdi)
-	movq %r10,		3*8(%rdi)
-
-	movq 4*8(%rsi),		%r11
-	movq 5*8(%rsi),		%r8
-	movq %r11,		4*8(%rdi)
-	movq %r8,		5*8(%rdi)
-
-	movq 6*8(%rsi),		%r9
-	movq 7*8(%rsi),		%r10
-	movq %r9,		6*8(%rdi)
-	movq %r10,		7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz  .Lloop_64
+	movq 0*8(%rsi),	%r8
+	movq 1*8(%rsi),	%r9
+	movq 2*8(%rsi),	%r10
+	movq 3*8(%rsi),	%r11
+	leaq 4*8(%rsi),	%rsi
+
+	movq %r8,	0*8(%rdi)
+	movq %r9,	1*8(%rdi)
+	movq %r10,	2*8(%rdi)
+	movq %r11,	3*8(%rdi)
+	leaq 4*8(%rdi),	%rdi
+	jae  .Lcopy_forward_loop
+	addq $0x20,	%rdx
+	jmp  .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx,	%rsi
+	addq %rdx,	%rdi
+	subq $0x20,	%rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20,	%rdx
+	movq -1*8(%rsi),	%r8
+	movq -2*8(%rsi),	%r9
+	movq -3*8(%rsi),	%r10
+	movq -4*8(%rsi),	%r11
+	leaq -4*8(%rsi),	%rsi
+	movq %r8,		-1*8(%rdi)
+	movq %r9,		-2*8(%rdi)
+	movq %r10,		-3*8(%rdi)
+	movq %r11,		-4*8(%rdi)
+	leaq -4*8(%rdi),	%rdi
+	jae  .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20,	%rdx
+	subq %rdx,	%rsi
+	subq %rdx,	%rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl  $63, %ecx
-	shrl   $3, %ecx
-	jz   .Lhandle_7
+	cmpq $16,	%rdx
+	jb   .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi),	%r9
+	movq -2*8(%rsi, %rdx),	%r10
+	movq -1*8(%rsi, %rdx),	%r11
+	movq %r8,	0*8(%rdi)
+	movq %r9,	1*8(%rdi)
+	movq %r10,	-2*8(%rdi, %rdx)
+	movq %r11,	-1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi),		%r8
-	movq %r8,		(%rdi)
-	leaq 8(%rdi),		%rdi
-	leaq 8(%rsi),		%rsi
-	jnz  .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8,	%rdx
+	jb   .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi),	%r8
+	movq -1*8(%rsi, %rdx),	%r9
+	movq %r8,	0*8(%rdi)
+	movq %r9,	-1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4,	%rdx
+	jb   .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909bf12..6d0f0ec41b3 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
 #undef memmove
 void *memmove(void *dest, const void *src, size_t count)
 {
-	if (dest < src) {
-		return memcpy(dest, src, count);
-	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
-	}
-	return dest;
+	unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
+	char *ret;
+
+	__asm__ __volatile__(
+		/* Handle more 32bytes in loop */
+		"mov %2, %3\n\t"
+		"cmp $0x20, %0\n\t"
+		"jb	1f\n\t"
+
+		/* Decide forward/backward copy mode */
+		"cmp %2, %1\n\t"
+		"jb	2f\n\t"
+
+		/*
+		 * movsq instruction have many startup latency
+		 * so we handle small size by general register.
+		 */
+		"cmp  $680, %0\n\t"
+		"jb 3f\n\t"
+		/*
+		 * movsq instruction is only good for aligned case.
+		 */
+		"cmpb %%dil, %%sil\n\t"
+		"je 4f\n\t"
+		"3:\n\t"
+		"sub $0x20, %0\n\t"
+		/*
+		 * We gobble 32byts forward in each loop.
+		 */
+		"5:\n\t"
+		"sub $0x20, %0\n\t"
+		"movq 0*8(%1), %4\n\t"
+		"movq 1*8(%1), %5\n\t"
+		"movq 2*8(%1), %6\n\t"
+		"movq 3*8(%1), %7\n\t"
+		"leaq 4*8(%1), %1\n\t"
+
+		"movq %4, 0*8(%2)\n\t"
+		"movq %5, 1*8(%2)\n\t"
+		"movq %6, 2*8(%2)\n\t"
+		"movq %7, 3*8(%2)\n\t"
+		"leaq 4*8(%2), %2\n\t"
+		"jae 5b\n\t"
+		"addq $0x20, %0\n\t"
+		"jmp 1f\n\t"
+		/*
+		 * Handle data forward by movsq.
+		 */
+		".p2align 4\n\t"
+		"4:\n\t"
+		"movq %0, %8\n\t"
+		"movq -8(%1, %0), %4\n\t"
+		"lea -8(%2, %0), %5\n\t"
+		"shrq $3, %8\n\t"
+		"rep movsq\n\t"
+		"movq %4, (%5)\n\t"
+		"jmp 13f\n\t"
+		/*
+		 * Handle data backward by movsq.
+		 */
+		".p2align 4\n\t"
+		"7:\n\t"
+		"movq %0, %8\n\t"
+		"movq (%1), %4\n\t"
+		"movq %2, %5\n\t"
+		"leaq -8(%1, %0), %1\n\t"
+		"leaq -8(%2, %0), %2\n\t"
+		"shrq $3, %8\n\t"
+		"std\n\t"
+		"rep movsq\n\t"
+		"cld\n\t"
+		"movq %4, (%5)\n\t"
+		"jmp 13f\n\t"
+
+		/*
+		 * Start to prepare for backward copy.
+		 */
+		".p2align 4\n\t"
+		"2:\n\t"
+		"cmp $680, %0\n\t"
+		"jb 6f \n\t"
+		"cmp %%dil, %%sil\n\t"
+		"je 7b \n\t"
+		"6:\n\t"
+		/*
+		 * Calculate copy position to tail.
+		 */
+		"addq %0, %1\n\t"
+		"addq %0, %2\n\t"
+		"subq $0x20, %0\n\t"
+		/*
+		 * We gobble 32byts backward in each loop.
+		 */
+		"8:\n\t"
+		"subq $0x20, %0\n\t"
+		"movq -1*8(%1), %4\n\t"
+		"movq -2*8(%1), %5\n\t"
+		"movq -3*8(%1), %6\n\t"
+		"movq -4*8(%1), %7\n\t"
+		"leaq -4*8(%1), %1\n\t"
+
+		"movq %4, -1*8(%2)\n\t"
+		"movq %5, -2*8(%2)\n\t"
+		"movq %6, -3*8(%2)\n\t"
+		"movq %7, -4*8(%2)\n\t"
+		"leaq -4*8(%2), %2\n\t"
+		"jae 8b\n\t"
+		/*
+		 * Calculate copy position to head.
+		 */
+		"addq $0x20, %0\n\t"
+		"subq %0, %1\n\t"
+		"subq %0, %2\n\t"
+		"1:\n\t"
+		"cmpq $16, %0\n\t"
+		"jb 9f\n\t"
+		/*
+		 * Move data from 16 bytes to 31 bytes.
+		 */
+		"movq 0*8(%1), %4\n\t"
+		"movq 1*8(%1), %5\n\t"
+		"movq -2*8(%1, %0), %6\n\t"
+		"movq -1*8(%1, %0), %7\n\t"
+		"movq %4, 0*8(%2)\n\t"
+		"movq %5, 1*8(%2)\n\t"
+		"movq %6, -2*8(%2, %0)\n\t"
+		"movq %7, -1*8(%2, %0)\n\t"
+		"jmp 13f\n\t"
+		".p2align 4\n\t"
+		"9:\n\t"
+		"cmpq $8, %0\n\t"
+		"jb 10f\n\t"
+		/*
+		 * Move data from 8 bytes to 15 bytes.
+		 */
+		"movq 0*8(%1), %4\n\t"
+		"movq -1*8(%1, %0), %5\n\t"
+		"movq %4, 0*8(%2)\n\t"
+		"movq %5, -1*8(%2, %0)\n\t"
+		"jmp 13f\n\t"
+		"10:\n\t"
+		"cmpq $4, %0\n\t"
+		"jb 11f\n\t"
+		/*
+		 * Move data from 4 bytes to 7 bytes.
+		 */
+		"movl (%1), %4d\n\t"
+		"movl -4(%1, %0), %5d\n\t"
+		"movl %4d, (%2)\n\t"
+		"movl %5d, -4(%2, %0)\n\t"
+		"jmp 13f\n\t"
+		"11:\n\t"
+		"cmp $2, %0\n\t"
+		"jb 12f\n\t"
+		/*
+		 * Move data from 2 bytes to 3 bytes.
+		 */
+		"movw (%1), %4w\n\t"
+		"movw -2(%1, %0), %5w\n\t"
+		"movw %4w, (%2)\n\t"
+		"movw %5w, -2(%2, %0)\n\t"
+		"jmp 13f\n\t"
+		"12:\n\t"
+		"cmp $1, %0\n\t"
+		"jb 13f\n\t"
+		/*
+		 * Move data for 1 byte.
+		 */
+		"movb (%1), %4b\n\t"
+		"movb %4b, (%2)\n\t"
+		"13:\n\t"
+		: "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
+		  "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
+		:"0" (count),
+		 "1" (src),
+		 "2" (dest)
+		:"memory");
+
+		return ret;
+
 }
 EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a20..79b0b372d2d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
 
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
+			spinlock_t *pgt_lock;
+			pmd_t *ret;
+
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
 				break;
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
@@ -251,6 +260,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
 		return -1;
 
+	WARN_ON_ONCE(in_nmi());
+
 	/*
 	 * Synchronize this task's top level page-table
 	 * with the 'reference' page table.
@@ -326,29 +337,7 @@ out:
 
 void vmalloc_sync_all(void)
 {
-	unsigned long address;
-
-	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-	     address += PGDIR_SIZE) {
-
-		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
-		struct page *page;
-
-		if (pgd_none(*pgd_ref))
-			continue;
-
-		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			pgd_t *pgd;
-			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
-			else
-				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
+	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
 /*
@@ -369,6 +358,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
 		return -1;
 
+	WARN_ON_ONCE(in_nmi());
+
 	/*
 	 * Copy kernel mappings over when needed. This can also
 	 * happen within a race in page table update. In the later
@@ -894,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	if (pmd_large(*pmd))
 		return spurious_fault_check(error_code, (pte_t *) pmd);
 
+	/*
+	 * Note: don't use pte_present() here, since it returns true
+	 * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+	 * _PAGE_GLOBAL bit, which for kernel pages give false positives
+	 * when CONFIG_DEBUG_PAGEALLOC is used.
+	 */
 	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte))
+	if (!(pte_flags(*pte) & _PAGE_PRESENT))
 		return 0;
 
 	ret = spurious_fault_check(error_code, pte);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d..558f2d33207 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -67,7 +67,7 @@ static __init void *alloc_low_page(void)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = __va(pfn * PAGE_SIZE);
-	memset(adr, 0, PAGE_SIZE);
+	clear_page(adr);
 	return adr;
 }
 
@@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE]
 
 static inline void save_pg_dir(void)
 {
-	memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+	copy_page(swsusp_pg_dir, swapper_pg_dir);
 }
 #else /* !CONFIG_ACPI_SLEEP */
 static inline void save_pg_dir(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a2..c55f900fbf8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str)
 __setup("noexec32=", nonx32_setup);
 
 /*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+	unsigned long address;
+
+	for (address = start; address <= end; address += PGDIR_SIZE) {
+		const pgd_t *pgd_ref = pgd_offset_k(address);
+		unsigned long flags;
+		struct page *page;
+
+		if (pgd_none(*pgd_ref))
+			continue;
+
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
+			if (pgd_none(*pgd))
+				set_pgd(pgd, *pgd_ref);
+			else
+				BUG_ON(pgd_page_vaddr(*pgd)
+				       != pgd_page_vaddr(*pgd_ref));
+
+			spin_unlock(pgt_lock);
+		}
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}
+}
+
+/*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
  */
@@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
-	memset(adr, 0, PAGE_SIZE);
+	clear_page(adr);
 	*phys  = pfn * PAGE_SIZE;
 	return adr;
 }
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
 			     unsigned long page_size_mask)
 {
-
+	bool pgd_changed = false;
 	unsigned long next, last_map_addr = end;
+	unsigned long addr;
 
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
+	addr = start;
 
 	for (; start < end; start = next) {
 		pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
 		spin_lock(&init_mm.page_table_lock);
 		pgd_populate(&init_mm, pgd, __va(pud_phys));
 		spin_unlock(&init_mm.page_table_lock);
+		pgd_changed = true;
 	}
+
+	if (pgd_changed)
+		sync_global_pgds(addr, end);
+
 	__flush_tlb_all();
 
 	return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
 		}
 
 	}
+	sync_global_pgds((unsigned long)start_page, end);
 	return 0;
 }
 
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 970ed579d4e..52d54bfc1eb 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,7 +22,7 @@
 #include <asm/numa.h>
 #include <asm/mpspec.h>
 #include <asm/apic.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 
 static struct bootnode __initdata nodes[8];
 static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +54,8 @@ static __init int find_northbridge(void)
 static __init void early_get_boot_cpu_id(void)
 {
 	/*
-	 * need to get boot_cpu_id so can use that to create apicid_to_node
-	 * in k8_scan_nodes()
+	 * need to get the APIC ID of the BSP so can use that to
+	 * create apicid_to_node in k8_scan_nodes()
 	 */
 #ifdef CONFIG_X86_MPPARSE
 	/*
@@ -212,7 +212,7 @@ int __init k8_scan_nodes(void)
 	bits = boot_cpu_data.x86_coreid_bits;
 	cores = (1<<bits);
 	apicid_base = 0;
-	/* need to get boot_cpu_id early for system with apicid lifting */
+	/* get the APIC ID of the BSP early for systems with apicid lifting */
 	early_get_boot_cpu_id();
 	if (boot_cpu_physical_apicid > 0) {
 		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e..d87dd6d042d 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
 	if (!pte)
 		return false;
 
+	WARN_ON_ONCE(in_nmi());
+
 	if (error_code & 2)
 		kmemcheck_access(regs, address, KMEMCHECK_WRITE);
 	else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6..324aa3f0723 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
 		b == 0xf0 || b == 0xf2 || b == 0xf3
 		/* Group 2 */
 		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-		|| b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+		|| b == 0x64 || b == 0x65
 		/* Group 3 */
 		|| b == 0x66
 		/* Group 4 */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96..4962f1aeda6 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,7 +18,7 @@
 #include <asm/dma.h>
 #include <asm/numa.h>
 #include <asm/acpi.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590..8be8c7d7bc8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 				KERNEL_PGD_PTRS);
-		paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
-					 __pa(swapper_pg_dir) >> PAGE_SHIFT,
-					 KERNEL_PGD_BOUNDARY,
-					 KERNEL_PGD_PTRS);
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD)
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
 		pgd_list_add(pgd);
+	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(pgd);
+	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab666..49358481c73 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
    want false sharing in the per cpu data segment. */
 static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
 
+static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 	union smp_flush_state *f;
 
 	/* Caller has disabled preemption */
-	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	sender = this_cpu_read(tlb_vector_offset);
 	f = &flush_state[sender];
 
 	/*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	flush_tlb_others_ipi(cpumask, mm, va);
 }
 
+static void __cpuinit calculate_tlb_offset(void)
+{
+	int cpu, node, nr_node_vecs;
+	/*
+	 * we are changing tlb_vector_offset for each CPU in runtime, but this
+	 * will not cause inconsistency, as the write is atomic under X86. we
+	 * might see more lock contentions in a short time, but after all CPU's
+	 * tlb_vector_offset are changed, everything should go normal
+	 *
+	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
+	 * waste some vectors.
+	 **/
+	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
+		nr_node_vecs = 1;
+	else
+		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
+
+	for_each_online_node(node) {
+		int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+			nr_node_vecs;
+		int cpu_offset = 0;
+		for_each_cpu(cpu, cpumask_of_node(node)) {
+			per_cpu(tlb_vector_offset, cpu) = node_offset +
+				cpu_offset;
+			cpu_offset++;
+			cpu_offset = cpu_offset % nr_node_vecs;
+		}
+	}
+}
+
+static int tlb_cpuhp_notify(struct notifier_block *n,
+		unsigned long action, void *hcpu)
+{
+	switch (action & 0xf) {
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		calculate_tlb_offset();
+	}
+	return NOTIFY_OK;
+}
+
 static int __cpuinit init_smp_flush(void)
 {
 	int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
 	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
 		raw_spin_lock_init(&flush_state[i].tlbstate_lock);
 
+	calculate_tlb_offset();
+	hotcpu_notifier(tlb_cpuhp_notify, 0);
 	return 0;
 }
 core_initcall(init_smp_flush);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 3855096c59b..2d49d4e19a3 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -14,6 +14,7 @@
 #include <asm/ptrace.h>
 #include <asm/uaccess.h>
 #include <asm/stacktrace.h>
+#include <linux/compat.h>
 
 static void backtrace_warning_symbol(void *data, char *msg,
 				     unsigned long symbol)
@@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = {
 	.walk_stack	= print_context_stack,
 };
 
-struct frame_head {
-	struct frame_head *bp;
-	unsigned long ret;
-} __attribute__((packed));
-
-static struct frame_head *dump_user_backtrace(struct frame_head *head)
+#ifdef CONFIG_COMPAT
+static struct stack_frame_ia32 *
+dump_user_backtrace_32(struct stack_frame_ia32 *head)
 {
-	struct frame_head bufhead[2];
+	struct stack_frame_ia32 bufhead[2];
+	struct stack_frame_ia32 *fp;
 
 	/* Also check accessibility of one struct frame_head beyond */
 	if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
@@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head)
 	if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
 		return NULL;
 
-	oprofile_add_trace(bufhead[0].ret);
+	fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
+
+	oprofile_add_trace(bufhead[0].return_address);
+
+	/* frame pointers should strictly progress back up the stack
+	* (towards higher addresses) */
+	if (head >= fp)
+		return NULL;
+
+	return fp;
+}
+
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+	struct stack_frame_ia32 *head;
+
+	/* User process is 32-bit */
+	if (!current || !test_thread_flag(TIF_IA32))
+		return 0;
+
+	head = (struct stack_frame_ia32 *) regs->bp;
+	while (depth-- && head)
+		head = dump_user_backtrace_32(head);
+
+	return 1;
+}
+
+#else
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+	return 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
+{
+	struct stack_frame bufhead[2];
+
+	/* Also check accessibility of one struct stack_frame beyond */
+	if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
+		return NULL;
+	if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+		return NULL;
+
+	oprofile_add_trace(bufhead[0].return_address);
 
 	/* frame pointers should strictly progress back up the stack
 	 * (towards higher addresses) */
-	if (head >= bufhead[0].bp)
+	if (head >= bufhead[0].next_frame)
 		return NULL;
 
-	return bufhead[0].bp;
+	return bufhead[0].next_frame;
 }
 
 void
 x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
-	struct frame_head *head = (struct frame_head *)frame_pointer(regs);
+	struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
 
 	if (!user_mode_vm(regs)) {
 		unsigned long stack = kernel_stack_pointer(regs);
@@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 		return;
 	}
 
+	if (x86_backtrace_32(regs, depth))
+		return;
+
 	while (depth-- && head)
 		head = dump_user_backtrace(head);
 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f1575c9a257..bd1489c3ce0 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -695,9 +695,6 @@ static int __init ppro_init(char **cpu_type)
 	return 1;
 }
 
-/* in order to get sysfs right */
-static int using_nmi;
-
 int __init op_nmi_init(struct oprofile_operations *ops)
 {
 	__u8 vendor = boot_cpu_data.x86_vendor;
@@ -705,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 	char *cpu_type = NULL;
 	int ret = 0;
 
-	using_nmi = 0;
-
 	if (!cpu_has_apic)
 		return -ENODEV;
 
@@ -790,13 +785,11 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 	if (ret)
 		return ret;
 
-	using_nmi = 1;
 	printk(KERN_INFO "oprofile: using NMI interrupt.\n");
 	return 0;
 }
 
 void op_nmi_exit(void)
 {
-	if (using_nmi)
-		exit_sysfs();
+	exit_sysfs();
 }
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b34815408f5..13700ec8e2e 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = {
 
 int __init pci_olpc_init(void)
 {
-	printk(KERN_INFO "PCI: Using configuration type OLPC\n");
+	printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
 	raw_pci_ops = &pci_olpc_conf;
 	is_lx = is_geode_lx();
 	return 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 42086ac406a..b2363fcbcd0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1969,7 +1969,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.alloc_pte = xen_alloc_pte_init,
 	.release_pte = xen_release_pte_init,
 	.alloc_pmd = xen_alloc_pmd_init,
-	.alloc_pmd_clone = paravirt_nop,
 	.release_pmd = xen_release_pmd_init,
 
 #ifdef CONFIG_X86_64
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 6b115f6c431..6afceb3d403 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -30,18 +30,13 @@
 #include <linux/slab.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
+#include <asm/mwait.h>
 
 #define ACPI_PROCESSOR_AGGREGATOR_CLASS	"acpi_pad"
 #define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
 #define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
 static DEFINE_MUTEX(isolated_cpus_lock);
 
-#define MWAIT_SUBSTATE_MASK	(0xf)
-#define MWAIT_CSTATE_MASK	(0xf)
-#define MWAIT_SUBSTATE_SIZE	(4)
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK	(0x2)
 static unsigned long power_saving_mwait_eax;
 
 static unsigned char tsc_detected_unstable;
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 9fc630ce1dd..f6f37a05a0c 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev,		\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
 }
 
-#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
+#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
+    defined(topology_book_cpumask)
 static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
 {
 	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
 define_one_ro_named(core_siblings, show_core_cpumask);
 define_one_ro_named(core_siblings_list, show_core_cpumask_list);
 
+#ifdef CONFIG_SCHED_BOOK
+define_id_show_func(book_id);
+define_one_ro(book_id);
+define_siblings_show_func(book_cpumask);
+define_one_ro_named(book_siblings, show_book_cpumask);
+define_one_ro_named(book_siblings_list, show_book_cpumask_list);
+#endif
+
 static struct attribute *default_attrs[] = {
 	&attr_physical_package_id.attr,
 	&attr_core_id.attr,
@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
 	&attr_thread_siblings_list.attr,
 	&attr_core_siblings.attr,
 	&attr_core_siblings_list.attr,
+#ifdef CONFIG_SCHED_BOOK
+	&attr_book_id.attr,
+	&attr_book_siblings.attr,
+	&attr_book_siblings_list.attr,
+#endif
 	NULL
 };
 
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da6..4b9359a6f6c 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
 
 	  If unsure, say N.
 
+config BLK_DEV_RBD
+	tristate "Rados block device (RBD)"
+	depends on INET && EXPERIMENTAL && BLOCK
+	select CEPH_LIB
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	default n
+	help
+	  Say Y here if you want include the Rados block device, which stripes
+	  a block device over objects stored in the Ceph distributed object
+	  store.
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c3..d7f463d6312 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD)	+= hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
+obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
 
 swim_mod-objs	:= swim.o swim_asm.o
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 00000000000..6ec9d53806c
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
+/*
+   rbd.c -- Export ceph rados objects as a Linux block device
+
+
+   based on drivers/block/osdblk.c:
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing rbd image.
+
+      Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
+
+      $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+      The snapshot name can be "-" or omitted to map the image read/write.
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate rados objects in the rados rbd pool
+
+      $ cat /sys/class/rbd/list
+      #id     major   client_name     pool    name    snap    KB
+      0       254     client4143      rbd     foo     -      1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - rados client id
+      - rados pool name
+      - rados block device name
+      - mapped snapshot ("-" if none)
+      - device size in KB
+
+
+   3) Create a snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+   4) Listing a snapshot.
+
+      $ cat /sys/class/rbd/snaps_list
+      #id     snap    KB
+      0       -       1024000 (*)
+      0       foo     1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - snapshot name, '-' means none (active read/write version)
+      - size of device at time of snapshot
+      - the (*) indicates this is the active version
+
+   5) Rollback to snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+   6) Mapping an image using snapshot.
+
+      A snapshot mapping is read-only. This is being done by passing
+      snap=<snapname> to the options when adding a device.
+
+      $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+   7) Remove an active blkdev<->rbd image mapping.
+
+      In this example, we remove the mapping with blkdev unique id 1.
+
+      $ echo 1 > /sys/class/rbd/remove
+
+
+   NOTE:  The actual creation and deletion of rados objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN	(96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN	64
+#define RBD_MAX_SNAP_NAME_LEN	32
+#define RBD_MAX_OPT_LEN		1024
+
+#define RBD_SNAP_HEAD_NAME	"-"
+
+#define DEV_NAME_LEN		32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+	u64 image_size;
+	char block_name[32];
+	__u8 obj_order;
+	__u8 crypt_type;
+	__u8 comp_type;
+	struct rw_semaphore snap_rwsem;
+	struct ceph_snap_context *snapc;
+	size_t snap_names_len;
+	u64 snap_seq;
+	u32 total_snaps;
+
+	char *snap_names;
+	u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client.  multiple devices may share a client.
+ */
+struct rbd_client {
+	struct ceph_client	*client;
+	struct kref		kref;
+	struct list_head	node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+	struct request		*rq;		/* blk layer request */
+	struct bio		*bio;		/* cloned bio */
+	struct page		**pages;	/* list of used pages */
+	u64			len;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+	int			id;		/* blkdev unique id */
+
+	int			major;		/* blkdev assigned major */
+	struct gendisk		*disk;		/* blkdev's gendisk and rq */
+	struct request_queue	*q;
+
+	struct ceph_client	*client;
+	struct rbd_client	*rbd_client;
+
+	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+	spinlock_t		lock;		/* queue lock */
+
+	struct rbd_image_header	header;
+	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+	int			obj_len;
+	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+	char			pool_name[RBD_MAX_POOL_NAME_LEN];
+	int			poolid;
+
+	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
+	u32 cur_snap;	/* index+1 of current snapshot within snap context
+			   0 - for the head */
+	int read_only;
+
+	struct list_head	node;
+};
+
+static spinlock_t node_lock;      /* protects client get/put */
+
+static struct class *class_rbd;	  /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list);    /* devices */
+static LIST_HEAD(rbd_client_list);      /* clients */
+
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct rbd_device *rbd_dev = disk->private_data;
+
+	set_device_ro(bdev, rbd_dev->read_only);
+
+	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+		return -EROFS;
+
+	return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+	.owner			= THIS_MODULE,
+	.open			= rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+	struct rbd_client *rbdc;
+	int ret = -ENOMEM;
+
+	dout("rbd_client_create\n");
+	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+	if (!rbdc)
+		goto out_opt;
+
+	kref_init(&rbdc->kref);
+	INIT_LIST_HEAD(&rbdc->node);
+
+	rbdc->client = ceph_create_client(opt, rbdc);
+	if (IS_ERR(rbdc->client))
+		goto out_rbdc;
+	opt = NULL; /* Now rbdc->client is responsible for opt */
+
+	ret = ceph_open_session(rbdc->client);
+	if (ret < 0)
+		goto out_err;
+
+	spin_lock(&node_lock);
+	list_add_tail(&rbdc->node, &rbd_client_list);
+	spin_unlock(&node_lock);
+
+	dout("rbd_client_create created %p\n", rbdc);
+	return rbdc;
+
+out_err:
+	ceph_destroy_client(rbdc->client);
+out_rbdc:
+	kfree(rbdc);
+out_opt:
+	if (opt)
+		ceph_destroy_options(opt);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+	struct rbd_client *client_node;
+
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		return NULL;
+
+	list_for_each_entry(client_node, &rbd_client_list, node)
+		if (ceph_compare_options(opt, client_node->client) == 0)
+			return client_node;
+	return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+			  char *options)
+{
+	struct rbd_client *rbdc;
+	struct ceph_options *opt;
+	int ret;
+
+	ret = ceph_parse_options(&opt, options, mon_addr,
+				 mon_addr + strlen(mon_addr), NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	spin_lock(&node_lock);
+	rbdc = __rbd_client_find(opt);
+	if (rbdc) {
+		ceph_destroy_options(opt);
+
+		/* using an existing client */
+		kref_get(&rbdc->kref);
+		rbd_dev->rbd_client = rbdc;
+		rbd_dev->client = rbdc->client;
+		spin_unlock(&node_lock);
+		return 0;
+	}
+	spin_unlock(&node_lock);
+
+	rbdc = rbd_client_create(opt);
+	if (IS_ERR(rbdc))
+		return PTR_ERR(rbdc);
+
+	rbd_dev->rbd_client = rbdc;
+	rbd_dev->client = rbdc->client;
+	return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+	dout("rbd_release_client %p\n", rbdc);
+	spin_lock(&node_lock);
+	list_del(&rbdc->node);
+	spin_unlock(&node_lock);
+
+	ceph_destroy_client(rbdc->client);
+	kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+	rbd_dev->rbd_client = NULL;
+	rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+				 struct rbd_image_header_ondisk *ondisk,
+				 int allocated_snaps,
+				 gfp_t gfp_flags)
+{
+	int i;
+	u32 snap_count = le32_to_cpu(ondisk->snap_count);
+	int ret = -ENOMEM;
+
+	init_rwsem(&header->snap_rwsem);
+
+	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+				snap_count *
+				 sizeof(struct rbd_image_snap_ondisk),
+				gfp_flags);
+	if (!header->snapc)
+		return -ENOMEM;
+	if (snap_count) {
+		header->snap_names = kmalloc(header->snap_names_len,
+					     GFP_KERNEL);
+		if (!header->snap_names)
+			goto err_snapc;
+		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+					     GFP_KERNEL);
+		if (!header->snap_sizes)
+			goto err_names;
+	} else {
+		header->snap_names = NULL;
+		header->snap_sizes = NULL;
+	}
+	memcpy(header->block_name, ondisk->block_name,
+	       sizeof(ondisk->block_name));
+
+	header->image_size = le64_to_cpu(ondisk->image_size);
+	header->obj_order = ondisk->options.order;
+	header->crypt_type = ondisk->options.crypt_type;
+	header->comp_type = ondisk->options.comp_type;
+
+	atomic_set(&header->snapc->nref, 1);
+	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+	header->snapc->num_snaps = snap_count;
+	header->total_snaps = snap_count;
+
+	if (snap_count &&
+	    allocated_snaps == snap_count) {
+		for (i = 0; i < snap_count; i++) {
+			header->snapc->snaps[i] =
+				le64_to_cpu(ondisk->snaps[i].id);
+			header->snap_sizes[i] =
+				le64_to_cpu(ondisk->snaps[i].image_size);
+		}
+
+		/* copy snapshot names */
+		memcpy(header->snap_names, &ondisk->snaps[i],
+			header->snap_names_len);
+	}
+
+	return 0;
+
+err_names:
+	kfree(header->snap_names);
+err_snapc:
+	kfree(header->snapc);
+	return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+	return header->total_snaps - snap_num;
+}
+
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+	struct rbd_image_header *header = &rbd_dev->header;
+
+	if (!rbd_dev->cur_snap)
+		return 0;
+
+	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+			u64 *seq, u64 *size)
+{
+	int i;
+	char *p = header->snap_names;
+
+	for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+		if (strcmp(snap_name, p) == 0)
+			break;
+	}
+	if (i == header->total_snaps)
+		return -ENOENT;
+	if (seq)
+		*seq = header->snapc->snaps[i];
+
+	if (size)
+		*size = header->snap_sizes[i];
+
+	return i;
+}
+
+static int rbd_header_set_snap(struct rbd_device *dev,
+			       const char *snap_name,
+			       u64 *size)
+{
+	struct rbd_image_header *header = &dev->header;
+	struct ceph_snap_context *snapc = header->snapc;
+	int ret = -ENOENT;
+
+	down_write(&header->snap_rwsem);
+
+	if (!snap_name ||
+	    !*snap_name ||
+	    strcmp(snap_name, "-") == 0 ||
+	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+		if (header->total_snaps)
+			snapc->seq = header->snap_seq;
+		else
+			snapc->seq = 0;
+		dev->cur_snap = 0;
+		dev->read_only = 0;
+		if (size)
+			*size = header->image_size;
+	} else {
+		ret = snap_by_name(header, snap_name, &snapc->seq, size);
+		if (ret < 0)
+			goto done;
+
+		dev->cur_snap = header->total_snaps - ret;
+		dev->read_only = 1;
+	}
+
+	ret = 0;
+done:
+	up_write(&header->snap_rwsem);
+	return ret;
+}
+
+static void rbd_header_free(struct rbd_image_header *header)
+{
+	kfree(header->snapc);
+	kfree(header->snap_names);
+	kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+			   const char *block_name,
+			   u64 ofs, u64 len,
+			   char *seg_name, u64 *segofs)
+{
+	u64 seg = ofs >> header->obj_order;
+
+	if (seg_name)
+		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+			 "%s.%012llx", block_name, seg);
+
+	ofs = ofs & ((1 << header->obj_order) - 1);
+	len = min_t(u64, len, (1 << header->obj_order) - ofs);
+
+	if (segofs)
+		*segofs = ofs;
+
+	return len;
+}
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+	struct bio *tmp;
+
+	while (chain) {
+		tmp = chain;
+		chain = chain->bi_next;
+		bio_put(tmp);
+	}
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+	struct bio_vec *bv;
+	unsigned long flags;
+	void *buf;
+	int i;
+	int pos = 0;
+
+	while (chain) {
+		bio_for_each_segment(bv, chain, i) {
+			if (pos + bv->bv_len > start_ofs) {
+				int remainder = max(start_ofs - pos, 0);
+				buf = bvec_kmap_irq(bv, &flags);
+				memset(buf + remainder, 0,
+				       bv->bv_len - remainder);
+				bvec_kunmap_irq(buf, &flags);
+			}
+			pos += bv->bv_len;
+		}
+
+		chain = chain->bi_next;
+	}
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+				   struct bio_pair **bp,
+				   int len, gfp_t gfpmask)
+{
+	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+	int total = 0;
+
+	if (*bp) {
+		bio_pair_release(*bp);
+		*bp = NULL;
+	}
+
+	while (old_chain && (total < len)) {
+		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+		if (!tmp)
+			goto err_out;
+
+		if (total + old_chain->bi_size > len) {
+			struct bio_pair *bp;
+
+			/*
+			 * this split can only happen with a single paged bio,
+			 * split_bio will BUG_ON if this is not the case
+			 */
+			dout("bio_chain_clone split! total=%d remaining=%d"
+			     "bi_size=%d\n",
+			     (int)total, (int)len-total,
+			     (int)old_chain->bi_size);
+
+			/* split the bio. We'll release it either in the next
+			   call, or it will have to be released outside */
+			bp = bio_split(old_chain, (len - total) / 512ULL);
+			if (!bp)
+				goto err_out;
+
+			__bio_clone(tmp, &bp->bio1);
+
+			*next = &bp->bio2;
+		} else {
+			__bio_clone(tmp, old_chain);
+			*next = old_chain->bi_next;
+		}
+
+		tmp->bi_bdev = NULL;
+		gfpmask &= ~__GFP_WAIT;
+		tmp->bi_next = NULL;
+
+		if (!new_chain) {
+			new_chain = tail = tmp;
+		} else {
+			tail->bi_next = tmp;
+			tail = tmp;
+		}
+		old_chain = old_chain->bi_next;
+
+		total += tmp->bi_size;
+	}
+
+	BUG_ON(total < len);
+
+	if (tail)
+		tail->bi_next = NULL;
+
+	*old = old_chain;
+
+	return new_chain;
+
+err_out:
+	dout("bio_chain_clone with err\n");
+	bio_chain_put(new_chain);
+	return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+			    int num_ops,
+			    int opcode,
+			    u32 payload_len)
+{
+	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+		       GFP_NOIO);
+	if (!*ops)
+		return -ENOMEM;
+	(*ops)[0].op = opcode;
+	/*
+	 * op extent offset and length will be set later on
+	 * in calc_raw_layout()
+	 */
+	(*ops)[0].payload_len = payload_len;
+	return 0;
+}
+
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+	kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+			  struct rbd_device *dev,
+			  struct ceph_snap_context *snapc,
+			  u64 snapid,
+			  const char *obj, u64 ofs, u64 len,
+			  struct bio *bio,
+			  struct page **pages,
+			  int num_pages,
+			  int flags,
+			  struct ceph_osd_req_op *ops,
+			  int num_reply,
+			  void (*rbd_cb)(struct ceph_osd_request *req,
+					 struct ceph_msg *msg))
+{
+	struct ceph_osd_request *req;
+	struct ceph_file_layout *layout;
+	int ret;
+	u64 bno;
+	struct timespec mtime = CURRENT_TIME;
+	struct rbd_request *req_data;
+	struct ceph_osd_request_head *reqhead;
+	struct rbd_image_header *header = &dev->header;
+
+	ret = -ENOMEM;
+	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+	if (!req_data)
+		goto done;
+
+	dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+	down_read(&header->snap_rwsem);
+
+	req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+				      snapc,
+				      ops,
+				      false,
+				      GFP_NOIO, pages, bio);
+	if (IS_ERR(req)) {
+		up_read(&header->snap_rwsem);
+		ret = PTR_ERR(req);
+		goto done_pages;
+	}
+
+	req->r_callback = rbd_cb;
+
+	req_data->rq = rq;
+	req_data->bio = bio;
+	req_data->pages = pages;
+	req_data->len = len;
+
+	req->r_priv = req_data;
+
+	reqhead = req->r_request->front.iov_base;
+	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+	strncpy(req->r_oid, obj, sizeof(req->r_oid));
+	req->r_oid_len = strlen(req->r_oid);
+
+	layout = &req->r_file_layout;
+	memset(layout, 0, sizeof(*layout));
+	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	layout->fl_stripe_count = cpu_to_le32(1);
+	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	layout->fl_pg_preferred = cpu_to_le32(-1);
+	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+	ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+			     ofs, &len, &bno, req, ops);
+
+	ceph_osdc_build_request(req, ofs, &len,
+				ops,
+				snapc,
+				&mtime,
+				req->r_oid, req->r_oid_len);
+	up_read(&header->snap_rwsem);
+
+	ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+	if (ret < 0)
+		goto done_err;
+
+	if (!rbd_cb) {
+		ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+		ceph_osdc_put_request(req);
+	}
+	return ret;
+
+done_err:
+	bio_chain_put(req_data->bio);
+	ceph_osdc_put_request(req);
+done_pages:
+	kfree(req_data);
+done:
+	if (rq)
+		blk_end_request(rq, ret, len);
+	return ret;
+}
+
+/*
+ * Ceph osd op callback
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+	struct rbd_request *req_data = req->r_priv;
+	struct ceph_osd_reply_head *replyhead;
+	struct ceph_osd_op *op;
+	__s32 rc;
+	u64 bytes;
+	int read_op;
+
+	/* parse reply */
+	replyhead = msg->front.iov_base;
+	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+	op = (void *)(replyhead + 1);
+	rc = le32_to_cpu(replyhead->result);
+	bytes = le64_to_cpu(op->extent.length);
+	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+	if (rc == -ENOENT && read_op) {
+		zero_bio_chain(req_data->bio, 0);
+		rc = 0;
+	} else if (rc == 0 && read_op && bytes < req_data->len) {
+		zero_bio_chain(req_data->bio, bytes);
+		bytes = req_data->len;
+	}
+
+	blk_end_request(req_data->rq, rc, bytes);
+
+	if (req_data->bio)
+		bio_chain_put(req_data->bio);
+
+	ceph_osdc_put_request(req);
+	kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+			   struct ceph_snap_context *snapc,
+			   u64 snapid,
+			   int opcode,
+			   int flags,
+			   struct ceph_osd_req_op *orig_ops,
+			   int num_reply,
+			   const char *obj,
+			   u64 ofs, u64 len,
+			   char *buf)
+{
+	int ret;
+	struct page **pages;
+	int num_pages;
+	struct ceph_osd_req_op *ops = orig_ops;
+	u32 payload_len;
+
+	num_pages = calc_pages_for(ofs , len);
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	if (!orig_ops) {
+		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+		if (ret < 0)
+			goto done;
+
+		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+			if (ret < 0)
+				goto done_ops;
+		}
+	}
+
+	ret = rbd_do_request(NULL, dev, snapc, snapid,
+			  obj, ofs, len, NULL,
+			  pages, num_pages,
+			  flags,
+			  ops,
+			  2,
+			  NULL);
+	if (ret < 0)
+		goto done_ops;
+
+	if ((flags & CEPH_OSD_FLAG_READ) && buf)
+		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+	if (!orig_ops)
+		rbd_destroy_ops(ops);
+done:
+	ceph_release_page_vector(pages, num_pages);
+	return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ */
+static int rbd_do_op(struct request *rq,
+		     struct rbd_device *rbd_dev ,
+		     struct ceph_snap_context *snapc,
+		     u64 snapid,
+		     int opcode, int flags, int num_reply,
+		     u64 ofs, u64 len,
+		     struct bio *bio)
+{
+	char *seg_name;
+	u64 seg_ofs;
+	u64 seg_len;
+	int ret;
+	struct ceph_osd_req_op *ops;
+	u32 payload_len;
+
+	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+	if (!seg_name)
+		return -ENOMEM;
+
+	seg_len = rbd_get_segment(&rbd_dev->header,
+				  rbd_dev->header.block_name,
+				  ofs, len,
+				  seg_name, &seg_ofs);
+
+	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+	if (ret < 0)
+		goto done;
+
+	/* we've taken care of segment sizes earlier when we
+	   cloned the bios. We should never have a segment
+	   truncated at this point */
+	BUG_ON(seg_len < len);
+
+	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+			     seg_name, seg_ofs, seg_len,
+			     bio,
+			     NULL, 0,
+			     flags,
+			     ops,
+			     num_reply,
+			     rbd_req_cb);
+done:
+	kfree(seg_name);
+	return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+			 struct rbd_device *rbd_dev,
+			 struct ceph_snap_context *snapc,
+			 u64 ofs, u64 len,
+			 struct bio *bio)
+{
+	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+			 CEPH_OSD_OP_WRITE,
+			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			 2,
+			 ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ */
+static int rbd_req_read(struct request *rq,
+			 struct rbd_device *rbd_dev,
+			 u64 snapid,
+			 u64 ofs, u64 len,
+			 struct bio *bio)
+{
+	return rbd_do_op(rq, rbd_dev, NULL,
+			 (snapid ? snapid : CEPH_NOSNAP),
+			 CEPH_OSD_OP_READ,
+			 CEPH_OSD_FLAG_READ,
+			 2,
+			 ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+			  struct ceph_snap_context *snapc,
+			  u64 snapid,
+			  const char *obj,
+			  u64 ofs, u64 len,
+			  char *buf)
+{
+	return rbd_req_sync_op(dev, NULL,
+			       (snapid ? snapid : CEPH_NOSNAP),
+			       CEPH_OSD_OP_READ,
+			       CEPH_OSD_FLAG_READ,
+			       NULL,
+			       1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+				     u64 snapid,
+				     const char *obj)
+{
+	struct ceph_osd_req_op *ops;
+	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+	if (ret < 0)
+		return ret;
+
+	ops[0].snap.snapid = snapid;
+
+	ret = rbd_req_sync_op(dev, NULL,
+			       CEPH_NOSNAP,
+			       0,
+			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			       ops,
+			       1, obj, 0, 0, NULL);
+
+	rbd_destroy_ops(ops);
+
+	if (ret < 0)
+		return ret;
+
+	return ret;
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+			     const char *obj,
+			     const char *cls,
+			     const char *method,
+			     const char *data,
+			     int len)
+{
+	struct ceph_osd_req_op *ops;
+	int cls_len = strlen(cls);
+	int method_len = strlen(method);
+	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+				    cls_len + method_len + len);
+	if (ret < 0)
+		return ret;
+
+	ops[0].cls.class_name = cls;
+	ops[0].cls.class_len = (__u8)cls_len;
+	ops[0].cls.method_name = method;
+	ops[0].cls.method_len = (__u8)method_len;
+	ops[0].cls.argc = 0;
+	ops[0].cls.indata = data;
+	ops[0].cls.indata_len = len;
+
+	ret = rbd_req_sync_op(dev, NULL,
+			       CEPH_NOSNAP,
+			       0,
+			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			       ops,
+			       1, obj, 0, 0, NULL);
+
+	rbd_destroy_ops(ops);
+
+	dout("cls_exec returned %d\n", ret);
+	return ret;
+}
+
+/*
+ * block device queue callback
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	struct request *rq;
+	struct bio_pair *bp = NULL;
+
+	rq = blk_fetch_request(q);
+
+	while (1) {
+		struct bio *bio;
+		struct bio *rq_bio, *next_bio = NULL;
+		bool do_write;
+		int size, op_size = 0;
+		u64 ofs;
+
+		/* peek at request from block layer */
+		if (!rq)
+			break;
+
+		dout("fetched request\n");
+
+		/* filter out block requests we don't understand */
+		if ((rq->cmd_type != REQ_TYPE_FS)) {
+			__blk_end_request_all(rq, 0);
+			goto next;
+		}
+
+		/* deduce our operation (read, write) */
+		do_write = (rq_data_dir(rq) == WRITE);
+
+		size = blk_rq_bytes(rq);
+		ofs = blk_rq_pos(rq) * 512ULL;
+		rq_bio = rq->bio;
+		if (do_write && rbd_dev->read_only) {
+			__blk_end_request_all(rq, -EROFS);
+			goto next;
+		}
+
+		spin_unlock_irq(q->queue_lock);
+
+		dout("%s 0x%x bytes at 0x%llx\n",
+		     do_write ? "write" : "read",
+		     size, blk_rq_pos(rq) * 512ULL);
+
+		do {
+			/* a bio clone to be passed down to OSD req */
+			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+			op_size = rbd_get_segment(&rbd_dev->header,
+						  rbd_dev->header.block_name,
+						  ofs, size,
+						  NULL, NULL);
+			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+					      op_size, GFP_ATOMIC);
+			if (!bio) {
+				spin_lock_irq(q->queue_lock);
+				__blk_end_request_all(rq, -ENOMEM);
+				goto next;
+			}
+
+			/* init OSD command: write or read */
+			if (do_write)
+				rbd_req_write(rq, rbd_dev,
+					      rbd_dev->header.snapc,
+					      ofs,
+					      op_size, bio);
+			else
+				rbd_req_read(rq, rbd_dev,
+					     cur_snap_id(rbd_dev),
+					     ofs,
+					     op_size, bio);
+
+			size -= op_size;
+			ofs += op_size;
+
+			rq_bio = next_bio;
+		} while (size > 0);
+
+		if (bp)
+			bio_pair_release(bp);
+
+		spin_lock_irq(q->queue_lock);
+next:
+		rq = blk_fetch_request(q);
+	}
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+			  struct bio_vec *bvec)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+	unsigned int bio_sectors = bmd->bi_size >> 9;
+	int max;
+
+	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
+				 + bio_sectors)) << 9;
+	if (max < 0)
+		max = 0; /* bio_add cannot handle a negative return */
+	if (max <= bvec->bv_len && bio_sectors == 0)
+		return bvec->bv_len;
+	return max;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+	struct gendisk *disk = rbd_dev->disk;
+
+	if (!disk)
+		return;
+
+	rbd_header_free(&rbd_dev->header);
+
+	if (disk->flags & GENHD_FL_UP)
+		del_gendisk(disk);
+	if (disk->queue)
+		blk_cleanup_queue(disk->queue);
+	put_disk(disk);
+}
+
+/*
+ * reload the ondisk the header 
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+			   struct rbd_image_header *header)
+{
+	ssize_t rc;
+	struct rbd_image_header_ondisk *dh;
+	int snap_count = 0;
+	u64 snap_names_len = 0;
+
+	while (1) {
+		int len = sizeof(*dh) +
+			  snap_count * sizeof(struct rbd_image_snap_ondisk) +
+			  snap_names_len;
+
+		rc = -ENOMEM;
+		dh = kmalloc(len, GFP_KERNEL);
+		if (!dh)
+			return -ENOMEM;
+
+		rc = rbd_req_sync_read(rbd_dev,
+				       NULL, CEPH_NOSNAP,
+				       rbd_dev->obj_md_name,
+				       0, len,
+				       (char *)dh);
+		if (rc < 0)
+			goto out_dh;
+
+		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+		if (rc < 0)
+			goto out_dh;
+
+		if (snap_count != header->total_snaps) {
+			snap_count = header->total_snaps;
+			snap_names_len = header->snap_names_len;
+			rbd_header_free(header);
+			kfree(dh);
+			continue;
+		}
+		break;
+	}
+
+out_dh:
+	kfree(dh);
+	return rc;
+}
+
+/*
+ * create a snapshot
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+			       const char *snap_name,
+			       gfp_t gfp_flags)
+{
+	int name_len = strlen(snap_name);
+	u64 new_snapid;
+	int ret;
+	void *data, *data_start, *data_end;
+
+	/* we should create a snapshot only if we're pointing at the head */
+	if (dev->cur_snap)
+		return -EINVAL;
+
+	ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+				      &new_snapid);
+	dout("created snapid=%lld\n", new_snapid);
+	if (ret < 0)
+		return ret;
+
+	data = kmalloc(name_len + 16, gfp_flags);
+	if (!data)
+		return -ENOMEM;
+
+	data_start = data;
+	data_end = data + name_len + 16;
+
+	ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+	ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+				data_start, data - data_start);
+
+	kfree(data_start);
+
+	if (ret < 0)
+		return ret;
+
+	dev->header.snapc->seq =  new_snapid;
+
+	return 0;
+bad:
+	return -ERANGE;
+}
+
+/*
+ * only read the first part of the ondisk header, without the snaps info
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+	int ret;
+	struct rbd_image_header h;
+	u64 snap_seq;
+
+	ret = rbd_read_header(rbd_dev, &h);
+	if (ret < 0)
+		return ret;
+
+	down_write(&rbd_dev->header.snap_rwsem);
+
+	snap_seq = rbd_dev->header.snapc->seq;
+
+	kfree(rbd_dev->header.snapc);
+	kfree(rbd_dev->header.snap_names);
+	kfree(rbd_dev->header.snap_sizes);
+
+	rbd_dev->header.total_snaps = h.total_snaps;
+	rbd_dev->header.snapc = h.snapc;
+	rbd_dev->header.snap_names = h.snap_names;
+	rbd_dev->header.snap_sizes = h.snap_sizes;
+	rbd_dev->header.snapc->seq = snap_seq;
+
+	up_write(&rbd_dev->header.snap_rwsem);
+
+	return 0;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int rc;
+	u64 total_size = 0;
+
+	/* contact OSD, request size info about the object being mapped */
+	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+	if (rc)
+		return rc;
+
+	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+	if (rc)
+		return rc;
+
+	/* create gendisk info */
+	rc = -ENOMEM;
+	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+	if (!disk)
+		goto out;
+
+	sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+	disk->major = rbd_dev->major;
+	disk->first_minor = 0;
+	disk->fops = &rbd_bd_ops;
+	disk->private_data = rbd_dev;
+
+	/* init rq */
+	rc = -ENOMEM;
+	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+	if (!q)
+		goto out_disk;
+	blk_queue_merge_bvec(q, rbd_merge_bvec);
+	disk->queue = q;
+
+	q->queuedata = rbd_dev;
+
+	rbd_dev->disk = disk;
+	rbd_dev->q = q;
+
+	/* finally, announce the disk to the world */
+	set_capacity(disk, total_size / 512ULL);
+	add_disk(disk);
+
+	pr_info("%s: added with size 0x%llx\n",
+		disk->disk_name, (unsigned long long)total_size);
+	return 0;
+
+out_disk:
+	put_disk(disk);
+out:
+	return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ *                   add	map rados objects to blkdev
+ *                   remove	unmap rados objects
+ *                   list	show mappings
+ *******************************************************************/
+
+static void class_rbd_release(struct class *cls)
+{
+	kfree(cls);
+}
+
+static ssize_t class_rbd_list(struct class *c,
+			      struct class_attribute *attr,
+			      char *data)
+{
+	int n = 0;
+	struct list_head *tmp;
+	int max = PAGE_SIZE;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	n += snprintf(data, max,
+		      "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+	list_for_each(tmp, &rbd_dev_list) {
+		struct rbd_device *rbd_dev;
+
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		n += snprintf(data+n, max-n,
+			      "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+			      rbd_dev->id,
+			      rbd_dev->major,
+			      ceph_client_id(rbd_dev->client),
+			      rbd_dev->pool_name,
+			      rbd_dev->obj, rbd_dev->snap_name,
+			      rbd_dev->header.image_size >> 10);
+		if (n == max)
+			break;
+	}
+
+	mutex_unlock(&ctl_mutex);
+	return n;
+}
+
+static ssize_t class_rbd_add(struct class *c,
+			     struct class_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct ceph_osd_client *osdc;
+	struct rbd_device *rbd_dev;
+	ssize_t rc = -ENOMEM;
+	int irc, new_id = 0;
+	struct list_head *tmp;
+	char *mon_dev_name;
+	char *options;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+	if (!mon_dev_name)
+		goto err_out_mod;
+
+	options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+	if (!options)
+		goto err_mon_dev;
+
+	/* new rbd_device object */
+	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+	if (!rbd_dev)
+		goto err_out_opt;
+
+	/* static rbd_device initialization */
+	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->node);
+
+	/* generate unique id: find highest unique id, add one */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	list_for_each(tmp, &rbd_dev_list) {
+		struct rbd_device *rbd_dev;
+
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		if (rbd_dev->id >= new_id)
+			new_id = rbd_dev->id + 1;
+	}
+
+	rbd_dev->id = new_id;
+
+	/* add to global list */
+	list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+	/* parse add command */
+	if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+		   "%" __stringify(RBD_MAX_OPT_LEN) "s "
+		   "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+		   "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+		   mon_dev_name, options, rbd_dev->pool_name,
+		   rbd_dev->obj, rbd_dev->snap_name) < 4) {
+		rc = -EINVAL;
+		goto err_out_slot;
+	}
+
+	if (rbd_dev->snap_name[0] == 0)
+		rbd_dev->snap_name[0] = '-';
+
+	rbd_dev->obj_len = strlen(rbd_dev->obj);
+	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+		 rbd_dev->obj, RBD_SUFFIX);
+
+	/* initialize rest of new object */
+	snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+	rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+	if (rc < 0)
+		goto err_out_slot;
+
+	mutex_unlock(&ctl_mutex);
+
+	/* pick the pool */
+	osdc = &rbd_dev->client->osdc;
+	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+	if (rc < 0)
+		goto err_out_client;
+	rbd_dev->poolid = rc;
+
+	/* register our block device */
+	irc = register_blkdev(0, rbd_dev->name);
+	if (irc < 0) {
+		rc = irc;
+		goto err_out_client;
+	}
+	rbd_dev->major = irc;
+
+	/* set up and announce blkdev mapping */
+	rc = rbd_init_disk(rbd_dev);
+	if (rc)
+		goto err_out_blkdev;
+
+	return count;
+
+err_out_blkdev:
+	unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+	rbd_put_client(rbd_dev);
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+	list_del_init(&rbd_dev->node);
+	mutex_unlock(&ctl_mutex);
+
+	kfree(rbd_dev);
+err_out_opt:
+	kfree(options);
+err_mon_dev:
+	kfree(mon_dev_name);
+err_out_mod:
+	dout("Error adding device %s\n", buf);
+	module_put(THIS_MODULE);
+	return rc;
+}
+
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+	struct list_head *tmp;
+	struct rbd_device *rbd_dev;
+
+	list_for_each(tmp, &rbd_dev_list) {
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		if (rbd_dev->id == id)
+			return rbd_dev;
+	}
+	return NULL;
+}
+
+static ssize_t class_rbd_remove(struct class *c,
+				struct class_attribute *attr,
+				const char *buf,
+				size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	int target_id, rc;
+	unsigned long ul;
+
+	rc = strict_strtoul(buf, 10, &ul);
+	if (rc)
+		return rc;
+
+	/* convert to int; abort if we lost anything in the conversion */
+	target_id = (int) ul;
+	if (target_id != ul)
+		return -EINVAL;
+
+	/* remove object from list immediately */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	rbd_dev = __rbd_get_dev(target_id);
+	if (rbd_dev)
+		list_del_init(&rbd_dev->node);
+
+	mutex_unlock(&ctl_mutex);
+
+	if (!rbd_dev)
+		return -ENOENT;
+
+	rbd_put_client(rbd_dev);
+
+	/* clean up and free blkdev */
+	rbd_free_disk(rbd_dev);
+	unregister_blkdev(rbd_dev->major, rbd_dev->name);
+	kfree(rbd_dev);
+
+	/* release module ref */
+	module_put(THIS_MODULE);
+
+	return count;
+}
+
+static ssize_t class_rbd_snaps_list(struct class *c,
+			      struct class_attribute *attr,
+			      char *data)
+{
+	struct rbd_device *rbd_dev = NULL;
+	struct list_head *tmp;
+	struct rbd_image_header *header;
+	int i, n = 0, max = PAGE_SIZE;
+	int ret;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	n += snprintf(data, max, "#id\tsnap\tKB\n");
+
+	list_for_each(tmp, &rbd_dev_list) {
+		char *names, *p;
+		struct ceph_snap_context *snapc;
+
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		header = &rbd_dev->header;
+
+		down_read(&header->snap_rwsem);
+
+		names = header->snap_names;
+		snapc = header->snapc;
+
+		n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+			      rbd_dev->id, RBD_SNAP_HEAD_NAME,
+			      header->image_size >> 10,
+			      (!rbd_dev->cur_snap ? " (*)" : ""));
+		if (n == max)
+			break;
+
+		p = names;
+		for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+			n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+			      rbd_dev->id, p, header->snap_sizes[i] >> 10,
+			      (rbd_dev->cur_snap &&
+			       (snap_index(header, i) == rbd_dev->cur_snap) ?
+			       " (*)" : ""));
+			if (n == max)
+				break;
+		}
+
+		up_read(&header->snap_rwsem);
+	}
+
+
+	ret = n;
+	mutex_unlock(&ctl_mutex);
+	return ret;
+}
+
+static ssize_t class_rbd_snaps_refresh(struct class *c,
+				struct class_attribute *attr,
+				const char *buf,
+				size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	int target_id, rc;
+	unsigned long ul;
+	int ret = count;
+
+	rc = strict_strtoul(buf, 10, &ul);
+	if (rc)
+		return rc;
+
+	/* convert to int; abort if we lost anything in the conversion */
+	target_id = (int) ul;
+	if (target_id != ul)
+		return -EINVAL;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	rbd_dev = __rbd_get_dev(target_id);
+	if (!rbd_dev) {
+		ret = -ENOENT;
+		goto done;
+	}
+
+	rc = rbd_update_snaps(rbd_dev);
+	if (rc < 0)
+		ret = rc;
+
+done:
+	mutex_unlock(&ctl_mutex);
+	return ret;
+}
+
+static ssize_t class_rbd_snap_create(struct class *c,
+				struct class_attribute *attr,
+				const char *buf,
+				size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	int target_id, ret;
+	char *name;
+
+	name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	/* parse snaps add command */
+	if (sscanf(buf, "%d "
+		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+		   &target_id,
+		   name) != 2) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	rbd_dev = __rbd_get_dev(target_id);
+	if (!rbd_dev) {
+		ret = -ENOENT;
+		goto done_unlock;
+	}
+
+	ret = rbd_header_add_snap(rbd_dev,
+				  name, GFP_KERNEL);
+	if (ret < 0)
+		goto done_unlock;
+
+	ret = rbd_update_snaps(rbd_dev);
+	if (ret < 0)
+		goto done_unlock;
+
+	ret = count;
+done_unlock:
+	mutex_unlock(&ctl_mutex);
+done:
+	kfree(name);
+	return ret;
+}
+
+static ssize_t class_rbd_rollback(struct class *c,
+				struct class_attribute *attr,
+				const char *buf,
+				size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	int target_id, ret;
+	u64 snapid;
+	char snap_name[RBD_MAX_SNAP_NAME_LEN];
+	u64 cur_ofs;
+	char *seg_name;
+
+	/* parse snaps add command */
+	if (sscanf(buf, "%d "
+		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+		   &target_id,
+		   snap_name) != 2) {
+		return -EINVAL;
+	}
+
+	ret = -ENOMEM;
+	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+	if (!seg_name)
+		return ret;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	rbd_dev = __rbd_get_dev(target_id);
+	if (!rbd_dev) {
+		ret = -ENOENT;
+		goto done_unlock;
+	}
+
+	ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+	if (ret < 0)
+		goto done_unlock;
+
+	dout("snapid=%lld\n", snapid);
+
+	cur_ofs = 0;
+	while (cur_ofs < rbd_dev->header.image_size) {
+		cur_ofs += rbd_get_segment(&rbd_dev->header,
+					   rbd_dev->obj,
+					   cur_ofs, (u64)-1,
+					   seg_name, NULL);
+		dout("seg_name=%s\n", seg_name);
+
+		ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+		if (ret < 0)
+			pr_warning("could not roll back obj %s err=%d\n",
+				   seg_name, ret);
+	}
+
+	ret = rbd_update_snaps(rbd_dev);
+	if (ret < 0)
+		goto done_unlock;
+
+	ret = count;
+
+done_unlock:
+	mutex_unlock(&ctl_mutex);
+	kfree(seg_name);
+
+	return ret;
+}
+
+static struct class_attribute class_rbd_attrs[] = {
+	__ATTR(add,		0200, NULL, class_rbd_add),
+	__ATTR(remove,		0200, NULL, class_rbd_remove),
+	__ATTR(list,		0444, class_rbd_list, NULL),
+	__ATTR(snaps_refresh,	0200, NULL, class_rbd_snaps_refresh),
+	__ATTR(snap_create,	0200, NULL, class_rbd_snap_create),
+	__ATTR(snaps_list,	0444, class_rbd_snaps_list, NULL),
+	__ATTR(snap_rollback,	0200, NULL, class_rbd_rollback),
+	__ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+	int ret = -ENOMEM;
+
+	class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
+	if (!class_rbd)
+		goto out;
+
+	class_rbd->name = DRV_NAME;
+	class_rbd->owner = THIS_MODULE;
+	class_rbd->class_release = class_rbd_release;
+	class_rbd->class_attrs = class_rbd_attrs;
+
+	ret = class_register(class_rbd);
+	if (ret)
+		goto out_class;
+	return 0;
+
+out_class:
+	kfree(class_rbd);
+	class_rbd = NULL;
+	pr_err(DRV_NAME ": failed to create class rbd\n");
+out:
+	return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+	if (class_rbd)
+		class_destroy(class_rbd);
+	class_rbd = NULL;
+}
+
+int __init rbd_init(void)
+{
+	int rc;
+
+	rc = rbd_sysfs_init();
+	if (rc)
+		return rc;
+	spin_lock_init(&node_lock);
+	pr_info("loaded " DRV_NAME_LONG "\n");
+	return 0;
+}
+
+void __exit rbd_exit(void)
+{
+	rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 00000000000..fc6c678aa2c
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ *   foo.rbd      - image metadata
+ *   foo.00000000
+ *   foo.00000001
+ *   ...          - data
+ */
+
+#define RBD_SUFFIX		".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER	22   /* 4MB */
+#define RBD_MIN_OBJ_ORDER       16
+#define RBD_MAX_OBJ_ORDER       30
+
+#define RBD_MAX_OBJ_NAME_LEN	96
+#define RBD_MAX_SEG_NAME_LEN	128
+
+#define RBD_COMP_NONE		0
+#define RBD_CRYPT_NONE		0
+
+#define RBD_HEADER_TEXT		"<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE	"RBD"
+#define RBD_HEADER_VERSION	"001.005"
+
+struct rbd_info {
+	__le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk {
+	__le64 id;
+	__le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+	char text[40];
+	char block_name[24];
+	char signature[4];
+	char version[8];
+	struct {
+		__u8 order;
+		__u8 crypt_type;
+		__u8 comp_type;
+		__u8 unused;
+	} __attribute__((packed)) options;
+	__le64 image_size;
+	__le64 snap_seq;
+	__le32 snap_count;
+	__le32 reserved;
+	__le64 snap_names_len;
+	struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e251a62..8320490226b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
-#include <linux/smp_lock.h>
 #include <linux/hdreg.h>
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
@@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	return err;
 }
 
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
-			 unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long data)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
 			      (void __user *)data);
 }
 
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
-			     unsigned int cmd, unsigned long param)
-{
-	int ret;
-
-	lock_kernel();
-	ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
-	unlock_kernel();
-
-	return ret;
-}
-
 /* We provide getgeo only to please some old bootloader/partitioning tools */
 static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 {
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig
index 4b66c69eaf5..5ddf67e76f8 100644
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -57,7 +57,7 @@ config AGP_AMD
 
 config AGP_AMD64
 	tristate "AMD Opteron/Athlon64 on-CPU GART support"
-	depends on AGP && X86 && K8_NB
+	depends on AGP && X86 && AMD_NB
 	help
 	  This option gives you AGP support for the GLX component of
 	  X using the on-CPU northbridge of the AMD Athlon64/Opteron CPUs.
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 70312da4c96..42396df5555 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -15,7 +15,7 @@
 #include <linux/mmzone.h>
 #include <asm/page.h>		/* PAGE_SIZE */
 #include <asm/e820.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 #include <asm/gart.h>
 #include "agp.h"
 
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
 	u32 temp;
 	struct aper_size_info_32 *values;
 
-	dev = k8_northbridges[0];
+	dev = k8_northbridges.nb_misc[0];
 	if (dev==NULL)
 		return 0;
 
@@ -181,10 +181,14 @@ static int amd_8151_configure(void)
 	unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
 	int i;
 
+	if (!k8_northbridges.gart_supported)
+		return 0;
+
 	/* Configure AGP regs in each x86-64 host bridge. */
-        for (i = 0; i < num_k8_northbridges; i++) {
+	for (i = 0; i < k8_northbridges.num; i++) {
 		agp_bridge->gart_bus_addr =
-				amd64_configure(k8_northbridges[i], gatt_bus);
+				amd64_configure(k8_northbridges.nb_misc[i],
+						gatt_bus);
 	}
 	k8_flush_garts();
 	return 0;
@@ -195,11 +199,15 @@ static void amd64_cleanup(void)
 {
 	u32 tmp;
 	int i;
-        for (i = 0; i < num_k8_northbridges; i++) {
-		struct pci_dev *dev = k8_northbridges[i];
+
+	if (!k8_northbridges.gart_supported)
+		return;
+
+	for (i = 0; i < k8_northbridges.num; i++) {
+		struct pci_dev *dev = k8_northbridges.nb_misc[i];
 		/* disable gart translation */
 		pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
-		tmp &= ~AMD64_GARTEN;
+		tmp &= ~GARTEN;
 		pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
 	}
 }
@@ -313,22 +321,25 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order))
 		return -1;
 
-	pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1);
+	gart_set_size_and_enable(nb, order);
 	pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25);
 
 	return 0;
 }
 
-static __devinit int cache_nbs (struct pci_dev *pdev, u32 cap_ptr)
+static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
 {
 	int i;
 
 	if (cache_k8_northbridges() < 0)
 		return -ENODEV;
 
+	if (!k8_northbridges.gart_supported)
+		return -ENODEV;
+
 	i = 0;
-	for (i = 0; i < num_k8_northbridges; i++) {
-		struct pci_dev *dev = k8_northbridges[i];
+	for (i = 0; i < k8_northbridges.num; i++) {
+		struct pci_dev *dev = k8_northbridges.nb_misc[i];
 		if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
 			dev_err(&dev->dev, "no usable aperture found\n");
 #ifdef __x86_64__
@@ -405,7 +416,8 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
 	}
 
 	/* shadow x86-64 registers into ULi registers */
-	pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &httfea);
+	pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+			       &httfea);
 
 	/* if x86-64 aperture base is beyond 4G, exit here */
 	if ((httfea & 0x7fff) >> (32 - 25)) {
@@ -472,7 +484,8 @@ static int nforce3_agp_init(struct pci_dev *pdev)
 	pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
 
 	/* shadow x86-64 registers into NVIDIA registers */
-	pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &apbase);
+	pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+			       &apbase);
 
 	/* if x86-64 aperture base is beyond 4G, exit here */
 	if ( (apbase & 0x7fff) >> (32 - 25) ) {
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index d2abf514398..64255cef8a7 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
 
 	bridge->driver->cache_flush();
 #ifdef CONFIG_X86
-	set_memory_uc((unsigned long)table, 1 << page_order);
+	if (set_memory_uc((unsigned long)table, 1 << page_order))
+		printk(KERN_WARNING "Could not set GATT table memory to UC!");
+
 	bridge->gatt_table = (void *)table;
 #else
 	bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index 05ad4a17a28..7c4133582db 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -47,6 +47,16 @@ enum tpm_duration {
 #define TPM_MAX_PROTECTED_ORDINAL 12
 #define TPM_PROTECTED_ORDINAL_MASK 0xFF
 
+/*
+ * Bug workaround - some TPM's don't flush the most
+ * recently changed pcr on suspend, so force the flush
+ * with an extend to the selected _unused_ non-volatile pcr.
+ */
+static int tpm_suspend_pcr;
+module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644);
+MODULE_PARM_DESC(suspend_pcr,
+		 "PCR to use for dummy writes to faciltate flush on suspend.");
+
 static LIST_HEAD(tpm_chip_list);
 static DEFINE_SPINLOCK(driver_lock);
 static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
@@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = {
 	.ordinal = TPM_ORD_SAVESTATE
 };
 
-/* Bug workaround - some TPM's don't flush the most
- * recently changed pcr on suspend, so force the flush
- * with an extend to the selected _unused_ non-volatile pcr.
- */
-static int tpm_suspend_pcr;
-static int __init tpm_suspend_setup(char *str)
-{
-	get_option(&str, &tpm_suspend_pcr);
-	return 1;
-}
-__setup("tpm_suspend_pcr=", tpm_suspend_setup);
-
 /*
  * We are about to suspend. Save the TPM state
  * so that it can be restored.
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 0f69c5ec0ec..6c1b676643a 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -48,6 +48,9 @@ struct ports_driver_data {
 	/* Used for exporting per-port information to debugfs */
 	struct dentry *debugfs_dir;
 
+	/* List of all the devices we're handling */
+	struct list_head portdevs;
+
 	/* Number of devices this driver is handling */
 	unsigned int index;
 
@@ -108,6 +111,9 @@ struct port_buffer {
  * ports for that device (vdev->priv).
  */
 struct ports_device {
+	/* Next portdev in the list, head is in the pdrvdata struct */
+	struct list_head list;
+
 	/*
 	 * Workqueue handlers where we process deferred work after
 	 * notification
@@ -178,15 +184,21 @@ struct port {
 	struct console cons;
 
 	/* Each port associates with a separate char device */
-	struct cdev cdev;
+	struct cdev *cdev;
 	struct device *dev;
 
+	/* Reference-counting to handle port hot-unplugs and file operations */
+	struct kref kref;
+
 	/* A waitqueue for poll() or blocking read operations */
 	wait_queue_head_t waitqueue;
 
 	/* The 'name' of the port that we expose via sysfs properties */
 	char *name;
 
+	/* We can notify apps of host connect / disconnect events via SIGIO */
+	struct fasync_struct *async_queue;
+
 	/* The 'id' to identify the port with the Host */
 	u32 id;
 
@@ -221,6 +233,41 @@ out:
 	return port;
 }
 
+static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
+						 dev_t dev)
+{
+	struct port *port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&portdev->ports_lock, flags);
+	list_for_each_entry(port, &portdev->ports, list)
+		if (port->cdev->dev == dev)
+			goto out;
+	port = NULL;
+out:
+	spin_unlock_irqrestore(&portdev->ports_lock, flags);
+
+	return port;
+}
+
+static struct port *find_port_by_devt(dev_t dev)
+{
+	struct ports_device *portdev;
+	struct port *port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pdrvdata_lock, flags);
+	list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
+		port = find_port_by_devt_in_portdev(portdev, dev);
+		if (port)
+			goto out;
+	}
+	port = NULL;
+out:
+	spin_unlock_irqrestore(&pdrvdata_lock, flags);
+	return port;
+}
+
 static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
 {
 	struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
 static ssize_t send_control_msg(struct port *port, unsigned int event,
 				unsigned int value)
 {
-	return __send_control_msg(port->portdev, port->id, event, value);
+	/* Did the port get unplugged before userspace closed it? */
+	if (port->portdev)
+		return __send_control_msg(port->portdev, port->id, event, value);
+	return 0;
 }
 
 /* Callers must take the port->outvq_lock */
@@ -525,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
 /* The condition that must be true for polling to end */
 static bool will_read_block(struct port *port)
 {
+	if (!port->guest_connected) {
+		/* Port got hot-unplugged. Let's exit. */
+		return false;
+	}
 	return !port_has_data(port) && port->host_connected;
 }
 
@@ -575,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
 		if (ret < 0)
 			return ret;
 	}
+	/* Port got hot-unplugged. */
+	if (!port->guest_connected)
+		return -ENODEV;
 	/*
 	 * We could've received a disconnection message while we were
 	 * waiting for more data.
@@ -616,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
 		if (ret < 0)
 			return ret;
 	}
+	/* Port got hot-unplugged. */
+	if (!port->guest_connected)
+		return -ENODEV;
 
 	count = min((size_t)(32 * 1024), count);
 
@@ -656,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
 	port = filp->private_data;
 	poll_wait(filp, &port->waitqueue, wait);
 
+	if (!port->guest_connected) {
+		/* Port got unplugged */
+		return POLLHUP;
+	}
 	ret = 0;
 	if (!will_read_block(port))
 		ret |= POLLIN | POLLRDNORM;
@@ -667,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
 	return ret;
 }
 
+static void remove_port(struct kref *kref);
+
 static int port_fops_release(struct inode *inode, struct file *filp)
 {
 	struct port *port;
@@ -687,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
 	reclaim_consumed_buffers(port);
 	spin_unlock_irq(&port->outvq_lock);
 
+	/*
+	 * Locks aren't necessary here as a port can't be opened after
+	 * unplug, and if a port isn't unplugged, a kref would already
+	 * exist for the port.  Plus, taking ports_lock here would
+	 * create a dependency on other locks taken by functions
+	 * inside remove_port if we're the last holder of the port,
+	 * creating many problems.
+	 */
+	kref_put(&port->kref, remove_port);
+
 	return 0;
 }
 
@@ -694,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
 {
 	struct cdev *cdev = inode->i_cdev;
 	struct port *port;
+	int ret;
 
-	port = container_of(cdev, struct port, cdev);
+	port = find_port_by_devt(cdev->dev);
 	filp->private_data = port;
 
+	/* Prevent against a port getting hot-unplugged at the same time */
+	spin_lock_irq(&port->portdev->ports_lock);
+	kref_get(&port->kref);
+	spin_unlock_irq(&port->portdev->ports_lock);
+
 	/*
 	 * Don't allow opening of console port devices -- that's done
 	 * via /dev/hvc
 	 */
-	if (is_console_port(port))
-		return -ENXIO;
+	if (is_console_port(port)) {
+		ret = -ENXIO;
+		goto out;
+	}
 
 	/* Allow only one process to open a particular port at a time */
 	spin_lock_irq(&port->inbuf_lock);
 	if (port->guest_connected) {
 		spin_unlock_irq(&port->inbuf_lock);
-		return -EMFILE;
+		ret = -EMFILE;
+		goto out;
 	}
 
 	port->guest_connected = true;
@@ -724,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
 	reclaim_consumed_buffers(port);
 	spin_unlock_irq(&port->outvq_lock);
 
+	nonseekable_open(inode, filp);
+
 	/* Notify host of port being opened */
 	send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
 
 	return 0;
+out:
+	kref_put(&port->kref, remove_port);
+	return ret;
+}
+
+static int port_fops_fasync(int fd, struct file *filp, int mode)
+{
+	struct port *port;
+
+	port = filp->private_data;
+	return fasync_helper(fd, filp, mode, &port->async_queue);
 }
 
 /*
@@ -743,6 +841,8 @@ static const struct file_operations port_fops = {
 	.write = port_fops_write,
 	.poll  = port_fops_poll,
 	.release = port_fops_release,
+	.fasync = port_fops_fasync,
+	.llseek = no_llseek,
 };
 
 /*
@@ -1001,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
 	return nr_added_bufs;
 }
 
+static void send_sigio_to_port(struct port *port)
+{
+	if (port->async_queue && port->guest_connected)
+		kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
+}
+
 static int add_port(struct ports_device *portdev, u32 id)
 {
 	char debugfs_name[16];
@@ -1015,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id)
 		err = -ENOMEM;
 		goto fail;
 	}
+	kref_init(&port->kref);
 
 	port->portdev = portdev;
 	port->id = id;
@@ -1022,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id)
 	port->name = NULL;
 	port->inbuf = NULL;
 	port->cons.hvc = NULL;
+	port->async_queue = NULL;
 
 	port->cons.ws.ws_row = port->cons.ws.ws_col = 0;
 
@@ -1032,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id)
 	port->in_vq = portdev->in_vqs[port->id];
 	port->out_vq = portdev->out_vqs[port->id];
 
-	cdev_init(&port->cdev, &port_fops);
+	port->cdev = cdev_alloc();
+	if (!port->cdev) {
+		dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n");
+		err = -ENOMEM;
+		goto free_port;
+	}
+	port->cdev->ops = &port_fops;
 
 	devt = MKDEV(portdev->chr_major, id);
-	err = cdev_add(&port->cdev, devt, 1);
+	err = cdev_add(port->cdev, devt, 1);
 	if (err < 0) {
 		dev_err(&port->portdev->vdev->dev,
 			"Error %d adding cdev for port %u\n", err, id);
-		goto free_port;
+		goto free_cdev;
 	}
 	port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev,
 				  devt, port, "vport%up%u",
@@ -1104,7 +1218,7 @@ free_inbufs:
 free_device:
 	device_destroy(pdrvdata.class, port->dev->devt);
 free_cdev:
-	cdev_del(&port->cdev);
+	cdev_del(port->cdev);
 free_port:
 	kfree(port);
 fail:
@@ -1113,21 +1227,45 @@ fail:
 	return err;
 }
 
-/* Remove all port-specific data. */
-static int remove_port(struct port *port)
+/* No users remain, remove all port-specific data. */
+static void remove_port(struct kref *kref)
+{
+	struct port *port;
+
+	port = container_of(kref, struct port, kref);
+
+	sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
+	device_destroy(pdrvdata.class, port->dev->devt);
+	cdev_del(port->cdev);
+
+	kfree(port->name);
+
+	debugfs_remove(port->debugfs_file);
+
+	kfree(port);
+}
+
+/*
+ * Port got unplugged.  Remove port from portdev's list and drop the
+ * kref reference.  If no userspace has this port opened, it will
+ * result in immediate removal the port.
+ */
+static void unplug_port(struct port *port)
 {
 	struct port_buffer *buf;
 
+	spin_lock_irq(&port->portdev->ports_lock);
+	list_del(&port->list);
+	spin_unlock_irq(&port->portdev->ports_lock);
+
 	if (port->guest_connected) {
 		port->guest_connected = false;
 		port->host_connected = false;
 		wake_up_interruptible(&port->waitqueue);
-		send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0);
-	}
 
-	spin_lock_irq(&port->portdev->ports_lock);
-	list_del(&port->list);
-	spin_unlock_irq(&port->portdev->ports_lock);
+		/* Let the app know the port is going down. */
+		send_sigio_to_port(port);
+	}
 
 	if (is_console_port(port)) {
 		spin_lock_irq(&pdrvdata_lock);
@@ -1146,9 +1284,6 @@ static int remove_port(struct port *port)
 		hvc_remove(port->cons.hvc);
 #endif
 	}
-	sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
-	device_destroy(pdrvdata.class, port->dev->devt);
-	cdev_del(&port->cdev);
 
 	/* Remove unused data this port might have received. */
 	discard_port_data(port);
@@ -1159,12 +1294,19 @@ static int remove_port(struct port *port)
 	while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
 		free_buf(buf);
 
-	kfree(port->name);
-
-	debugfs_remove(port->debugfs_file);
+	/*
+	 * We should just assume the device itself has gone off --
+	 * else a close on an open port later will try to send out a
+	 * control message.
+	 */
+	port->portdev = NULL;
 
-	kfree(port);
-	return 0;
+	/*
+	 * Locks around here are not necessary - a port can't be
+	 * opened after we removed the port struct from ports_list
+	 * above.
+	 */
+	kref_put(&port->kref, remove_port);
 }
 
 /* Any private messages that the Host and Guest want to share */
@@ -1203,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev,
 		add_port(portdev, cpkt->id);
 		break;
 	case VIRTIO_CONSOLE_PORT_REMOVE:
-		remove_port(port);
+		unplug_port(port);
 		break;
 	case VIRTIO_CONSOLE_CONSOLE_PORT:
 		if (!cpkt->value)
@@ -1245,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev,
 		spin_lock_irq(&port->outvq_lock);
 		reclaim_consumed_buffers(port);
 		spin_unlock_irq(&port->outvq_lock);
+
+		/*
+		 * If the guest is connected, it'll be interested in
+		 * knowing the host connection state changed.
+		 */
+		send_sigio_to_port(port);
 		break;
 	case VIRTIO_CONSOLE_PORT_NAME:
 		/*
@@ -1341,6 +1489,9 @@ static void in_intr(struct virtqueue *vq)
 
 	wake_up_interruptible(&port->waitqueue);
 
+	/* Send a SIGIO indicating new data in case the process asked for it */
+	send_sigio_to_port(port);
+
 	if (is_console_port(port) && hvc_poll(port->cons.hvc))
 		hvc_kick();
 }
@@ -1577,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
 		add_port(portdev, 0);
 	}
 
+	spin_lock_irq(&pdrvdata_lock);
+	list_add_tail(&portdev->list, &pdrvdata.portdevs);
+	spin_unlock_irq(&pdrvdata_lock);
+
 	__send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
 			   VIRTIO_CONSOLE_DEVICE_READY, 1);
 	return 0;
@@ -1600,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev)
 {
 	struct ports_device *portdev;
 	struct port *port, *port2;
-	struct port_buffer *buf;
-	unsigned int len;
 
 	portdev = vdev->priv;
 
+	spin_lock_irq(&pdrvdata_lock);
+	list_del(&portdev->list);
+	spin_unlock_irq(&pdrvdata_lock);
+
+	/* Disable interrupts for vqs */
+	vdev->config->reset(vdev);
+	/* Finish up work that's lined up */
 	cancel_work_sync(&portdev->control_work);
 
 	list_for_each_entry_safe(port, port2, &portdev->ports, list)
-		remove_port(port);
+		unplug_port(port);
 
 	unregister_chrdev(portdev->chr_major, "virtio-portsdev");
 
-	while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
-		free_buf(buf);
+	/*
+	 * When yanking out a device, we immediately lose the
+	 * (device-side) queues.  So there's no point in keeping the
+	 * guest side around till we drop our final reference.  This
+	 * also means that any ports which are in an open state will
+	 * have to just stop using the port, as the vqs are going
+	 * away.
+	 */
+	if (use_multiport(portdev)) {
+		struct port_buffer *buf;
+		unsigned int len;
 
-	while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
-		free_buf(buf);
+		while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
+			free_buf(buf);
+
+		while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
+			free_buf(buf);
+	}
 
 	vdev->config->del_vqs(vdev);
 	kfree(portdev->in_vqs);
@@ -1663,6 +1836,7 @@ static int __init init(void)
 			   PTR_ERR(pdrvdata.debugfs_dir));
 	}
 	INIT_LIST_HEAD(&pdrvdata.consoles);
+	INIT_LIST_HEAD(&pdrvdata.portdevs);
 
 	return register_virtio_driver(&virtio_console);
 }
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 3bb3a671baa..9dbb28b9559 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -76,7 +76,7 @@ config EDAC_MCE
 
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
-	depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && EDAC_DECODE_MCE
+	depends on EDAC_MM_EDAC && AMD_NB && X86_64 && PCI && EDAC_DECODE_MCE
 	help
 	  Support for error detection and correction on the AMD 64
 	  Families of Memory Controllers (K8, F10h and F11h)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 76f7cc0ee14..8521401bbd7 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1,5 +1,5 @@
 #include "amd64_edac.h"
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 
 static struct edac_pci_ctl_info *amd64_ctl_pci;
 
@@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void)
 	 * to finish initialization of the MC instances.
 	 */
 	err = -ENODEV;
-	for (nb = 0; nb < num_k8_northbridges; nb++) {
+	for (nb = 0; nb < k8_northbridges.num; nb++) {
 		if (!pvt_lookup[nb])
 			continue;
 
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 280c9b5ad9e..88a3ae6cd02 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -125,7 +125,7 @@ config ISCSI_IBFT_FIND
 config ISCSI_IBFT
 	tristate "iSCSI Boot Firmware Table Attributes module"
 	select ISCSI_BOOT_SYSFS
-	depends on ISCSI_IBFT_FIND && SCSI
+	depends on ISCSI_IBFT_FIND && SCSI && SCSI_LOWLEVEL
 	default	n
 	help
 	  This option enables support for detection and exposing of iSCSI
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c37ef64d146..cb3ccf3ed22 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -59,18 +59,11 @@
 #include <linux/hrtimer.h>	/* ktime_get_real() */
 #include <trace/events/power.h>
 #include <linux/sched.h>
+#include <asm/mwait.h>
 
 #define INTEL_IDLE_VERSION "0.4"
 #define PREFIX "intel_idle: "
 
-#define MWAIT_SUBSTATE_MASK	(0xf)
-#define MWAIT_CSTATE_MASK	(0xf)
-#define MWAIT_SUBSTATE_SIZE	(4)
-#define MWAIT_MAX_NUM_CSTATES	8
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK	(0x2)
-
 static struct cpuidle_driver intel_idle_driver = {
 	.name = "intel_idle",
 	.owner = THIS_MODULE,
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 9ddafc30f43..af9ee313c10 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -28,7 +28,7 @@ struct evdev {
 	int minor;
 	struct input_handle handle;
 	wait_queue_head_t wait;
-	struct evdev_client *grab;
+	struct evdev_client __rcu *grab;
 	struct list_head client_list;
 	spinlock_t client_lock; /* protects client_list */
 	struct mutex mutex;
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c
index c1906647905..7e2c12a5b83 100644
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm)
 	t.endidx =		91;
 	t.seq =			tseq;
 	t.act.semaphore =	&tsem;
-	init_MUTEX_LOCKED(&tsem);
+	sema_init(&tsem, 0);
 	
 	if (hp_sdc_enqueue_transaction(&t)) return -1;
 	
@@ -698,7 +698,7 @@ static int __init hp_sdc_rtc_init(void)
 		return -ENODEV;
 #endif
 
-	init_MUTEX(&i8042tregs);
+	sema_init(&i8042tregs, 1);
 
 	if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr)))
 		return ret;
diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c
index c92f4edfee7..e5624d8f170 100644
--- a/drivers/input/serio/hil_mlc.c
+++ b/drivers/input/serio/hil_mlc.c
@@ -915,15 +915,15 @@ int hil_mlc_register(hil_mlc *mlc)
 	mlc->ostarted = 0;
 
 	rwlock_init(&mlc->lock);
-	init_MUTEX(&mlc->osem);
+	sema_init(&mlc->osem, 1);
 
-	init_MUTEX(&mlc->isem);
+	sema_init(&mlc->isem, 1);
 	mlc->icount = -1;
 	mlc->imatch = 0;
 
 	mlc->opercnt = 0;
 
-	init_MUTEX_LOCKED(&(mlc->csem));
+	sema_init(&(mlc->csem), 0);
 
 	hil_mlc_clear_di_scratch(mlc);
 	hil_mlc_clear_di_map(mlc, 0);
diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c
index bcc2d30ec24..8c0b51c3142 100644
--- a/drivers/input/serio/hp_sdc.c
+++ b/drivers/input/serio/hp_sdc.c
@@ -905,7 +905,7 @@ static int __init hp_sdc_init(void)
 	ts_sync[1]	= 0x0f;
 	ts_sync[2] = ts_sync[3]	= ts_sync[4] = ts_sync[5] = 0;
 	t_sync.act.semaphore = &s_sync;
-	init_MUTEX_LOCKED(&s_sync);
+	sema_init(&s_sync, 0);
 	hp_sdc_enqueue_transaction(&t_sync);
 	down(&s_sync); /* Wait for t_sync to complete */
 
@@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void)
 		return hp_sdc.dev_err;
 	}
 
-	init_MUTEX_LOCKED(&tq_init_sem);
+	sema_init(&tq_init_sem, 0);
 
 	tq_init.actidx		= 0;
 	tq_init.idx		= 1;
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 1c4ee6e7793..bf64e49d996 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -83,7 +83,7 @@ static struct adb_driver *adb_controller;
 BLOCKING_NOTIFIER_HEAD(adb_client_list);
 static int adb_got_sleep;
 static int adb_inited;
-static DECLARE_MUTEX(adb_probe_mutex);
+static DEFINE_SEMAPHORE(adb_probe_mutex);
 static int sleepy_trackpad;
 static int autopoll_devs;
 int __adb_probe_sync;
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
index 70705d1306b..eca55c52bdf 100644
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot)
 	lp->tx_len 		= lp->exec_box->data[9];   /* Transmit list count */
 	lp->rx_len 		= lp->exec_box->data[11];  /* Receive list count */
 
-	init_MUTEX_LOCKED(&lp->cmd_mutex);
+	sema_init(&lp->cmd_mutex, 0);
 	init_completion(&lp->execution_cmd);
 	init_completion(&lp->xceiver_cmd);
 
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 4b52c767ad0..3e5d0b6b651 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -608,7 +608,7 @@ static int sixpack_open(struct tty_struct *tty)
 
 	spin_lock_init(&sp->lock);
 	atomic_set(&sp->refcnt, 1);
-	init_MUTEX_LOCKED(&sp->dead_sem);
+	sema_init(&sp->dead_sem, 0);
 
 	/* !!! length of the buffers. MTU is IP MTU, not PACLEN!  */
 
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 66e88bd59ca..4c628393c8b 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -747,7 +747,7 @@ static int mkiss_open(struct tty_struct *tty)
 
 	spin_lock_init(&ax->buflock);
 	atomic_set(&ax->refcnt, 1);
-	init_MUTEX_LOCKED(&ax->dead_sem);
+	sema_init(&ax->dead_sem, 0);
 
 	ax->tty = tty;
 	tty->disc_data = ax;
diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c
index 1b051dab7b2..51d74447f8f 100644
--- a/drivers/net/irda/sir_dev.c
+++ b/drivers/net/irda/sir_dev.c
@@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n
 	dev->tx_skb = NULL;
 
 	spin_lock_init(&dev->tx_lock);
-	init_MUTEX(&dev->fsm.sem);
+	sema_init(&dev->fsm.sem, 1);
 
 	dev->drv = drv;
 	dev->netdev = ndev;
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index af50a530dae..78d70a6481b 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -184,7 +184,7 @@ ppp_asynctty_open(struct tty_struct *tty)
 	tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap);
 
 	atomic_set(&ap->refcnt, 1);
-	init_MUTEX_LOCKED(&ap->dead_sem);
+	sema_init(&ap->dead_sem, 0);
 
 	ap->chan.private = ap;
 	ap->chan.ops = &async_ops;
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c
index 04c6cd4333f..10bafd59f9c 100644
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma)
 
 		/* Initialize the chardev data structures */
 		mutex_init(&chan->rlock);
-		init_MUTEX(&chan->wsem);
+		sema_init(&chan->wsem, 1);
 
 		/* Register the network interface */
 		if (!(chan->netdev = alloc_hdlcdev(chan))) {
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index b336cd9ee7a..f9bda64fcd1 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -225,26 +225,17 @@ post_sync:
 	mutex_unlock(&start_mutex);
 }
 
-int oprofile_set_backtrace(unsigned long val)
+int oprofile_set_ulong(unsigned long *addr, unsigned long val)
 {
-	int err = 0;
+	int err = -EBUSY;
 
 	mutex_lock(&start_mutex);
-
-	if (oprofile_started) {
-		err = -EBUSY;
-		goto out;
-	}
-
-	if (!oprofile_ops.backtrace) {
-		err = -EINVAL;
-		goto out;
+	if (!oprofile_started) {
+		*addr = val;
+		err = 0;
 	}
-
-	oprofile_backtrace_depth = val;
-
-out:
 	mutex_unlock(&start_mutex);
+
 	return err;
 }
 
@@ -257,16 +248,9 @@ static int __init oprofile_init(void)
 		printk(KERN_INFO "oprofile: using timer interrupt.\n");
 		err = oprofile_timer_init(&oprofile_ops);
 		if (err)
-			goto out_arch;
+			return err;
 	}
-	err = oprofilefs_register();
-	if (err)
-		goto out_arch;
-	return 0;
-
-out_arch:
-	oprofile_arch_exit();
-	return err;
+	return oprofilefs_register();
 }
 
 
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
index 47e12cb4ee8..177b73de5e5 100644
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -37,7 +37,7 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root);
 int oprofile_timer_init(struct oprofile_operations *ops);
 void oprofile_timer_exit(void);
 
-int oprofile_set_backtrace(unsigned long depth);
+int oprofile_set_ulong(unsigned long *addr, unsigned long val);
 int oprofile_set_timeout(unsigned long time);
 
 #endif /* OPROF_H */
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c
index bbd7516e086..ccf099e684a 100644
--- a/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@ -79,14 +79,17 @@ static ssize_t depth_write(struct file *file, char const __user *buf, size_t cou
 	if (*offset)
 		return -EINVAL;
 
+	if (!oprofile_ops.backtrace)
+		return -EINVAL;
+
 	retval = oprofilefs_ulong_from_user(&val, buf, count);
 	if (retval)
 		return retval;
 
-	retval = oprofile_set_backtrace(val);
-
+	retval = oprofile_set_ulong(&oprofile_backtrace_depth, val);
 	if (retval)
 		return retval;
+
 	return count;
 }
 
diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c
new file mode 100644
index 00000000000..9046f7b2ed7
--- /dev/null
+++ b/drivers/oprofile/oprofile_perf.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2010 ARM Ltd.
+ *
+ * Perf-events backend for OProfile.
+ */
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/oprofile.h>
+#include <linux/slab.h>
+
+/*
+ * Per performance monitor configuration as set via oprofilefs.
+ */
+struct op_counter_config {
+	unsigned long count;
+	unsigned long enabled;
+	unsigned long event;
+	unsigned long unit_mask;
+	unsigned long kernel;
+	unsigned long user;
+	struct perf_event_attr attr;
+};
+
+static int oprofile_perf_enabled;
+static DEFINE_MUTEX(oprofile_perf_mutex);
+
+static struct op_counter_config *counter_config;
+static struct perf_event **perf_events[nr_cpumask_bits];
+static int num_counters;
+
+/*
+ * Overflow callback for oprofile.
+ */
+static void op_overflow_handler(struct perf_event *event, int unused,
+			struct perf_sample_data *data, struct pt_regs *regs)
+{
+	int id;
+	u32 cpu = smp_processor_id();
+
+	for (id = 0; id < num_counters; ++id)
+		if (perf_events[cpu][id] == event)
+			break;
+
+	if (id != num_counters)
+		oprofile_add_sample(regs, id);
+	else
+		pr_warning("oprofile: ignoring spurious overflow "
+				"on cpu %u\n", cpu);
+}
+
+/*
+ * Called by oprofile_perf_setup to create perf attributes to mirror the oprofile
+ * settings in counter_config. Attributes are created as `pinned' events and
+ * so are permanently scheduled on the PMU.
+ */
+static void op_perf_setup(void)
+{
+	int i;
+	u32 size = sizeof(struct perf_event_attr);
+	struct perf_event_attr *attr;
+
+	for (i = 0; i < num_counters; ++i) {
+		attr = &counter_config[i].attr;
+		memset(attr, 0, size);
+		attr->type		= PERF_TYPE_RAW;
+		attr->size		= size;
+		attr->config		= counter_config[i].event;
+		attr->sample_period	= counter_config[i].count;
+		attr->pinned		= 1;
+	}
+}
+
+static int op_create_counter(int cpu, int event)
+{
+	struct perf_event *pevent;
+
+	if (!counter_config[event].enabled || perf_events[cpu][event])
+		return 0;
+
+	pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
+						  cpu, NULL,
+						  op_overflow_handler);
+
+	if (IS_ERR(pevent))
+		return PTR_ERR(pevent);
+
+	if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
+		perf_event_release_kernel(pevent);
+		pr_warning("oprofile: failed to enable event %d "
+				"on CPU %d\n", event, cpu);
+		return -EBUSY;
+	}
+
+	perf_events[cpu][event] = pevent;
+
+	return 0;
+}
+
+static void op_destroy_counter(int cpu, int event)
+{
+	struct perf_event *pevent = perf_events[cpu][event];
+
+	if (pevent) {
+		perf_event_release_kernel(pevent);
+		perf_events[cpu][event] = NULL;
+	}
+}
+
+/*
+ * Called by oprofile_perf_start to create active perf events based on the
+ * perviously configured attributes.
+ */
+static int op_perf_start(void)
+{
+	int cpu, event, ret = 0;
+
+	for_each_online_cpu(cpu) {
+		for (event = 0; event < num_counters; ++event) {
+			ret = op_create_counter(cpu, event);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Called by oprofile_perf_stop at the end of a profiling run.
+ */
+static void op_perf_stop(void)
+{
+	int cpu, event;
+
+	for_each_online_cpu(cpu)
+		for (event = 0; event < num_counters; ++event)
+			op_destroy_counter(cpu, event);
+}
+
+static int oprofile_perf_create_files(struct super_block *sb, struct dentry *root)
+{
+	unsigned int i;
+
+	for (i = 0; i < num_counters; i++) {
+		struct dentry *dir;
+		char buf[4];
+
+		snprintf(buf, sizeof buf, "%d", i);
+		dir = oprofilefs_mkdir(sb, root, buf);
+		oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
+		oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
+		oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
+		oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
+		oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
+		oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
+	}
+
+	return 0;
+}
+
+static int oprofile_perf_setup(void)
+{
+	spin_lock(&oprofilefs_lock);
+	op_perf_setup();
+	spin_unlock(&oprofilefs_lock);
+	return 0;
+}
+
+static int oprofile_perf_start(void)
+{
+	int ret = -EBUSY;
+
+	mutex_lock(&oprofile_perf_mutex);
+	if (!oprofile_perf_enabled) {
+		ret = 0;
+		op_perf_start();
+		oprofile_perf_enabled = 1;
+	}
+	mutex_unlock(&oprofile_perf_mutex);
+	return ret;
+}
+
+static void oprofile_perf_stop(void)
+{
+	mutex_lock(&oprofile_perf_mutex);
+	if (oprofile_perf_enabled)
+		op_perf_stop();
+	oprofile_perf_enabled = 0;
+	mutex_unlock(&oprofile_perf_mutex);
+}
+
+#ifdef CONFIG_PM
+
+static int oprofile_perf_suspend(struct platform_device *dev, pm_message_t state)
+{
+	mutex_lock(&oprofile_perf_mutex);
+	if (oprofile_perf_enabled)
+		op_perf_stop();
+	mutex_unlock(&oprofile_perf_mutex);
+	return 0;
+}
+
+static int oprofile_perf_resume(struct platform_device *dev)
+{
+	mutex_lock(&oprofile_perf_mutex);
+	if (oprofile_perf_enabled && op_perf_start())
+		oprofile_perf_enabled = 0;
+	mutex_unlock(&oprofile_perf_mutex);
+	return 0;
+}
+
+static struct platform_driver oprofile_driver = {
+	.driver		= {
+		.name		= "oprofile-perf",
+	},
+	.resume		= oprofile_perf_resume,
+	.suspend	= oprofile_perf_suspend,
+};
+
+static struct platform_device *oprofile_pdev;
+
+static int __init init_driverfs(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&oprofile_driver);
+	if (ret)
+		return ret;
+
+	oprofile_pdev =	platform_device_register_simple(
+				oprofile_driver.driver.name, 0, NULL, 0);
+	if (IS_ERR(oprofile_pdev)) {
+		ret = PTR_ERR(oprofile_pdev);
+		platform_driver_unregister(&oprofile_driver);
+	}
+
+	return ret;
+}
+
+static void exit_driverfs(void)
+{
+	platform_device_unregister(oprofile_pdev);
+	platform_driver_unregister(&oprofile_driver);
+}
+
+#else
+
+static inline int  init_driverfs(void) { return 0; }
+static inline void exit_driverfs(void) { }
+
+#endif /* CONFIG_PM */
+
+void oprofile_perf_exit(void)
+{
+	int cpu, id;
+	struct perf_event *event;
+
+	for_each_possible_cpu(cpu) {
+		for (id = 0; id < num_counters; ++id) {
+			event = perf_events[cpu][id];
+			if (event)
+				perf_event_release_kernel(event);
+		}
+
+		kfree(perf_events[cpu]);
+	}
+
+	kfree(counter_config);
+	exit_driverfs();
+}
+
+int __init oprofile_perf_init(struct oprofile_operations *ops)
+{
+	int cpu, ret = 0;
+
+	ret = init_driverfs();
+	if (ret)
+		return ret;
+
+	memset(&perf_events, 0, sizeof(perf_events));
+
+	num_counters = perf_num_counters();
+	if (num_counters <= 0) {
+		pr_info("oprofile: no performance counters\n");
+		ret = -ENODEV;
+		goto out;
+	}
+
+	counter_config = kcalloc(num_counters,
+			sizeof(struct op_counter_config), GFP_KERNEL);
+
+	if (!counter_config) {
+		pr_info("oprofile: failed to allocate %d "
+				"counters\n", num_counters);
+		ret = -ENOMEM;
+		num_counters = 0;
+		goto out;
+	}
+
+	for_each_possible_cpu(cpu) {
+		perf_events[cpu] = kcalloc(num_counters,
+				sizeof(struct perf_event *), GFP_KERNEL);
+		if (!perf_events[cpu]) {
+			pr_info("oprofile: failed to allocate %d perf events "
+					"for cpu %d\n", num_counters, cpu);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	ops->create_files	= oprofile_perf_create_files;
+	ops->setup		= oprofile_perf_setup;
+	ops->start		= oprofile_perf_start;
+	ops->stop		= oprofile_perf_stop;
+	ops->shutdown		= oprofile_perf_stop;
+	ops->cpu_type		= op_name_from_perf_id();
+
+	if (!ops->cpu_type)
+		ret = -ENODEV;
+	else
+		pr_info("oprofile: using %s\n", ops->cpu_type);
+
+out:
+	if (ret)
+		oprofile_perf_exit();
+
+	return ret;
+}
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index 2766a6d3c2e..1944621930d 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -91,16 +91,20 @@ static ssize_t ulong_read_file(struct file *file, char __user *buf, size_t count
 
 static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_t count, loff_t *offset)
 {
-	unsigned long *value = file->private_data;
+	unsigned long value;
 	int retval;
 
 	if (*offset)
 		return -EINVAL;
 
-	retval = oprofilefs_ulong_from_user(value, buf, count);
+	retval = oprofilefs_ulong_from_user(&value, buf, count);
+	if (retval)
+		return retval;
 
+	retval = oprofile_set_ulong(file->private_data, value);
 	if (retval)
 		return retval;
+
 	return count;
 }
 
@@ -126,50 +130,41 @@ static const struct file_operations ulong_ro_fops = {
 };
 
 
-static struct dentry *__oprofilefs_create_file(struct super_block *sb,
+static int __oprofilefs_create_file(struct super_block *sb,
 	struct dentry *root, char const *name, const struct file_operations *fops,
-	int perm)
+	int perm, void *priv)
 {
 	struct dentry *dentry;
 	struct inode *inode;
 
 	dentry = d_alloc_name(root, name);
 	if (!dentry)
-		return NULL;
+		return -ENOMEM;
 	inode = oprofilefs_get_inode(sb, S_IFREG | perm);
 	if (!inode) {
 		dput(dentry);
-		return NULL;
+		return -ENOMEM;
 	}
 	inode->i_fop = fops;
 	d_add(dentry, inode);
-	return dentry;
+	dentry->d_inode->i_private = priv;
+	return 0;
 }
 
 
 int oprofilefs_create_ulong(struct super_block *sb, struct dentry *root,
 	char const *name, unsigned long *val)
 {
-	struct dentry *d = __oprofilefs_create_file(sb, root, name,
-						     &ulong_fops, 0644);
-	if (!d)
-		return -EFAULT;
-
-	d->d_inode->i_private = val;
-	return 0;
+	return __oprofilefs_create_file(sb, root, name,
+					&ulong_fops, 0644, val);
 }
 
 
 int oprofilefs_create_ro_ulong(struct super_block *sb, struct dentry *root,
 	char const *name, unsigned long *val)
 {
-	struct dentry *d = __oprofilefs_create_file(sb, root, name,
-						     &ulong_ro_fops, 0444);
-	if (!d)
-		return -EFAULT;
-
-	d->d_inode->i_private = val;
-	return 0;
+	return __oprofilefs_create_file(sb, root, name,
+					&ulong_ro_fops, 0444, val);
 }
 
 
@@ -189,31 +184,22 @@ static const struct file_operations atomic_ro_fops = {
 int oprofilefs_create_ro_atomic(struct super_block *sb, struct dentry *root,
 	char const *name, atomic_t *val)
 {
-	struct dentry *d = __oprofilefs_create_file(sb, root, name,
-						     &atomic_ro_fops, 0444);
-	if (!d)
-		return -EFAULT;
-
-	d->d_inode->i_private = val;
-	return 0;
+	return __oprofilefs_create_file(sb, root, name,
+					&atomic_ro_fops, 0444, val);
 }
 
 
 int oprofilefs_create_file(struct super_block *sb, struct dentry *root,
 	char const *name, const struct file_operations *fops)
 {
-	if (!__oprofilefs_create_file(sb, root, name, fops, 0644))
-		return -EFAULT;
-	return 0;
+	return __oprofilefs_create_file(sb, root, name, fops, 0644, NULL);
 }
 
 
 int oprofilefs_create_file_perm(struct super_block *sb, struct dentry *root,
 	char const *name, const struct file_operations *fops, int perm)
 {
-	if (!__oprofilefs_create_file(sb, root, name, fops, perm))
-		return -EFAULT;
-	return 0;
+	return __oprofilefs_create_file(sb, root, name, fops, perm, NULL);
 }
 
 
diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index dffa5d4fb29..a2d9d1e5926 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
 	spin_lock_init(&tmp->pardevice_lock);
 	tmp->ieee1284.mode = IEEE1284_MODE_COMPAT;
 	tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
-	init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */
+	sema_init(&tmp->ieee1284.irq, 0);
 	tmp->spintime = parport_default_spintime;
 	atomic_set (&tmp->ref_count, 1);
 	INIT_LIST_HEAD(&tmp->full_list);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7c8008225ee..17927b1f933 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net)
 	size_t len, total_len = 0;
 	int err, wmem;
 	size_t hdr_size;
-	struct socket *sock = rcu_dereference(vq->private_data);
+	struct socket *sock;
+
+	sock = rcu_dereference_check(vq->private_data,
+				     lockdep_is_held(&vq->mutex));
 	if (!sock)
 		return;
 
@@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n,
 static void vhost_net_enable_vq(struct vhost_net *n,
 				struct vhost_virtqueue *vq)
 {
-	struct socket *sock = vq->private_data;
+	struct socket *sock;
+
+	sock = rcu_dereference_protected(vq->private_data,
+					 lockdep_is_held(&vq->mutex));
 	if (!sock)
 		return;
 	if (vq == n->vqs + VHOST_NET_VQ_TX) {
@@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
 	struct socket *sock;
 
 	mutex_lock(&vq->mutex);
-	sock = vq->private_data;
+	sock = rcu_dereference_protected(vq->private_data,
+					 lockdep_is_held(&vq->mutex));
 	vhost_net_disable_vq(n, vq);
 	rcu_assign_pointer(vq->private_data, NULL);
 	mutex_unlock(&vq->mutex);
@@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	}
 
 	/* start polling new socket */
-	oldsock = vq->private_data;
+	oldsock = rcu_dereference_protected(vq->private_data,
+					    lockdep_is_held(&vq->mutex));
 	if (sock != oldsock) {
                 vhost_net_disable_vq(n, vq);
                 rcu_assign_pointer(vq->private_data, sock);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dd3d6f7406f..8b5a1b33d0f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	vhost_dev_cleanup(dev);
 
 	memory->nregions = 0;
-	dev->memory = memory;
+	RCU_INIT_POINTER(dev->memory, memory);
 	return 0;
 }
 
@@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		fput(dev->log_file);
 	dev->log_file = NULL;
 	/* No one will access memory at this point */
-	kfree(dev->memory);
-	dev->memory = NULL;
+	kfree(rcu_dereference_protected(dev->memory,
+					lockdep_is_held(&dev->mutex)));
+	RCU_INIT_POINTER(dev->memory, NULL);
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num,
 /* Caller should have device mutex but not vq mutex */
 int vhost_log_access_ok(struct vhost_dev *dev)
 {
-	return memory_access_ok(dev, dev->memory, 1);
+	struct vhost_memory *mp;
+
+	mp = rcu_dereference_protected(dev->memory,
+				       lockdep_is_held(&dev->mutex));
+	return memory_access_ok(dev, mp, 1);
 }
 
 /* Verify access for write logging. */
 /* Caller should have vq mutex and device mutex */
 static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
 {
-	return vq_memory_access_ok(log_base, vq->dev->memory,
+	struct vhost_memory *mp;
+
+	mp = rcu_dereference_protected(vq->dev->memory,
+				       lockdep_is_held(&vq->mutex));
+	return vq_memory_access_ok(log_base, mp,
 			    vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
 		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
 					sizeof *vq->used +
@@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 		kfree(newmem);
 		return -EFAULT;
 	}
-	oldmem = d->memory;
+	oldmem = rcu_dereference_protected(d->memory,
+					   lockdep_is_held(&d->mutex));
 	rcu_assign_pointer(d->memory, newmem);
 	synchronize_rcu();
 	kfree(oldmem);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index afd77295971..af3c11ded5f 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
 	 * vhost_work execution acts instead of rcu_read_lock() and the end of
 	 * vhost_work execution acts instead of rcu_read_lock().
 	 * Writers use virtqueue mutex. */
-	void *private_data;
+	void __rcu *private_data;
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log log[VHOST_NET_MAX_SG];
@@ -116,7 +116,7 @@ struct vhost_dev {
 	/* Readers use RCU to access memory table pointer
 	 * log base pointer and features.
 	 * Writers use mutex below.*/
-	struct vhost_memory *memory;
+	struct vhost_memory __rcu *memory;
 	struct mm_struct *mm;
 	struct mutex mutex;
 	unsigned acked_features;
@@ -173,7 +173,11 @@ enum {
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
 {
-	unsigned acked_features = rcu_dereference(dev->acked_features);
+	unsigned acked_features;
+
+	acked_features =
+		rcu_dereference_index_check(dev->acked_features,
+					    lockdep_is_held(&dev->mutex));
 	return acked_features & (1 << bit);
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 33c4e7eef47..9581ea94d5a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -109,8 +109,8 @@ static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	init_MUTEX(&ei->i_link_lock);
-	init_MUTEX(&ei->i_ext_lock);
+	sema_init(&ei->i_link_lock, 1);
+	sema_init(&ei->i_ext_lock, 1);
 	inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23..9eb134ea6eb 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
 config CEPH_FS
         tristate "Ceph distributed file system (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
+	select CEPH_LIB
 	select LIBCRC32C
 	select CRYPTO_AES
 	select CRYPTO
+	default n
 	help
 	  Choose Y or M here to include support for mounting the
 	  experimental Ceph distributed file system.  Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
 
 	  If unsure, say N.
 
-config CEPH_FS_PRETTYDEBUG
-	bool "Include file:line in ceph debug output"
-	depends on CEPH_FS
-	default n
-	help
-	  If you say Y here, debug output will include a filename and
-	  line to aid debugging.  This icnreases kernel size and slows
-	  execution slightly when debug call sites are enabled (e.g.,
-	  via CONFIG_DYNAMIC_DEBUG).
-
-	  If unsure, say N.
-
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600..9e6c4f2e8ff 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
 
 ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
 	export.o caps.o snap.o xattr.o \
-	messenger.o msgpool.o buffer.o pagelist.o \
-	mds_client.o mdsmap.o \
-	mon_client.o \
-	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
-	debugfs.o \
-	auth.o auth_none.o \
-	crypto.o armor.o \
-	auth_x.o \
-	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+	mds_client.o mdsmap.o strings.o ceph_frag.o \
+	debugfs.o
 
 else
 #Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c..00000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland                  kernel
-src/include/ceph_fs.h	    fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc	    fs/ceph/ceph_fs.c
-src/include/msgr.h	    fs/ceph/msgr.h
-src/include/rados.h	    fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h	    fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
-src/include/ceph_hash.h	    fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
-src/crush/crush.c	    fs/ceph/crush/crush.c
-src/crush/crush.h	    fs/ceph/crush/crush.h
-src/crush/mapper.c	    fs/ceph/crush/mapper.c
-src/crush/mapper.h	    fs/ceph/crush/mapper.h
-src/crush/hash.h	    fs/ceph/crush/hash.h
-src/crush/hash.c	    fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c..51bcc5ce323 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
@@ -10,7 +10,8 @@
 #include <linux/task_io_accounting_ops.h>
 
 #include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include <linux/ceph/osd_client.h>
 
 /*
  * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	struct ceph_osd_client *osdc = 
+		&ceph_inode_to_client(inode)->client->osdc;
 	int err = 0;
 	u64 len = PAGE_CACHE_SIZE;
 
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_inode_to_client(inode)->client->osdc;
 	int rc = 0;
 	struct page **pages;
 	loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	struct ceph_osd_client *osdc;
 	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
 	int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
-	client = ceph_inode_to_client(inode);
-	osdc = &client->osdc;
+	fsc = ceph_inode_to_client(inode);
+	osdc = &fsc->client->osdc;
 
 	/* verify this is a writeable snap context */
 	snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
 	     inode, page, page->index, page_off, len, snapc);
 
-	writeback_stat = atomic_long_inc_return(&client->writeback_count);
+	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
 	if (writeback_stat >
-	    CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
-		set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
 
 	set_page_writeback(page);
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct address_space *mapping = inode->i_mapping;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	long writeback_stat;
 	unsigned issued = ceph_caps_issued(ci);
 
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
 		WARN_ON(!PageUptodate(page));
 
 		writeback_stat =
-			atomic_long_dec_return(&client->writeback_count);
+			atomic_long_dec_return(&fsc->writeback_count);
 		if (writeback_stat <
-		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
-			clear_bdi_congested(&client->backing_dev_info,
+		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
 		ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
  * mempool.  we avoid the mempool if we can because req->r_num_pages
  * may be less than the maximum write size.
  */
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
 			   struct ceph_osd_request *req)
 {
 	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
 			       GFP_NOFS);
 	if (!req->r_pages) {
-		req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
 		req->r_pages_from_pool = 1;
 		WARN_ON(!req->r_pages);
 	}
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	pgoff_t index, start, end;
 	int range_whole = 0;
 	int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	client = ceph_inode_to_client(inode);
-	if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+	fsc = ceph_inode_to_client(inode);
+	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
 		pr_warning("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
-	if (client->mount_args->wsize && client->mount_args->wsize < wsize)
-		wsize = client->mount_args->wsize;
+	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+		wsize = fsc->mount_options->wsize;
 	if (wsize < PAGE_CACHE_SIZE)
 		wsize = PAGE_CACHE_SIZE;
 	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
 				offset = (unsigned long long)page->index
 					<< PAGE_CACHE_SHIFT;
 				len = wsize;
-				req = ceph_osdc_new_request(&client->osdc,
+				req = ceph_osdc_new_request(&fsc->client->osdc,
 					    &ci->i_layout,
 					    ceph_vino(inode),
 					    offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
 					    &inode->i_mtime, true, 1);
 				max_pages = req->r_num_pages;
 
-				alloc_page_vec(client, req);
+				alloc_page_vec(fsc, req);
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
 			}
@@ -794,10 +797,10 @@ get_more_pages:
 			     inode, page, page->index);
 
 			writeback_stat =
-			       atomic_long_inc_return(&client->writeback_count);
+			       atomic_long_inc_return(&fsc->writeback_count);
 			if (writeback_stat > CONGESTION_ON_THRESH(
-				    client->mount_args->congestion_kb)) {
-				set_bdi_congested(&client->backing_dev_info,
+				    fsc->mount_options->congestion_kb)) {
+				set_bdi_congested(&fsc->backing_dev_info,
 						  BLK_RW_ASYNC);
 			}
 
@@ -846,7 +849,7 @@ get_more_pages:
 		op->payload_len = cpu_to_le32(len);
 		req->r_request->hdr.data_len = cpu_to_le32(len);
 
-		ceph_osdc_start_request(&client->osdc, req, true);
+		ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		req = NULL;
 
 		/* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t page_off = pos & PAGE_CACHE_MASK;
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
 
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page = vmf->page;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t off = page->index << PAGE_CACHE_SHIFT;
 	loff_t size, len;
 	int ret;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5e9da996a15..98ab13e2b71 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -9,8 +9,9 @@
 #include <linux/writeback.h>
 
 #include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
 
 /*
  * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
 			     int *total, int *avail, int *used, int *reserved,
 			     int *min)
 {
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 
 	if (total)
 		*total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_args *ma = mdsc->client->mount_args;
+	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
 					    ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
 		 unsigned seq, unsigned mseq, u64 realmino, int flags,
 		 struct ceph_cap_reservation *caps_reservation)
 {
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap *new_cap = NULL;
 	struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
 	int mds;
 	struct ceph_cap_snap *capsnap;
 	u32 mseq;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
 						    session->s_mutex */
 	u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				 struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
 
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
 /*
  * try to invalidate mapping pages without blocking.
  */
-static int mapping_is_empty(struct address_space *mapping)
-{
-	struct page *page = find_get_page(mapping, 0);
-
-	if (!page)
-		return 1;
-
-	put_page(page);
-	return 0;
-}
-
 static int try_nonblocking_invalidate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
 	invalidate_mapping_pages(&inode->i_data, 0, -1);
 	spin_lock(&inode->i_lock);
 
-	if (mapping_is_empty(&inode->i_data) &&
+	if (inode->i_data.nrpages == 0 &&
 	    invalidating_gen == ci->i_rdcache_gen) {
 		/* success. */
 		dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
 {
-	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-	    ci->i_rdcache_gen &&                     /* may have cached pages */
+	    inode->i_data.nrpages &&                 /* have cached pages */
 	    (file_wanted == 0 ||                     /* no open files */
 	     (revoking & (CEPH_CAP_FILE_CACHE|
 			  CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
@@ -1706,7 +1696,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 				       caps_are_flushed(inode, flush_tid));
 	} else {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
@@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c409..bdce8b1fbd0 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
 /*
  * Ceph 'frag' type
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
 int ceph_frag_compare(__u32 a, __u32 b)
 {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a861..7ae1b3d55b5 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/device.h>
 #include <linux/slab.h>
@@ -7,143 +7,49 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
 #include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
 
 #ifdef CONFIG_DEBUG_FS
 
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *      .../bdi         - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
-	int i;
-	struct ceph_client *client = s->private;
-
-	if (client->monc.monmap == NULL)
-		return 0;
-
-	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-	for (i = 0; i < client->monc.monmap->num_mon; i++) {
-		struct ceph_entity_inst *inst =
-			&client->monc.monmap->mon_inst[i];
-
-		seq_printf(s, "\t%s%lld\t%s\n",
-			   ENTITY_NAME(inst->name),
-			   pr_addr(&inst->addr.in_addr));
-	}
-	return 0;
-}
+#include "mds_client.h"
 
 static int mdsmap_show(struct seq_file *s, void *p)
 {
 	int i;
-	struct ceph_client *client = s->private;
+	struct ceph_fs_client *fsc = s->private;
 
-	if (client->mdsc.mdsmap == NULL)
+	if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
 		return 0;
-	seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
-	seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+	seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+	seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
 	seq_printf(s, "session_timeout %d\n",
-		       client->mdsc.mdsmap->m_session_timeout);
+		       fsc->mdsc->mdsmap->m_session_timeout);
 	seq_printf(s, "session_autoclose %d\n",
-		       client->mdsc.mdsmap->m_session_autoclose);
-	for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+		       fsc->mdsc->mdsmap->m_session_autoclose);
+	for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
 		struct ceph_entity_addr *addr =
-			&client->mdsc.mdsmap->m_info[i].addr;
-		int state = client->mdsc.mdsmap->m_info[i].state;
+			&fsc->mdsc->mdsmap->m_info[i].addr;
+		int state = fsc->mdsc->mdsmap->m_info[i].state;
 
-		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+			       ceph_pr_addr(&addr->in_addr),
 			       ceph_mds_state_name(state));
 	}
 	return 0;
 }
 
-static int osdmap_show(struct seq_file *s, void *p)
-{
-	int i;
-	struct ceph_client *client = s->private;
-	struct rb_node *n;
-
-	if (client->osdc.osdmap == NULL)
-		return 0;
-	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
-	seq_printf(s, "flags%s%s\n",
-		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
-		   " NEARFULL" : "",
-		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
-		   " FULL" : "");
-	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
-		struct ceph_pg_pool_info *pool =
-			rb_entry(n, struct ceph_pg_pool_info, node);
-		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-			   pool->id, pool->v.pg_num, pool->pg_num_mask,
-			   pool->v.lpg_num, pool->lpg_num_mask);
-	}
-	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
-		struct ceph_entity_addr *addr =
-			&client->osdc.osdmap->osd_addr[i];
-		int state = client->osdc.osdmap->osd_state[i];
-		char sb[64];
-
-		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-			   i, pr_addr(&addr->in_addr),
-			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
-			   ceph_osdmap_state_str(sb, sizeof(sb), state));
-	}
-	return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_client *monc = &client->monc;
-	struct rb_node *rp;
-
-	mutex_lock(&monc->mutex);
-
-	if (monc->have_mdsmap)
-		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-	if (monc->have_osdmap)
-		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-	if (monc->want_next_osdmap)
-		seq_printf(s, "want next osdmap\n");
-
-	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-		__u16 op;
-		req = rb_entry(rp, struct ceph_mon_generic_request, node);
-		op = le16_to_cpu(req->request->hdr.type);
-		if (op == CEPH_MSG_STATFS)
-			seq_printf(s, "%lld statfs\n", req->tid);
-		else
-			seq_printf(s, "%lld unknown\n", req->tid);
-	}
-
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
 static int mdsc_show(struct seq_file *s, void *p)
 {
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct rb_node *rp;
 	int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int osdc_show(struct seq_file *s, void *pp)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_osd_client *osdc = &client->osdc;
-	struct rb_node *p;
-
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		struct ceph_osd_request *req;
-		struct ceph_osd_request_head *head;
-		struct ceph_osd_op *op;
-		int num_ops;
-		int opcode, olen;
-		int i;
-
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
-			   req->r_osd ? req->r_osd->o_osd : -1,
-			   le32_to_cpu(req->r_pgid.pool),
-			   le16_to_cpu(req->r_pgid.ps));
-
-		head = req->r_request->front.iov_base;
-		op = (void *)(head + 1);
-
-		num_ops = le16_to_cpu(head->num_ops);
-		olen = le32_to_cpu(head->object_len);
-		seq_printf(s, "%.*s", olen,
-			   (const char *)(head->ops + num_ops));
-
-		if (req->r_reassert_version.epoch)
-			seq_printf(s, "\t%u'%llu",
-			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-			   le64_to_cpu(req->r_reassert_version.version));
-		else
-			seq_printf(s, "\t");
-
-		for (i = 0; i < num_ops; i++) {
-			opcode = le16_to_cpu(op->op);
-			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-			op++;
-		}
-
-		seq_printf(s, "\n");
-	}
-	mutex_unlock(&osdc->request_mutex);
-	return 0;
-}
-
 static int caps_show(struct seq_file *s, void *p)
 {
-	struct ceph_client *client = s->private;
+	struct ceph_fs_client *fsc = s->private;
 	int total, avail, used, reserved, min;
 
-	ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+	ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
 	seq_printf(s, "total\t\t%d\n"
 		   "avail\t\t%d\n"
 		   "used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
 
 static int dentry_lru_show(struct seq_file *s, void *ptr)
 {
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_dentry_info *di;
 
 	spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
 	return 0;
 }
 
-#define DEFINE_SHOW_FUNC(name)						\
-static int name##_open(struct inode *inode, struct file *file)		\
-{									\
-	struct seq_file *sf;						\
-	int ret;							\
-									\
-	ret = single_open(file, name, NULL);				\
-	sf = file->private_data;					\
-	sf->private = inode->i_private;					\
-	return ret;							\
-}									\
-									\
-static const struct file_operations name##_fops = {			\
-	.open		= name##_open,					\
-	.read		= seq_read,					\
-	.llseek		= seq_lseek,					\
-	.release	= single_release,				\
-};
-
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+
 
+/*
+ * debugfs
+ */
 static int congestion_kb_set(void *data, u64 val)
 {
-	struct ceph_client *client = (struct ceph_client *)data;
-
-	if (client)
-		client->mount_args->congestion_kb = (int)val;
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
 
+	fsc->mount_options->congestion_kb = (int)val;
 	return 0;
 }
 
 static int congestion_kb_get(void *data, u64 *val)
 {
-	struct ceph_client *client = (struct ceph_client *)data;
-
-	if (client)
-		*val = (u64)client->mount_args->congestion_kb;
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
 
+	*val = (u64)fsc->mount_options->congestion_kb;
 	return 0;
 }
 
-
 DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
 			congestion_kb_set, "%llu\n");
 
-int __init ceph_debugfs_init(void)
-{
-	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-	if (!ceph_debugfs_dir)
-		return -ENOMEM;
-	return 0;
-}
 
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
-	debugfs_remove(ceph_debugfs_dir);
+	dout("ceph_fs_debugfs_cleanup\n");
+	debugfs_remove(fsc->debugfs_bdi);
+	debugfs_remove(fsc->debugfs_congestion_kb);
+	debugfs_remove(fsc->debugfs_mdsmap);
+	debugfs_remove(fsc->debugfs_caps);
+	debugfs_remove(fsc->debugfs_mdsc);
+	debugfs_remove(fsc->debugfs_dentry_lru);
 }
 
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
-	int ret = 0;
-	char name[80];
-
-	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
-		 client->monc.auth->global_id);
+	char name[100];
+	int err = -ENOMEM;
 
-	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-	if (!client->debugfs_dir)
-		goto out;
-
-	client->monc.debugfs_file = debugfs_create_file("monc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &monc_show_fops);
-	if (!client->monc.debugfs_file)
+	dout("ceph_fs_debugfs_init\n");
+	fsc->debugfs_congestion_kb =
+		debugfs_create_file("writeback_congestion_kb",
+				    0600,
+				    fsc->client->debugfs_dir,
+				    fsc,
+				    &congestion_kb_fops);
+	if (!fsc->debugfs_congestion_kb)
 		goto out;
 
-	client->mdsc.debugfs_file = debugfs_create_file("mdsc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &mdsc_show_fops);
-	if (!client->mdsc.debugfs_file)
-		goto out;
+	dout("a\n");
 
-	client->osdc.debugfs_file = debugfs_create_file("osdc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &osdc_show_fops);
-	if (!client->osdc.debugfs_file)
+	snprintf(name, sizeof(name), "../../bdi/%s",
+		 dev_name(fsc->backing_dev_info.dev));
+	fsc->debugfs_bdi =
+		debugfs_create_symlink("bdi",
+				       fsc->client->debugfs_dir,
+				       name);
+	if (!fsc->debugfs_bdi)
 		goto out;
 
-	client->debugfs_monmap = debugfs_create_file("monmap",
+	dout("b\n");
+	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0600,
-					client->debugfs_dir,
-					client,
-					&monmap_show_fops);
-	if (!client->debugfs_monmap)
-		goto out;
-
-	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
-					0600,
-					client->debugfs_dir,
-					client,
+					fsc->client->debugfs_dir,
+					fsc,
 					&mdsmap_show_fops);
-	if (!client->debugfs_mdsmap)
-		goto out;
-
-	client->debugfs_osdmap = debugfs_create_file("osdmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&osdmap_show_fops);
-	if (!client->debugfs_osdmap)
+	if (!fsc->debugfs_mdsmap)
 		goto out;
 
-	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-					0600,
-					client->debugfs_dir,
-					client,
-					&dentry_lru_show_fops);
-	if (!client->debugfs_dentry_lru)
+	dout("ca\n");
+	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+						0600,
+						fsc->client->debugfs_dir,
+						fsc,
+						&mdsc_show_fops);
+	if (!fsc->debugfs_mdsc)
 		goto out;
 
-	client->debugfs_caps = debugfs_create_file("caps",
+	dout("da\n");
+	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
-						   client->debugfs_dir,
-						   client,
+						   fsc->client->debugfs_dir,
+						   fsc,
 						   &caps_show_fops);
-	if (!client->debugfs_caps)
+	if (!fsc->debugfs_caps)
 		goto out;
 
-	client->debugfs_congestion_kb =
-		debugfs_create_file("writeback_congestion_kb",
-				    0600,
-				    client->debugfs_dir,
-				    client,
-				    &congestion_kb_fops);
-	if (!client->debugfs_congestion_kb)
+	dout("ea\n");
+	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&dentry_lru_show_fops);
+	if (!fsc->debugfs_dentry_lru)
 		goto out;
 
-	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-						     name);
-
 	return 0;
 
 out:
-	ceph_debugfs_client_cleanup(client);
-	return ret;
+	ceph_fs_debugfs_cleanup(fsc);
+	return err;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-	debugfs_remove(client->debugfs_bdi);
-	debugfs_remove(client->debugfs_caps);
-	debugfs_remove(client->debugfs_dentry_lru);
-	debugfs_remove(client->debugfs_osdmap);
-	debugfs_remove(client->debugfs_mdsmap);
-	debugfs_remove(client->debugfs_monmap);
-	debugfs_remove(client->osdc.debugfs_file);
-	debugfs_remove(client->mdsc.debugfs_file);
-	debugfs_remove(client->monc.debugfs_file);
-	debugfs_remove(client->debugfs_congestion_kb);
-	debugfs_remove(client->debugfs_dir);
-}
 
 #else  /* CONFIG_DEBUG_FS */
 
-int __init ceph_debugfs_init(void)
-{
-	return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	return 0;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
 }
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb5204..e0a2dc6fcaf 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
  */
 static int __dcache_readdir(struct file *filp,
 			    void *dirent, filldir_t filldir)
-		__releases(inode->i_lock)
-		__acquires(inode->i_lock)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_file_info *fi = filp->private_data;
 	struct dentry *parent = filp->f_dentry;
 	struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
 
 	atomic_inc(&dentry->d_count);
 	spin_unlock(&dcache_lock);
-	spin_unlock(&inode->i_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
 		} else {
 			dput(last);
 		}
-		last = NULL;
 	}
-
-	spin_lock(&inode->i_lock);
-	spin_lock(&dcache_lock);
-
 	last = dentry;
 
 	if (err < 0)
-		goto out_unlock;
+		goto out;
 
-	p = p->prev;
 	filp->f_pos++;
 
 	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
-	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
-		goto more;
-	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-	err = -EAGAIN;
+	if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+		dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+		err = -EAGAIN;
+		goto out;
+	}
+
+	spin_lock(&dcache_lock);
+	p = p->prev;	/* advance to next dentry */
+	goto more;
 
 out_unlock:
 	spin_unlock(&dcache_lock);
-
-	if (last) {
-		spin_unlock(&inode->i_lock);
+out:
+	if (last)
 		dput(last);
-		spin_lock(&inode->i_lock);
-	}
-
 	return err;
 }
 
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct ceph_file_info *fi = filp->private_data;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned frag = fpos_frag(filp->f_pos);
 	int off = fpos_off(filp->f_pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = client->mount_args->max_readdir;
-	const int max_bytes = client->mount_args->max_readdir_bytes;
+	const int max_entries = fsc->mount_options->max_readdir;
+	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* can we use the dcache? */
 	spin_lock(&inode->i_lock);
 	if ((filp->f_pos == 2 || fi->dentry) &&
-	    !ceph_test_opt(client, NOASYNCREADDIR) &&
+	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		spin_unlock(&inode->i_lock);
 		err = __dcache_readdir(filp, dirent, filldir);
-		if (err != -EAGAIN) {
-			spin_unlock(&inode->i_lock);
+		if (err != -EAGAIN)
 			return err;
-		}
+	} else {
+		spin_unlock(&inode->i_lock);
 	}
-	spin_unlock(&inode->i_lock);
 	if (fi->dentry) {
 		err = note_last_dentry(fi, fi->dentry->d_name.name,
 				       fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
-	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
 	    strcmp(dentry->d_name.name,
-		   client->mount_args->snapdir_name) == 0) {
+		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 				  struct nameidata *nd)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int op;
 	int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		spin_lock(&dir->i_lock);
 		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
 		if (strncmp(dentry->d_name.name,
-			    client->mount_args->snapdir_name,
+			    fsc->mount_options->snapdir_name,
 			    dentry->d_name.len) &&
 		    !is_root_ceph_dentry(dir, dentry) &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
 		      int mode, dev_t rdev)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 			    const char *dest)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 
 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
 	int op;
@@ -758,8 +749,8 @@ out:
 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 		     struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
  */
 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
@@ -854,8 +845,8 @@ out:
 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 
-	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
 	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e38423e82f2..2297d942699 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * NFS export support
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
 				      struct ceph_nfs_confh *cfh)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba2..e77c28cf369 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -38,8 +39,8 @@
 static struct ceph_mds_request *
 prepare_open_request(struct super_block *sb, int flags, int create_mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int want_auth = USE_ANY_MDS;
 	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 int ceph_open(struct inode *inode, struct file *file)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd, int mode,
 				int locked_dir)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct file *file = nd->intent.open.file;
 	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
 	struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
 }
 
 /*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-					    int num_pages,
-					    loff_t off, size_t len)
-{
-	struct page **pages;
-	int rc;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, 0, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	kfree(pages);
-	return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		put_page(pages[i]);
-	kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		__free_pages(pages[i], 0);
-	kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-	struct page **pages;
-	int i;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, flags);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < num_pages; i++) {
-		pages[i] = __page_cache_alloc(flags);
-		if (pages[i] == NULL) {
-			ceph_release_page_vector(pages, i);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-				    const char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, PAGE_CACHE_SIZE-po, left);
-		bad = copy_from_user(page_address(pages[i]) + po, data, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, left, PAGE_CACHE_SIZE-po);
-		bad = copy_to_user(data, page_address(pages[i]) + po, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		if (po) {
-			po += l - bad;
-			if (po == PAGE_CACHE_SIZE)
-				po = 0;
-		}
-		i++;
-	}
-	return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-	int i = off >> PAGE_CACHE_SHIFT;
-
-	off &= ~PAGE_CACHE_MASK;
-
-	dout("zero_page_vector_page %u~%u\n", off, len);
-
-	/* leading partial page? */
-	if (off) {
-		int end = min((int)PAGE_CACHE_SIZE, off + len);
-		dout("zeroing %d %p head from %d\n", i, pages[i],
-		     (int)off);
-		zero_user_segment(pages[i], off, end);
-		len -= (end - off);
-		i++;
-	}
-	while (len >= PAGE_CACHE_SIZE) {
-		dout("zeroing %d %p len=%d\n", i, pages[i], len);
-		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		i++;
-	}
-	/* trailing partial page? */
-	if (len) {
-		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-		zero_user_segment(pages[i], 0, len);
-	}
-}
-
-
-/*
  * Read a range of bytes striped over one or more objects.  Iterate over
  * objects we stripe over.  (That's not atomic, but good enough for now.)
  *
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
 			struct page **pages, int num_pages,
 			int *checkeof)
 {
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 pos, this_len;
 	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
 
 more:
 	this_len = left;
-	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
 				  &ci->i_layout, pos, &this_len,
 				  ci->i_truncate_seq,
 				  ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
 
 		if (read < pos - off) {
 			dout(" zero gap %llu to %llu\n", off + read, pos);
-			zero_page_vector_range(page_off + read,
-					       pos - off - read, pages);
+			ceph_zero_page_vector_range(page_off + read,
+						    pos - off - read, pages);
 		}
 		pos += ret;
 		read = pos - off;
@@ -495,8 +339,8 @@ more:
 		/* was original extent fully inside i_size? */
 		if (pos + left <= inode->i_size) {
 			dout("zero tail\n");
-			zero_page_vector_range(page_off + read, len - read,
-					       pages);
+			ceph_zero_page_vector_range(page_off + read, len - read,
+						    pages);
 			read = len;
 			goto out;
 		}
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, off, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, off, len);
 
 		/*
 		 * flush any page cache pages in this range.  this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	ret = striped_read(inode, off, len, pages, num_pages, checkeof);
 
 	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-		ret = copy_page_vector_to_user(pages, data, off, ret);
+		ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
 	if (ret >= 0)
 		*poff = off + ret;
 
 done:
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages);
 	else
 		ceph_release_page_vector(pages, num_pages);
 	dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_osd_request *req;
 	struct page **pages;
 	int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	 */
 more:
 	len = left;
-	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
 				    ceph_vino(inode), pos, &len,
 				    CEPH_OSD_OP_WRITE, flags,
 				    ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
 	num_pages = calc_pages_for(pos, len);
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, pos, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
@@ -673,7 +517,7 @@ more:
 			ret = PTR_ERR(pages);
 			goto out;
 		}
-		ret = copy_user_to_page_vector(pages, data, pos, len);
+		ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
 		if (ret < 0) {
 			ceph_release_page_vector(pages, num_pages);
 			goto out;
@@ -689,7 +533,7 @@ more:
 	req->r_num_pages = num_pages;
 	req->r_inode = inode;
 
-	ret = ceph_osdc_start_request(&client->osdc, req, false);
+	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!ret) {
 		if (req->r_safe_callback) {
 			/*
@@ -701,11 +545,11 @@ more:
 			spin_unlock(&ci->i_unsafe_lock);
 			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 		}
-		ret = ceph_osdc_wait_request(&client->osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	}
 
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages);
 	else if (file->f_flags & O_SYNC)
 		ceph_release_page_vector(pages, num_pages);
 
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	loff_t endoff = pos + iov->iov_len;
 	int want, got = 0;
 	int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37ed..1d6a45b5a04 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/module.h>
 #include <linux/fs.h>
@@ -13,7 +13,8 @@
 #include <linux/pagevec.h>
 
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
 
 /*
  * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
 
 		dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
 		}
 
 		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
 			inode->i_size = ci->i_rbytes;
 		break;
 	default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct inode *in = NULL;
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
-	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 	int i = 0;
 	int err = 0;
 
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	 */
 	if (rinfo->head->is_dentry && !req->r_aborted &&
 	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
-					       client->mount_args->snapdir_name,
+					       fsc->mount_options->snapdir_name,
 					       req->r_dentry->d_name.len))) {
 		/*
 		 * lookup link rename   : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
@@ -1728,8 +1729,8 @@ out:
  */
 int ceph_do_getattr(struct inode *inode, int mask)
 {
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba..8888c9ba68d 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
 #include <linux/in.h>
 
-#include "ioctl.h"
 #include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+
+#include "ioctl.h"
 
 
 /*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
 	int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 }
 
 /*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation,
+ * (It doesn't apply retroactively )
+ * unless a subdirectory has its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_mds_request *req;
+	struct ceph_ioctl_layout l;
+	int err, i;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+	/* copy and validate */
+	if (copy_from_user(&l, arg, sizeof(l)))
+		return -EFAULT;
+
+	if ((l.object_size & ~PAGE_MASK) ||
+	    (l.stripe_unit & ~PAGE_MASK) ||
+	    !l.stripe_unit ||
+	    (l.object_size &&
+	        (unsigned)l.object_size % (unsigned)l.stripe_unit))
+		return -EINVAL;
+
+	/* make sure it's a valid data pool */
+	if (l.data_pool > 0) {
+		mutex_lock(&mdsc->mutex);
+		err = -EINVAL;
+		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+				err = 0;
+				break;
+			}
+		mutex_unlock(&mdsc->mutex);
+		if (err)
+			return err;
+	}
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+				       USE_AUTH_MDS);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+
+	req->r_args.setlayout.layout.fl_stripe_unit =
+			cpu_to_le32(l.stripe_unit);
+	req->r_args.setlayout.layout.fl_stripe_count =
+			cpu_to_le32(l.stripe_count);
+	req->r_args.setlayout.layout.fl_object_size =
+			cpu_to_le32(l.object_size);
+	req->r_args.setlayout.layout.fl_pg_pool =
+			cpu_to_le32(l.data_pool);
+	req->r_args.setlayout.layout.fl_pg_preferred =
+			cpu_to_le32(l.preferred_osd);
+
+	err = ceph_mdsc_do_request(mdsc, inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
  * Return object name, size/offset information, and location (OSD
  * number, network address) for a given file offset.
  */
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	struct ceph_ioctl_dataloc dl;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case CEPH_IOC_SET_LAYOUT:
 		return ceph_ioctl_set_layout(file, (void __user *)arg);
 
+	case CEPH_IOC_SET_LAYOUT_POLICY:
+		return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
 	case CEPH_IOC_GET_DATALOC:
 		return ceph_ioctl_get_dataloc(file, (void __user *)arg);
 
 	case CEPH_IOC_LAZYIO:
 		return ceph_ioctl_lazyio(file);
 	}
+
 	return -ENOTTY;
 }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b685..a6ce54e94eb 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
-#define CEPH_IOCTL_MAGIC 0x97
+#define CEPH_IOCTL_MAGIC 0x98
 
 /* just use u64 to align sanely on all archs */
 struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
 				   struct ceph_ioctl_layout)
 #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,		\
 				   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5,	\
+				   struct ceph_ioctl_layout)
 
 /*
  * Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae9..40abde93c34 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/file.h>
 #include <linux/namei.h>
 
 #include "super.h"
 #include "mds_client.h"
-#include "pagelist.h"
+#include <linux/ceph/pagelist.h>
 
 /**
  * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(inode->i_sb)->mdsc;
+		ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
  * Encode the flock and fcntl locks for the given inode into the pagelist.
  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
  * sequential flock locks.
- * Must be called with BLK already held, and the lock numbers should have
- * been gathered under the same lock holding window.
+ * Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected,
+ * we return the value 1.
  */
 int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 		      int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 	struct file_lock *lock;
 	struct ceph_filelock cephlock;
 	int err = 0;
+	int seen_fcntl = 0;
+	int seen_flock = 0;
 
 	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
 	     num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 		goto fail;
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_POSIX) {
+			++seen_fcntl;
+			if (seen_fcntl > num_fcntl_locks) {
+				err = -ENOSPC;
+				goto fail;
+			}
 			err = lock_to_ceph_filelock(lock, &cephlock);
 			if (err)
 				goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 		goto fail;
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_FLOCK) {
+			++seen_flock;
+			if (seen_flock > num_flock_locks) {
+				err = -ENOSPC;
+				goto fail;
+			}
 			err = lock_to_ceph_filelock(lock, &cephlock);
 			if (err)
 				goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f260..3142b15940c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <linux/smp_lock.h>
 
-#include "mds_client.h"
-#include "mon_client.h"
 #include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
+#include "mds_client.h"
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
 	if (atomic_dec_and_test(&s->s_ref)) {
 		if (s->s_authorizer)
-			s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
-				s->s_mdsc->client->monc.auth, s->s_authorizer);
+		     s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
+			     s->s_mdsc->fsc->client->monc.auth,
+			     s->s_authorizer);
 		kfree(s);
 	}
 }
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_seq = 0;
 	mutex_init(&s->s_mutex);
 
-	ceph_con_init(mdsc->client->msgr, &s->s_con);
+	ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
 	s->s_con.private = s;
 	s->s_con.ops = &mds_con_ops;
 	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	} else if (req->r_dentry) {
 		struct inode *dir = req->r_dentry->d_parent->d_inode;
 
-		if (dir->i_sb != mdsc->client->sb) {
+		if (dir->i_sb != mdsc->fsc->sb) {
 			/* not this fs! */
 			inode = req->r_dentry->d_inode;
 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	__ceph_remove_cap(cap);
 	if (!__ceph_is_any_real_caps(ci)) {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&mdsc->cap_dirty_lock);
 		if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
 	struct ceph_msg *msg, *partial = NULL;
 	struct ceph_mds_cap_release *head;
 	int err = -ENOMEM;
-	int extra = mdsc->client->mount_args->cap_release_safety;
+	int extra = mdsc->fsc->mount_options->cap_release_safety;
 	int num;
 
 	dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 	/* insert trace into our cache */
 	mutex_lock(&req->r_fill_mutex);
-	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
 		if (result == 0 && rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
 	if (recon_state->flock) {
 		int num_fcntl_locks, num_flock_locks;
-
-		lock_kernel();
-		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
-		rec.v2.flock_len = (2*sizeof(u32) +
-				    (num_fcntl_locks+num_flock_locks) *
-				    sizeof(struct ceph_filelock));
-
-		err = ceph_pagelist_append(pagelist, &rec, reclen);
-		if (!err)
-			err = ceph_encode_locks(inode, pagelist,
-						num_fcntl_locks,
-						num_flock_locks);
-		unlock_kernel();
+		struct ceph_pagelist_cursor trunc_point;
+
+		ceph_pagelist_set_cursor(pagelist, &trunc_point);
+		do {
+			lock_flocks();
+			ceph_count_locks(inode, &num_fcntl_locks,
+					 &num_flock_locks);
+			rec.v2.flock_len = (2*sizeof(u32) +
+					    (num_fcntl_locks+num_flock_locks) *
+					    sizeof(struct ceph_filelock));
+			unlock_flocks();
+
+			/* pre-alloc pagelist */
+			ceph_pagelist_truncate(pagelist, &trunc_point);
+			err = ceph_pagelist_append(pagelist, &rec, reclen);
+			if (!err)
+				err = ceph_pagelist_reserve(pagelist,
+							    rec.v2.flock_len);
+
+			/* encode locks */
+			if (!err) {
+				lock_flocks();
+				err = ceph_encode_locks(inode,
+							pagelist,
+							num_fcntl_locks,
+							num_flock_locks);
+				unlock_flocks();
+			}
+		} while (err == -ENOSPC);
 	} else {
 		err = ceph_pagelist_append(pagelist, &rec, reclen);
 	}
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 			 struct ceph_mds_session *session,
 			 struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_inode_info *ci;
 	struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
 	schedule_delayed(mdsc);
 }
 
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
 
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 {
-	mdsc->client = client;
+	struct ceph_mds_client *mdsc;
+
+	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+	if (!mdsc)
+		return -ENOMEM;
+	mdsc->fsc = fsc;
+	fsc->mdsc = mdsc;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
 	if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	INIT_LIST_HEAD(&mdsc->dentry_lru);
 
 	ceph_caps_init(mdsc);
-	ceph_adjust_min_caps(mdsc, client->min_caps);
+	ceph_adjust_min_caps(mdsc, fsc->min_caps);
 
 	return 0;
 }
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_request *req;
-	struct ceph_client *client = mdsc->client;
+	struct ceph_fs_client *fsc = mdsc->fsc;
 
 	mutex_lock(&mdsc->mutex);
 	if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    client->mount_args->mount_timeout * HZ);
+				    fsc->client->options->mount_timeout * HZ);
 
 		/* tear down remaining requests */
 		mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush;
 
-	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return;
 
 	dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
 	int i, n = 0;
 
-	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return true;
 
 	mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_session *session;
 	int i;
-	struct ceph_client *client = mdsc->client;
-	unsigned long timeout = client->mount_args->mount_timeout * HZ;
+	struct ceph_fs_client *fsc = mdsc->fsc;
+	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	dout("stopped\n");
 }
 
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 	ceph_caps_finalize(mdsc);
 }
 
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	ceph_mdsc_stop(mdsc);
+	fsc->mdsc = NULL;
+	kfree(mdsc);
+}
+
 
 /*
  * handle mds map update.
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
 		return;
 	epoch = ceph_decode_32(&p);
 	maplen = ceph_decode_32(&p);
 	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
 
 	/* do we need it? */
-	ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+	ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
 	mutex_lock(&mdsc->mutex);
 	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
 		dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	} else {
 		mdsc->mdsmap = newmap;  /* first mds map */
 	}
-	mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+	mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
 
 	__wake_requests(mdsc, &mdsc->waiting_for_map);
 
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 	int ret = 0;
 
 	if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
 }
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	if (ac->ops->invalidate_authorizer)
 		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
 
-	return ceph_monc_validate_auth(&mdsc->client->monc);
+	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
 }
 
 static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.peer_reset = peer_reset,
 };
 
-
-
-
 /* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2..d66d63c7235 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
 
 /*
  * Some lock dependencies:
@@ -26,7 +26,7 @@
  *
  */
 
-struct ceph_client;
+struct ceph_fs_client;
 struct ceph_cap;
 
 /*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
  * mds client state
  */
 struct ceph_mds_client {
-	struct ceph_client      *client;
+	struct ceph_fs_client  *fsc;
 	struct mutex            mutex;         /* all nested structures */
 
 	struct ceph_mdsmap      *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
 	int		caps_avail_count;    /* unused, unreserved */
 	int		caps_min_count;      /* keep at least this many
 						(unreserved) */
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry 	  *debugfs_file;
-#endif
-
 	spinlock_t	  dentry_lru_lock;
 	struct list_head  dentry_lru;
 	int		  num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 			     struct ceph_msg *msg, int mds);
 
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
-			   struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150..73b7d44e8a3 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/bug.h>
 #include <linux/err.h>
@@ -6,9 +6,9 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
 
 #include "super.h"
 
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		}
 
 		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-		     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+		     i+1, n, global_id, mds, inc,
+		     ceph_pr_addr(&addr.in_addr),
 		     ceph_mds_state_name(state));
 		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
 			m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce..00000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-
-#include "pagelist.h"
-
-static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
-{
-	struct page *page = list_entry(pl->head.prev, struct page,
-				       lru);
-	kunmap(page);
-}
-
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
-	if (pl->mapped_tail)
-		ceph_pagelist_unmap_tail(pl);
-
-	while (!list_empty(&pl->head)) {
-		struct page *page = list_first_entry(&pl->head, struct page,
-						     lru);
-		list_del(&page->lru);
-		__free_page(page);
-	}
-	return 0;
-}
-
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
-	struct page *page = __page_cache_alloc(GFP_NOFS);
-	if (!page)
-		return -ENOMEM;
-	pl->room += PAGE_SIZE;
-	list_add_tail(&page->lru, &pl->head);
-	if (pl->mapped_tail)
-		ceph_pagelist_unmap_tail(pl);
-	pl->mapped_tail = kmap(page);
-	return 0;
-}
-
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
-{
-	while (pl->room < len) {
-		size_t bit = pl->room;
-		int ret;
-
-		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
-		       buf, bit);
-		pl->length += bit;
-		pl->room -= bit;
-		buf += bit;
-		len -= bit;
-		ret = ceph_pagelist_addpage(pl);
-		if (ret)
-			return ret;
-	}
-
-	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
-	pl->length += len;
-	pl->room -= len;
-	return 0;
-}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2..39c243acd06 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/sort.h>
 #include <linux/slab.h>
 
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
 
 /*
  * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 			    struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		      struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	int mds = session->s_mds;
 	u64 split;
 	int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a..cd5097d7c80 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
 /*
- * Ceph string constants
+ * Ceph fs string constants
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
-const char *ceph_entity_type_name(int type)
-{
-	switch (type) {
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_AUTH: return "auth";
-	default: return "unknown";
-	}
-}
-
-const char *ceph_osd_op_name(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_READ: return "read";
-	case CEPH_OSD_OP_STAT: return "stat";
-
-	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-	case CEPH_OSD_OP_WRITE: return "write";
-	case CEPH_OSD_OP_DELETE: return "delete";
-	case CEPH_OSD_OP_TRUNCATE: return "truncate";
-	case CEPH_OSD_OP_ZERO: return "zero";
-	case CEPH_OSD_OP_WRITEFULL: return "writefull";
-	case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
-	case CEPH_OSD_OP_APPEND: return "append";
-	case CEPH_OSD_OP_STARTSYNC: return "startsync";
-	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-	case CEPH_OSD_OP_TMAPUP: return "tmapup";
-	case CEPH_OSD_OP_TMAPGET: return "tmapget";
-	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-	case CEPH_OSD_OP_GETXATTR: return "getxattr";
-	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-	case CEPH_OSD_OP_SETXATTR: return "setxattr";
-	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-	case CEPH_OSD_OP_PULL: return "pull";
-	case CEPH_OSD_OP_PUSH: return "push";
-	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-	case CEPH_OSD_OP_SCRUB: return "scrub";
-
-	case CEPH_OSD_OP_WRLOCK: return "wrlock";
-	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-	case CEPH_OSD_OP_RDLOCK: return "rdlock";
-	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-	case CEPH_OSD_OP_UPLOCK: return "uplock";
-	case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-	case CEPH_OSD_OP_CALL: return "call";
-
-	case CEPH_OSD_OP_PGLS: return "pgls";
-	}
-	return "???";
-}
 
 const char *ceph_mds_state_name(int s)
 {
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
-
-const char *ceph_pool_op_name(int op)
-{
-	switch (op) {
-	case POOL_OP_CREATE: return "create";
-	case POOL_OP_DELETE: return "delete";
-	case POOL_OP_AUID_CHANGE: return "auid change";
-	case POOL_OP_CREATE_SNAP: return "create snap";
-	case POOL_OP_DELETE_SNAP: return "delete snap";
-	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-	}
-	return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b..d6e0e042189 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/ctype.h>
@@ -15,10 +15,13 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 
-#include "decode.h"
 #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * Ceph superblock operations
@@ -26,36 +29,22 @@
  * Handle the basics of mounting, unmounting.
  */
 
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-	const char *e = s + len;
-
-	while (e != s && *(e-1) != '/')
-		e--;
-	return e;
-}
-
-
 /*
  * super ops
  */
 static void ceph_put_super(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 
 	dout("put_super\n");
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 
 	/*
 	 * ensure we release the bdi before put_anon_super releases
 	 * the device name.
 	 */
-	if (s->s_bdi == &client->backing_dev_info) {
-		bdi_unregister(&client->backing_dev_info);
+	if (s->s_bdi == &fsc->backing_dev_info) {
+		bdi_unregister(&fsc->backing_dev_info);
 		s->s_bdi = NULL;
 	}
 
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
 
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-	struct ceph_monmap *monmap = client->monc.monmap;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+	struct ceph_monmap *monmap = fsc->client->monc.monmap;
 	struct ceph_statfs st;
 	u64 fsid;
 	int err;
 
 	dout("statfs\n");
-	err = ceph_monc_do_statfs(&client->monc, &st);
+	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
 	if (err < 0)
 		return err;
 
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int ceph_sync_fs(struct super_block *sb, int wait)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 
 	if (!wait) {
 		dout("sync_fs (non-blocking)\n");
-		ceph_flush_dirty_caps(&client->mdsc);
+		ceph_flush_dirty_caps(fsc->mdsc);
 		dout("sync_fs (non-blocking) done\n");
 		return 0;
 	}
 
 	dout("sync_fs (blocking)\n");
-	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+	ceph_osdc_sync(&fsc->client->osdc);
+	ceph_mdsc_sync(fsc->mdsc);
 	dout("sync_fs (blocking) done\n");
 	return 0;
 }
 
-static int default_congestion_kb(void)
-{
-	int congestion_kb;
-
-	/*
-	 * Copied from NFS
-	 *
-	 * congestion size, scale with available memory.
-	 *
-	 *  64MB:    8192k
-	 * 128MB:   11585k
-	 * 256MB:   16384k
-	 * 512MB:   23170k
-	 *   1GB:   32768k
-	 *   2GB:   46340k
-	 *   4GB:   65536k
-	 *   8GB:   92681k
-	 *  16GB:  131072k
-	 *
-	 * This allows larger machines to have larger/more transfers.
-	 * Limit the default to 256M
-	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-	if (congestion_kb > 256*1024)
-		congestion_kb = 256*1024;
-
-	return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = client->mount_args;
-
-	if (args->flags & CEPH_OPT_FSID)
-		seq_printf(m, ",fsid=%pU", &args->fsid);
-	if (args->flags & CEPH_OPT_NOSHARE)
-		seq_puts(m, ",noshare");
-	if (args->flags & CEPH_OPT_DIRSTAT)
-		seq_puts(m, ",dirstat");
-	if ((args->flags & CEPH_OPT_RBYTES) == 0)
-		seq_puts(m, ",norbytes");
-	if (args->flags & CEPH_OPT_NOCRC)
-		seq_puts(m, ",nocrc");
-	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-		seq_puts(m, ",noasyncreaddir");
-
-	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-		seq_printf(m, ",osdkeepalivetimeout=%d",
-			 args->osd_keepalive_timeout);
-	if (args->wsize)
-		seq_printf(m, ",wsize=%d", args->wsize);
-	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-		seq_printf(m, ",rsize=%d", args->rsize);
-	if (args->congestion_kb != default_congestion_kb())
-		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_min=%d",
-			 args->caps_wanted_delay_min);
-	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_max=%d",
-			   args->caps_wanted_delay_max);
-	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-		seq_printf(m, ",cap_release_safety=%d",
-			   args->cap_release_safety);
-	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-	if (args->name)
-		seq_printf(m, ",name=%s", args->name);
-	if (args->secret)
-		seq_puts(m, ",secret=<hidden>");
-	return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-	struct ceph_inode_info *ci = foo;
-	inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
-	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-				      sizeof(struct ceph_inode_info),
-				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
-	if (ceph_inode_cachep == NULL)
-		return -ENOMEM;
-
-	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_cap_cachep == NULL)
-		goto bad_cap;
-
-	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_dentry_cachep == NULL)
-		goto bad_dentry;
-
-	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_file_cachep == NULL)
-		goto bad_file;
-
-	return 0;
-
-bad_file:
-	kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-	kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-	kmem_cache_destroy(ceph_inode_cachep);
-	return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
-	kmem_cache_destroy(ceph_inode_cachep);
-	kmem_cache_destroy(ceph_cap_cachep);
-	kmem_cache_destroy(ceph_dentry_cachep);
-	kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount.  Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-	struct ceph_client *client = ceph_sb_to_client(sb);
-
-	dout("ceph_umount_begin - starting forced umount\n");
-	if (!client)
-		return;
-	client->mount_state = CEPH_MOUNT_SHUTDOWN;
-	return;
-}
-
-static const struct super_operations ceph_super_ops = {
-	.alloc_inode	= ceph_alloc_inode,
-	.destroy_inode	= ceph_destroy_inode,
-	.write_inode    = ceph_write_inode,
-	.sync_fs        = ceph_sync_fs,
-	.put_super	= ceph_put_super,
-	.show_options   = ceph_show_options,
-	.statfs		= ceph_statfs,
-	.umount_begin   = ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
-	switch (type) {
-	case CEPH_MSG_SHUTDOWN: return "shutdown";
-	case CEPH_MSG_PING: return "ping";
-	case CEPH_MSG_AUTH: return "auth";
-	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-	case CEPH_MSG_MON_MAP: return "mon_map";
-	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-	case CEPH_MSG_STATFS: return "statfs";
-	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-	case CEPH_MSG_MDS_MAP: return "mds_map";
-	case CEPH_MSG_CLIENT_SESSION: return "client_session";
-	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-	case CEPH_MSG_OSD_MAP: return "osd_map";
-	case CEPH_MSG_OSD_OP: return "osd_op";
-	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-	default: return "unknown";
-	}
-}
-
-
 /*
  * mount options
  */
 enum {
 	Opt_wsize,
 	Opt_rsize,
-	Opt_osdtimeout,
-	Opt_osdkeepalivetimeout,
-	Opt_mount_timeout,
-	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
 	Opt_congestion_kb,
 	Opt_last_int,
 	/* int args above */
-	Opt_fsid,
 	Opt_snapdirname,
-	Opt_name,
-	Opt_secret,
 	Opt_last_string,
 	/* string args above */
-	Opt_ip,
-	Opt_noshare,
 	Opt_dirstat,
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
-	Opt_nocrc,
 	Opt_noasyncreaddir,
 };
 
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
 	{Opt_wsize, "wsize=%d"},
 	{Opt_rsize, "rsize=%d"},
-	{Opt_osdtimeout, "osdtimeout=%d"},
-	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-	{Opt_mount_timeout, "mount_timeout=%d"},
-	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
 	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
-	{Opt_fsid, "fsid=%s"},
 	{Opt_snapdirname, "snapdirname=%s"},
-	{Opt_name, "name=%s"},
-	{Opt_secret, "secret=%s"},
 	/* string args above */
-	{Opt_ip, "ip=%s"},
-	{Opt_noshare, "noshare"},
 	{Opt_dirstat, "dirstat"},
 	{Opt_nodirstat, "nodirstat"},
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
-	{Opt_nocrc, "nocrc"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
 	{-1, NULL}
 };
 
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+static int parse_fsopt_token(char *c, void *private)
 {
-	int i = 0;
-	char tmp[3];
-	int err = -EINVAL;
-	int d;
-
-	dout("parse_fsid '%s'\n", str);
-	tmp[2] = 0;
-	while (*str && i < 16) {
-		if (ispunct(*str)) {
-			str++;
-			continue;
+	struct ceph_mount_options *fsopt = private;
+	substring_t argstr[MAX_OPT_ARGS];
+	int token, intval, ret;
+
+	token = match_token((char *)c, fsopt_tokens, argstr);
+	if (token < 0)
+		return -EINVAL;
+
+	if (token < Opt_last_int) {
+		ret = match_int(&argstr[0], &intval);
+		if (ret < 0) {
+			pr_err("bad mount option arg (not int) "
+			       "at '%s'\n", c);
+			return ret;
 		}
-		if (!isxdigit(str[0]) || !isxdigit(str[1]))
-			break;
-		tmp[0] = str[0];
-		tmp[1] = str[1];
-		if (sscanf(tmp, "%x", &d) < 1)
-			break;
-		fsid->fsid[i] = d & 0xff;
-		i++;
-		str += 2;
+		dout("got int token %d val %d\n", token, intval);
+	} else if (token > Opt_last_int && token < Opt_last_string) {
+		dout("got string token %d val %s\n", token,
+		     argstr[0].from);
+	} else {
+		dout("got token %d\n", token);
 	}
 
-	if (i == 16)
-		err = 0;
-	dout("parse_fsid ret %d got fsid %pU", err, fsid);
-	return err;
+	switch (token) {
+	case Opt_snapdirname:
+		kfree(fsopt->snapdir_name);
+		fsopt->snapdir_name = kstrndup(argstr[0].from,
+					       argstr[0].to-argstr[0].from,
+					       GFP_KERNEL);
+		if (!fsopt->snapdir_name)
+			return -ENOMEM;
+		break;
+
+		/* misc */
+	case Opt_wsize:
+		fsopt->wsize = intval;
+		break;
+	case Opt_rsize:
+		fsopt->rsize = intval;
+		break;
+	case Opt_caps_wanted_delay_min:
+		fsopt->caps_wanted_delay_min = intval;
+		break;
+	case Opt_caps_wanted_delay_max:
+		fsopt->caps_wanted_delay_max = intval;
+		break;
+	case Opt_readdir_max_entries:
+		fsopt->max_readdir = intval;
+		break;
+	case Opt_readdir_max_bytes:
+		fsopt->max_readdir_bytes = intval;
+		break;
+	case Opt_congestion_kb:
+		fsopt->congestion_kb = intval;
+		break;
+	case Opt_dirstat:
+		fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_nodirstat:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_rbytes:
+		fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_norbytes:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_noasyncreaddir:
+		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
+	default:
+		BUG_ON(token);
+	}
+	return 0;
 }
 
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
-						const char *dev_name,
-						const char **path)
+static void destroy_mount_options(struct ceph_mount_options *args)
 {
-	struct ceph_mount_args *args;
-	const char *c;
-	int err = -ENOMEM;
-	substring_t argstr[MAX_OPT_ARGS];
+	dout("destroy_mount_options %p\n", args);
+	kfree(args->snapdir_name);
+	kfree(args);
+}
 
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return ERR_PTR(-ENOMEM);
-	args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
-				 GFP_KERNEL);
-	if (!args->mon_addr)
-		goto out;
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
 
-	dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
-
-	/* start with defaults */
-	args->sb_flags = flags;
-	args->flags = CEPH_OPT_DEFAULT;
-	args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
-	args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-	args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
-	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
-	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
-	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
-	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-	args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
-	args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
-	args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-	args->congestion_kb = default_congestion_kb();
-
-	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
-	err = -EINVAL;
-	if (!dev_name)
-		goto out;
-	*path = strstr(dev_name, ":/");
-	if (*path == NULL) {
-		pr_err("device name is missing path (no :/ in %s)\n",
-		       dev_name);
-		goto out;
-	}
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+				 struct ceph_options *new_opt,
+				 struct ceph_fs_client *fsc)
+{
+	struct ceph_mount_options *fsopt1 = new_fsopt;
+	struct ceph_mount_options *fsopt2 = fsc->mount_options;
+	int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+	int ret;
 
-	/* get mon ip(s) */
-	err = ceph_parse_ips(dev_name, *path, args->mon_addr,
-			     CEPH_MAX_MON, &args->num_mon);
-	if (err < 0)
-		goto out;
+	ret = memcmp(fsopt1, fsopt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+	if (ret)
+		return ret;
+
+	return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+			       struct ceph_options **popt,
+			       int flags, char *options,
+			       const char *dev_name,
+			       const char **path)
+{
+	struct ceph_mount_options *fsopt;
+	const char *dev_name_end;
+	int err = -ENOMEM;
+
+	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+	if (!fsopt)
+		return -ENOMEM;
+
+	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+        fsopt->sb_flags = flags;
+        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+        fsopt->congestion_kb = default_congestion_kb();
+	
+        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+        err = -EINVAL;
+        if (!dev_name)
+                goto out;
+        *path = strstr(dev_name, ":/");
+        if (*path == NULL) {
+                pr_err("device name is missing path (no :/ in %s)\n",
+                       dev_name);
+                goto out;
+        }
+	dev_name_end = *path;
+	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 
 	/* path on server */
 	*path += 2;
 	dout("server path '%s'\n", *path);
 
-	/* parse mount options */
-	while ((c = strsep(&options, ",")) != NULL) {
-		int token, intval, ret;
-		if (!*c)
-			continue;
-		err = -EINVAL;
-		token = match_token((char *)c, arg_tokens, argstr);
-		if (token < 0) {
-			pr_err("bad mount option at '%s'\n", c);
-			goto out;
-		}
-		if (token < Opt_last_int) {
-			ret = match_int(&argstr[0], &intval);
-			if (ret < 0) {
-				pr_err("bad mount option arg (not int) "
-				       "at '%s'\n", c);
-				continue;
-			}
-			dout("got int token %d val %d\n", token, intval);
-		} else if (token > Opt_last_int && token < Opt_last_string) {
-			dout("got string token %d val %s\n", token,
-			     argstr[0].from);
-		} else {
-			dout("got token %d\n", token);
-		}
-		switch (token) {
-		case Opt_ip:
-			err = ceph_parse_ips(argstr[0].from,
-					     argstr[0].to,
-					     &args->my_addr,
-					     1, NULL);
-			if (err < 0)
-				goto out;
-			args->flags |= CEPH_OPT_MYIP;
-			break;
-
-		case Opt_fsid:
-			err = parse_fsid(argstr[0].from, &args->fsid);
-			if (err == 0)
-				args->flags |= CEPH_OPT_FSID;
-			break;
-		case Opt_snapdirname:
-			kfree(args->snapdir_name);
-			args->snapdir_name = kstrndup(argstr[0].from,
-					      argstr[0].to-argstr[0].from,
-					      GFP_KERNEL);
-			break;
-		case Opt_name:
-			args->name = kstrndup(argstr[0].from,
-					      argstr[0].to-argstr[0].from,
-					      GFP_KERNEL);
-			break;
-		case Opt_secret:
-			args->secret = kstrndup(argstr[0].from,
-						argstr[0].to-argstr[0].from,
-						GFP_KERNEL);
-			break;
-
-			/* misc */
-		case Opt_wsize:
-			args->wsize = intval;
-			break;
-		case Opt_rsize:
-			args->rsize = intval;
-			break;
-		case Opt_osdtimeout:
-			args->osd_timeout = intval;
-			break;
-		case Opt_osdkeepalivetimeout:
-			args->osd_keepalive_timeout = intval;
-			break;
-		case Opt_osd_idle_ttl:
-			args->osd_idle_ttl = intval;
-			break;
-		case Opt_mount_timeout:
-			args->mount_timeout = intval;
-			break;
-		case Opt_caps_wanted_delay_min:
-			args->caps_wanted_delay_min = intval;
-			break;
-		case Opt_caps_wanted_delay_max:
-			args->caps_wanted_delay_max = intval;
-			break;
-		case Opt_readdir_max_entries:
-			args->max_readdir = intval;
-			break;
-		case Opt_readdir_max_bytes:
-			args->max_readdir_bytes = intval;
-			break;
-		case Opt_congestion_kb:
-			args->congestion_kb = intval;
-			break;
-
-		case Opt_noshare:
-			args->flags |= CEPH_OPT_NOSHARE;
-			break;
-
-		case Opt_dirstat:
-			args->flags |= CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_nodirstat:
-			args->flags &= ~CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_rbytes:
-			args->flags |= CEPH_OPT_RBYTES;
-			break;
-		case Opt_norbytes:
-			args->flags &= ~CEPH_OPT_RBYTES;
-			break;
-		case Opt_nocrc:
-			args->flags |= CEPH_OPT_NOCRC;
-			break;
-		case Opt_noasyncreaddir:
-			args->flags |= CEPH_OPT_NOASYNCREADDIR;
-			break;
-
-		default:
-			BUG_ON(token);
-		}
-	}
-	return args;
+	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+				 parse_fsopt_token, (void *)fsopt);
+	if (err)
+		goto out;
+
+	/* success */
+	*pfsopt = fsopt;
+	return 0;
 
 out:
-	kfree(args->mon_addr);
-	kfree(args);
-	return ERR_PTR(err);
+	destroy_mount_options(fsopt);
+	return err;
 }
 
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 {
-	dout("destroy_mount_args %p\n", args);
-	kfree(args->snapdir_name);
-	args->snapdir_name = NULL;
-	kfree(args->name);
-	args->name = NULL;
-	kfree(args->secret);
-	args->secret = NULL;
-	kfree(args);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+	struct ceph_mount_options *fsopt = fsc->mount_options;
+	struct ceph_options *opt = fsc->client->options;
+
+	if (opt->flags & CEPH_OPT_FSID)
+		seq_printf(m, ",fsid=%pU", &opt->fsid);
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, ",noshare");
+	if (opt->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, ",nocrc");
+
+	if (opt->name)
+		seq_printf(m, ",name=%s", opt->name);
+	if (opt->secret)
+		seq_puts(m, ",secret=<hidden>");
+
+	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, ",osdkeepalivetimeout=%d",
+			   opt->osd_keepalive_timeout);
+
+	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+		seq_puts(m, ",dirstat");
+	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+		seq_puts(m, ",norbytes");
+	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+		seq_puts(m, ",noasyncreaddir");
+
+	if (fsopt->wsize)
+		seq_printf(m, ",wsize=%d", fsopt->wsize);
+	if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+		seq_printf(m, ",rsize=%d", fsopt->rsize);
+	if (fsopt->congestion_kb != default_congestion_kb())
+		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_min=%d",
+			 fsopt->caps_wanted_delay_min);
+	if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_max=%d",
+			   fsopt->caps_wanted_delay_max);
+	if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+		seq_printf(m, ",cap_release_safety=%d",
+			   fsopt->cap_release_safety);
+	if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+		seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+	return 0;
 }
 
 /*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
  */
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 {
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc = client->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	switch (type) {
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(fsc->mdsc, msg);
+		return 0;
+
+	default:
+		return -1;
+	}
+}
+
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+					struct ceph_options *opt)
+{
+	struct ceph_fs_client *fsc;
 	int err = -ENOMEM;
 
-	client = kzalloc(sizeof(*client), GFP_KERNEL);
-	if (client == NULL)
+	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+	if (!fsc)
 		return ERR_PTR(-ENOMEM);
 
-	mutex_init(&client->mount_mutex);
-
-	init_waitqueue_head(&client->auth_wq);
+	fsc->client = ceph_create_client(opt, fsc);
+	if (IS_ERR(fsc->client)) {
+		err = PTR_ERR(fsc->client);
+		goto fail;
+	}
+	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+	fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+	fsc->client->monc.want_mdsmap = 1;
 
-	client->sb = NULL;
-	client->mount_state = CEPH_MOUNT_MOUNTING;
-	client->mount_args = args;
+	fsc->mount_options = fsopt;
 
-	client->msgr = NULL;
+	fsc->sb = NULL;
+	fsc->mount_state = CEPH_MOUNT_MOUNTING;
 
-	client->auth_err = 0;
-	atomic_long_set(&client->writeback_count, 0);
+	atomic_long_set(&fsc->writeback_count, 0);
 
-	err = bdi_init(&client->backing_dev_info);
+	err = bdi_init(&fsc->backing_dev_info);
 	if (err < 0)
-		goto fail;
+		goto fail_client;
 
 	err = -ENOMEM;
-	client->wb_wq = create_workqueue("ceph-writeback");
-	if (client->wb_wq == NULL)
+	fsc->wb_wq = create_workqueue("ceph-writeback");
+	if (fsc->wb_wq == NULL)
 		goto fail_bdi;
-	client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
-	if (client->pg_inv_wq == NULL)
+	fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+	if (fsc->pg_inv_wq == NULL)
 		goto fail_wb_wq;
-	client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
-	if (client->trunc_wq == NULL)
+	fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+	if (fsc->trunc_wq == NULL)
 		goto fail_pg_inv_wq;
 
 	/* set up mempools */
 	err = -ENOMEM;
-	client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-			      client->mount_args->wsize >> PAGE_CACHE_SHIFT);
-	if (!client->wb_pagevec_pool)
+	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+	if (!fsc->wb_pagevec_pool)
 		goto fail_trunc_wq;
 
 	/* caps */
-	client->min_caps = args->max_readdir;
+	fsc->min_caps = fsopt->max_readdir;
+
+	return fsc;
 
-	/* subsystems */
-	err = ceph_monc_init(&client->monc, client);
-	if (err < 0)
-		goto fail_mempool;
-	err = ceph_osdc_init(&client->osdc, client);
-	if (err < 0)
-		goto fail_monc;
-	err = ceph_mdsc_init(&client->mdsc, client);
-	if (err < 0)
-		goto fail_osdc;
-	return client;
-
-fail_osdc:
-	ceph_osdc_stop(&client->osdc);
-fail_monc:
-	ceph_monc_stop(&client->monc);
-fail_mempool:
-	mempool_destroy(client->wb_pagevec_pool);
 fail_trunc_wq:
-	destroy_workqueue(client->trunc_wq);
+	destroy_workqueue(fsc->trunc_wq);
 fail_pg_inv_wq:
-	destroy_workqueue(client->pg_inv_wq);
+	destroy_workqueue(fsc->pg_inv_wq);
 fail_wb_wq:
-	destroy_workqueue(client->wb_wq);
+	destroy_workqueue(fsc->wb_wq);
 fail_bdi:
-	bdi_destroy(&client->backing_dev_info);
+	bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+	ceph_destroy_client(fsc->client);
 fail:
-	kfree(client);
+	kfree(fsc);
 	return ERR_PTR(err);
 }
 
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
 {
-	dout("destroy_client %p\n", client);
+	dout("destroy_fs_client %p\n", fsc);
 
-	/* unmount */
-	ceph_mdsc_stop(&client->mdsc);
-	ceph_osdc_stop(&client->osdc);
+	destroy_workqueue(fsc->wb_wq);
+	destroy_workqueue(fsc->pg_inv_wq);
+	destroy_workqueue(fsc->trunc_wq);
 
-	/*
-	 * make sure mds and osd connections close out before destroying
-	 * the auth module, which is needed to free those connections'
-	 * ceph_authorizers.
-	 */
-	ceph_msgr_flush();
-
-	ceph_monc_stop(&client->monc);
+	bdi_destroy(&fsc->backing_dev_info);
 
-	ceph_debugfs_client_cleanup(client);
-	destroy_workqueue(client->wb_wq);
-	destroy_workqueue(client->pg_inv_wq);
-	destroy_workqueue(client->trunc_wq);
+	mempool_destroy(fsc->wb_pagevec_pool);
 
-	bdi_destroy(&client->backing_dev_info);
+	destroy_mount_options(fsc->mount_options);
 
-	if (client->msgr)
-		ceph_messenger_destroy(client->msgr);
-	mempool_destroy(client->wb_pagevec_pool);
+	ceph_fs_debugfs_cleanup(fsc);
 
-	destroy_mount_args(client->mount_args);
+	ceph_destroy_client(fsc->client);
 
-	kfree(client);
-	dout("destroy_client %p done\n", client);
+	kfree(fsc);
+	dout("destroy_fs_client %p done\n", fsc);
 }
 
 /*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
  */
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
 {
-	if (client->have_fsid) {
-		if (ceph_fsid_compare(&client->fsid, fsid)) {
-			pr_err("bad fsid, had %pU got %pU",
-			       &client->fsid, fsid);
-			return -1;
-		}
-	} else {
-		pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
-			fsid);
-		memcpy(&client->fsid, fsid, sizeof(*fsid));
-		ceph_debugfs_client_init(client);
-		client->have_fsid = true;
-	}
+	struct ceph_inode_info *ci = foo;
+	inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+				      sizeof(struct ceph_inode_info),
+				      __alignof__(struct ceph_inode_info),
+				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				      ceph_inode_init_once);
+	if (ceph_inode_cachep == NULL)
+		return -ENOMEM;
+
+	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_cachep == NULL)
+		goto bad_cap;
+
+	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_dentry_cachep == NULL)
+		goto bad_dentry;
+
+	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_file_cachep == NULL)
+		goto bad_file;
+
 	return 0;
+
+bad_file:
+	kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+	kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+	kmem_cache_destroy(ceph_inode_cachep);
+	return -ENOMEM;
 }
 
+static void destroy_caches(void)
+{
+	kmem_cache_destroy(ceph_inode_cachep);
+	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_dentry_cachep);
+	kmem_cache_destroy(ceph_file_cachep);
+}
+
+
 /*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount.  Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
  */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
 {
-	return client->monc.monmap && client->monc.monmap->epoch &&
-	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+	dout("ceph_umount_begin - starting forced umount\n");
+	if (!fsc)
+		return;
+	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+	return;
 }
 
+static const struct super_operations ceph_super_ops = {
+	.alloc_inode	= ceph_alloc_inode,
+	.destroy_inode	= ceph_destroy_inode,
+	.write_inode    = ceph_write_inode,
+	.sync_fs        = ceph_sync_fs,
+	.put_super	= ceph_put_super,
+	.show_options   = ceph_show_options,
+	.statfs		= ceph_statfs,
+	.umount_begin   = ceph_umount_begin,
+};
+
 /*
  * Bootstrap mount by opening the root directory.  Note the mount
  * @started time from caller, and time out if this takes too long.
  */
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 				       const char *path,
 				       unsigned long started)
 {
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req = NULL;
 	int err;
 	struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = client->mount_args->mount_timeout * HZ;
+	req->r_timeout = fsc->client->options->mount_timeout * HZ;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
 	if (err == 0) {
 		dout("open_root_inode success\n");
 		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
-		    client->sb->s_root == NULL)
+		    fsc->sb->s_root == NULL)
 			root = d_alloc_root(req->r_target_inode);
 		else
 			root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
 	return root;
 }
 
+
+
+
 /*
  * mount: join the ceph cluster, and open root directory.
  */
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
 		      const char *path)
 {
-	struct ceph_entity_addr *myaddr = NULL;
 	int err;
-	unsigned long timeout = client->mount_args->mount_timeout * HZ;
 	unsigned long started = jiffies;  /* note the start time */
 	struct dentry *root;
+	int first = 0;   /* first vfsmount for this super_block */
 
 	dout("mount start\n");
-	mutex_lock(&client->mount_mutex);
-
-	/* initialize the messenger */
-	if (client->msgr == NULL) {
-		if (ceph_test_opt(client, MYIP))
-			myaddr = &client->mount_args->my_addr;
-		client->msgr = ceph_messenger_create(myaddr);
-		if (IS_ERR(client->msgr)) {
-			err = PTR_ERR(client->msgr);
-			client->msgr = NULL;
-			goto out;
-		}
-		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
-	}
+	mutex_lock(&fsc->client->mount_mutex);
 
-	/* open session, and wait for mon, mds, and osd maps */
-	err = ceph_monc_open_session(&client->monc);
+	err = __ceph_open_session(fsc->client, started);
 	if (err < 0)
 		goto out;
 
-	while (!have_mon_and_osd_map(client)) {
-		err = -EIO;
-		if (timeout && time_after_eq(jiffies, started + timeout))
-			goto out;
-
-		/* wait */
-		dout("mount waiting for mon_map\n");
-		err = wait_event_interruptible_timeout(client->auth_wq,
-		       have_mon_and_osd_map(client) || (client->auth_err < 0),
-		       timeout);
-		if (err == -EINTR || err == -ERESTARTSYS)
-			goto out;
-		if (client->auth_err < 0) {
-			err = client->auth_err;
-			goto out;
-		}
-	}
-
 	dout("mount opening root\n");
-	root = open_root_dentry(client, "", started);
+	root = open_root_dentry(fsc, "", started);
 	if (IS_ERR(root)) {
 		err = PTR_ERR(root);
 		goto out;
 	}
-	if (client->sb->s_root)
+	if (fsc->sb->s_root) {
 		dput(root);
-	else
-		client->sb->s_root = root;
+	} else {
+		fsc->sb->s_root = root;
+		first = 1;
+
+		err = ceph_fs_debugfs_init(fsc);
+		if (err < 0)
+			goto fail;
+	}
 
 	if (path[0] == 0) {
 		dget(root);
 	} else {
 		dout("mount opening base mountpoint\n");
-		root = open_root_dentry(client, path, started);
+		root = open_root_dentry(fsc, path, started);
 		if (IS_ERR(root)) {
 			err = PTR_ERR(root);
-			dput(client->sb->s_root);
-			client->sb->s_root = NULL;
-			goto out;
+			goto fail;
 		}
 	}
 
 	mnt->mnt_root = root;
-	mnt->mnt_sb = client->sb;
+	mnt->mnt_sb = fsc->sb;
 
-	client->mount_state = CEPH_MOUNT_MOUNTED;
+	fsc->mount_state = CEPH_MOUNT_MOUNTED;
 	dout("mount success\n");
 	err = 0;
 
 out:
-	mutex_unlock(&client->mount_mutex);
+	mutex_unlock(&fsc->client->mount_mutex);
 	return err;
+
+fail:
+	if (first) {
+		dput(fsc->sb->s_root);
+		fsc->sb->s_root = NULL;
+	}
+	goto out;
 }
 
 static int ceph_set_super(struct super_block *s, void *data)
 {
-	struct ceph_client *client = data;
+	struct ceph_fs_client *fsc = data;
 	int ret;
 
 	dout("set_super %p data %p\n", s, data);
 
-	s->s_flags = client->mount_args->sb_flags;
+	s->s_flags = fsc->mount_options->sb_flags;
 	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 
-	s->s_fs_info = client;
-	client->sb = s;
+	s->s_fs_info = fsc;
+	fsc->sb = s;
 
 	s->s_op = &ceph_super_ops;
 	s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
 
 fail:
 	s->s_fs_info = NULL;
-	client->sb = NULL;
+	fsc->sb = NULL;
 	return ret;
 }
 
@@ -926,30 +732,23 @@ fail:
  */
 static int ceph_compare_super(struct super_block *sb, void *data)
 {
-	struct ceph_client *new = data;
-	struct ceph_mount_args *args = new->mount_args;
-	struct ceph_client *other = ceph_sb_to_client(sb);
-	int i;
+	struct ceph_fs_client *new = data;
+	struct ceph_mount_options *fsopt = new->mount_options;
+	struct ceph_options *opt = new->client->options;
+	struct ceph_fs_client *other = ceph_sb_to_client(sb);
 
 	dout("ceph_compare_super %p\n", sb);
-	if (args->flags & CEPH_OPT_FSID) {
-		if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
-			dout("fsid doesn't match\n");
-			return 0;
-		}
-	} else {
-		/* do we share (a) monitor? */
-		for (i = 0; i < new->monc.monmap->num_mon; i++)
-			if (ceph_monmap_contains(other->monc.monmap,
-					 &new->monc.monmap->mon_inst[i].addr))
-				break;
-		if (i == new->monc.monmap->num_mon) {
-			dout("mon ip not part of monmap\n");
-			return 0;
-		}
-		dout("mon ip matches existing sb %p\n", sb);
+
+	if (compare_mount_options(fsopt, opt, other)) {
+		dout("monitor(s)/mount options don't match\n");
+		return 0;
 	}
-	if (args->sb_flags != other->mount_args->sb_flags) {
+	if ((opt->flags & CEPH_OPT_FSID) &&
+	    ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+		dout("fsid doesn't match\n");
+		return 0;
+	}
+	if (fsopt->sb_flags != other->mount_options->sb_flags) {
 		dout("flags differ\n");
 		return 0;
 	}
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
  */
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+			     struct ceph_fs_client *fsc)
 {
 	int err;
 
 	/* set ra_pages based on rsize mount option? */
-	if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
-		client->backing_dev_info.ra_pages =
-			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+		fsc->backing_dev_info.ra_pages =
+			(fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
-	err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
 			   atomic_long_inc_return(&bdi_seq));
 	if (!err)
-		sb->s_bdi = &client->backing_dev_info;
+		sb->s_bdi = &fsc->backing_dev_info;
 	return err;
 }
 
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 		       struct vfsmount *mnt)
 {
 	struct super_block *sb;
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	int err;
 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 	const char *path = NULL;
-	struct ceph_mount_args *args;
+	struct ceph_mount_options *fsopt = NULL;
+	struct ceph_options *opt = NULL;
 
 	dout("ceph_get_sb\n");
-	args = parse_mount_args(flags, data, dev_name, &path);
-	if (IS_ERR(args)) {
-		err = PTR_ERR(args);
+	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+	if (err < 0)
 		goto out_final;
-	}
 
 	/* create client (which we may/may not use) */
-	client = ceph_create_client(args);
-	if (IS_ERR(client)) {
-		err = PTR_ERR(client);
+	fsc = create_fs_client(fsopt, opt);
+	if (IS_ERR(fsc)) {
+		err = PTR_ERR(fsc);
+		kfree(fsopt);
+		kfree(opt);
 		goto out_final;
 	}
 
-	if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+	err = ceph_mdsc_init(fsc);
+	if (err < 0)
+		goto out;
+
+	if (ceph_test_opt(fsc->client, NOSHARE))
 		compare_super = NULL;
-	sb = sget(fs_type, compare_super, ceph_set_super, client);
+	sb = sget(fs_type, compare_super, ceph_set_super, fsc);
 	if (IS_ERR(sb)) {
 		err = PTR_ERR(sb);
 		goto out;
 	}
 
-	if (ceph_sb_to_client(sb) != client) {
-		ceph_destroy_client(client);
-		client = ceph_sb_to_client(sb);
-		dout("get_sb got existing client %p\n", client);
+	if (ceph_sb_to_client(sb) != fsc) {
+		ceph_mdsc_destroy(fsc);
+		destroy_fs_client(fsc);
+		fsc = ceph_sb_to_client(sb);
+		dout("get_sb got existing client %p\n", fsc);
 	} else {
-		dout("get_sb using new client %p\n", client);
-		err = ceph_register_bdi(sb, client);
+		dout("get_sb using new client %p\n", fsc);
+		err = ceph_register_bdi(sb, fsc);
 		if (err < 0)
 			goto out_splat;
 	}
 
-	err = ceph_mount(client, mnt, path);
+	err = ceph_mount(fsc, mnt, path);
 	if (err < 0)
 		goto out_splat;
 	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 	return 0;
 
 out_splat:
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 	deactivate_locked_super(sb);
 	goto out_final;
 
 out:
-	ceph_destroy_client(client);
+	ceph_mdsc_destroy(fsc);
+	destroy_fs_client(fsc);
 out_final:
 	dout("ceph_get_sb fail %d\n", err);
 	return err;
@@ -1042,11 +849,12 @@ out_final:
 
 static void ceph_kill_sb(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 	dout("kill_sb %p\n", s);
-	ceph_mdsc_pre_umount(&client->mdsc);
+	ceph_mdsc_pre_umount(fsc->mdsc);
 	kill_anon_super(s);    /* will call put_super after sb is r/o */
-	ceph_destroy_client(client);
+	ceph_mdsc_destroy(fsc);
+	destroy_fs_client(fsc);
 }
 
 static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
 
 static int __init init_ceph(void)
 {
-	int ret = 0;
-
-	ret = ceph_debugfs_init();
-	if (ret < 0)
-		goto out;
-
-	ret = ceph_msgr_init();
-	if (ret < 0)
-		goto out_debugfs;
-
-	ret = init_caches();
+	int ret = init_caches();
 	if (ret)
-		goto out_msgr;
+		goto out;
 
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
 		goto out_icache;
 
-	pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
-		CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
 	return 0;
 
 out_icache:
 	destroy_caches();
-out_msgr:
-	ceph_msgr_exit();
-out_debugfs:
-	ceph_debugfs_cleanup();
 out:
 	return ret;
 }
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
 	destroy_caches();
-	ceph_msgr_exit();
-	ceph_debugfs_cleanup();
 }
 
 module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4..1886294e12f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
 #ifndef _FS_CEPH_SUPER_H
 #define _FS_CEPH_SUPER_H
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <asm/unaligned.h>
 #include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
 #include <linux/writeback.h>
 #include <linux/slab.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
+#include <linux/ceph/libceph.h>
 
 /* f_type in struct statfs */
 #define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
 #define CEPH_BLOCK_SHIFT   20  /* 1 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
-/*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED  CEPH_FEATURE_NOSRCADDR
+#define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
 
-/*
- * mount options
- */
-#define CEPH_OPT_FSID             (1<<0)
-#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
 
-#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+#define ceph_set_mount_opt(fsc, opt) \
+	(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
-#define ceph_set_opt(client, opt) \
-	(client)->mount_args->flags |= CEPH_OPT_##opt;
-#define ceph_test_opt(client, opt) \
-	(!!((client)->mount_args->flags & CEPH_OPT_##opt))
+#define CEPH_MAX_READDIR_DEFAULT        1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
 
-
-struct ceph_mount_args {
-	int sb_flags;
+struct ceph_mount_options {
 	int flags;
-	struct ceph_fsid fsid;
-	struct ceph_entity_addr my_addr;
-	int num_mon;
-	struct ceph_entity_addr *mon_addr;
-	int mount_timeout;
-	int osd_idle_ttl;
-	int osd_timeout;
-	int osd_keepalive_timeout;
+	int sb_flags;
+
 	int wsize;
 	int rsize;            /* max readahead */
 	int congestion_kb;    /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
 	int cap_release_safety;
 	int max_readdir;       /* max readdir result (entires) */
 	int max_readdir_bytes; /* max readdir result (bytes) */
-	char *snapdir_name;   /* default ".snap" */
-	char *name;
-	char *secret;
-};
 
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
-#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT    1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
-
-#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT   "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file.  Delay a minimum amount of time, even if we send a cap
- * message for some other reason.  Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
-	CEPH_MOUNT_MOUNTING,
-	CEPH_MOUNT_MOUNTED,
-	CEPH_MOUNT_UNMOUNTING,
-	CEPH_MOUNT_UNMOUNTED,
-	CEPH_MOUNT_SHUTDOWN,
-};
-
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-	BUG_ON(time_after(b, a));
-	return (long)a - (long)b;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
-	struct ceph_fsid fsid;
-	bool have_fsid;
+	/*
+	 * everything above this point can be memcmp'd; everything below
+	 * is handled in compare_mount_options()
+	 */
 
-	struct mutex mount_mutex;       /* serialize mount attempts */
-	struct ceph_mount_args *mount_args;
+	char *snapdir_name;   /* default ".snap" */
+};
 
+struct ceph_fs_client {
 	struct super_block *sb;
 
-	unsigned long mount_state;
-	wait_queue_head_t auth_wq;
-
-	int auth_err;
+	struct ceph_mount_options *mount_options;
+	struct ceph_client *client;
 
+	unsigned long mount_state;
 	int min_caps;                  /* min caps i added */
 
-	struct ceph_messenger *msgr;   /* messenger instance */
-	struct ceph_mon_client monc;
-	struct ceph_mds_client mdsc;
-	struct ceph_osd_client osdc;
+	struct ceph_mds_client *mdsc;
 
 	/* writeback */
 	mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
 	struct backing_dev_info backing_dev_info;
 
 #ifdef CONFIG_DEBUG_FS
-	struct dentry *debugfs_monmap;
-	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
-	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+	struct dentry *debugfs_dentry_lru, *debugfs_caps;
 	struct dentry *debugfs_congestion_kb;
 	struct dentry *debugfs_bdi;
+	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
 #endif
 };
 
+
 /*
  * File i/o capability.  This tracks shared state with the metadata
  * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
 	int should_free_val;
 };
 
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+	struct ceph_mds_session *lease_session;
+	u32 lease_gen, lease_shared_gen;
+	u32 lease_seq;
+	unsigned long lease_renew_after, lease_renew_from;
+	struct list_head lru;
+	struct dentry *dentry;
+	u64 time;
+	u64 offset;
+};
+
 struct ceph_inode_xattrs_info {
 	/*
 	 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
 /*
  * Ceph inode.
  */
-#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
-#define CEPH_I_NODELAY   4  /* do not delay cap release */
-#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
-
 struct ceph_inode_info {
 	struct ceph_vino i_vino;   /* ceph ino + snap */
 
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
 	return container_of(inode, struct ceph_inode_info, vfs_inode);
 }
 
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+	if (!ino)
+		ino = 1;
+#endif
+	return ino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+	struct ceph_vino *pvino = (struct ceph_vino *)data;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return ci->i_vino.ino == pvino->ino &&
+		ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+					    struct ceph_vino vino)
+{
+	ino_t t = ceph_vino_to_ino(vino);
+	return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
+
 static inline void ceph_i_clear(struct inode *inode, unsigned mask)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	bool r;
 
-	smp_mb();
+	spin_lock(&inode->i_lock);
 	r = (ci->i_ceph_flags & mask) == mask;
+	spin_unlock(&inode->i_lock);
 	return r;
 }
 
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
 			    struct ceph_inode_frag *pfrag,
 			    int *found);
 
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
-	struct ceph_mds_session *lease_session;
-	u32 lease_gen, lease_shared_gen;
-	u32 lease_seq;
-	unsigned long lease_renew_after, lease_renew_from;
-	struct list_head lru;
-	struct dentry *dentry;
-	u64 time;
-	u64 offset;
-};
-
 static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
 {
 	return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
 	return ((loff_t)frag << 32) | (loff_t)off;
 }
 
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
-	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
-	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-	if (!ino)
-		ino = 1;
-#endif
-	return ino;
-}
-
 static inline int ceph_set_ino_cb(struct inode *inode, void *data)
 {
 	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
 	return 0;
 }
 
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
-	struct ceph_vino *pvino = (struct ceph_vino *)data;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	return ci->i_vino.ino == pvino->ino &&
-		ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
-					    struct ceph_vino vino)
-{
-	ino_t t = ceph_vino_to_ino(vino);
-	return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
 /*
  * caps helpers
  */
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 			     struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
 			       struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_reservation_status(struct ceph_fs_client *client,
 				    int *total, int *avail, int *used,
 				    int *reserved, int *min);
 
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
 {
-	return (struct ceph_client *)inode->i_sb->s_fs_info;
+	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
 }
 
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
 {
-	return (struct ceph_client *)sb->s_fs_info;
+	return (struct ceph_fs_client *)sb->s_fs_info;
 }
 
 
@@ -617,51 +541,6 @@ struct ceph_file_info {
 
 
 /*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data.  It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
-	atomic_t nref;
-	u64 seq;
-	int num_snaps;
-	u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
-	/*
-	printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-	       atomic_read(&sc->nref)+1);
-	*/
-	if (sc)
-		atomic_inc(&sc->nref);
-	return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
-	if (!sc)
-		return;
-	/*
-	printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-	       atomic_read(&sc->nref)-1);
-	*/
-	if (atomic_dec_and_test(&sc->nref)) {
-		/*printk(" deleting snap_context %p\n", sc);*/
-		kfree(sc);
-	}
-}
-
-/*
  * A "snap realm" describes a subset of the file hierarchy sharing
  * the same set of snapshots that apply to it.  The realms themselves
  * are organized into a hierarchy, such that children inherit (some of)
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
 	spinlock_t inodes_with_caps_lock;
 };
 
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
+static inline int default_congestion_kb(void)
 {
-	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
-		(off >> PAGE_CACHE_SHIFT);
+	int congestion_kb;
+
+	/*
+	 * Copied from NFS
+	 *
+	 * congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (congestion_kb > 256*1024)
+		congestion_kb = 256*1024;
+
+	return congestion_kb;
 }
 
 
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
 			   ci_item)->writing;
 }
 
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-
 /* inode.c */
 extern const struct inode_operations ceph_file_iops;
 
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 				       struct nameidata *nd, int mode,
 				       int locked_dir);
 extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
 
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 /* export.c */
 extern const struct export_operations ceph_export_ops;
 
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
-
 /* locks.c */
 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
 	return NULL;
 }
 
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
 #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b7..6e12a6ba5f7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
 
 #include <linux/xattr.h>
 #include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 			      const char *value, size_t size, int flags)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	int err;
 	int i, nr_pages;
 	struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
-	newname = kmalloc(name_len + 1, GFP_NOFS);
+	newname = kmemdup(name, name_len + 1, GFP_NOFS);
 	if (!newname)
 		goto out;
-	memcpy(newname, name, name_len + 1);
 
 	if (val_len) {
 		newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +779,8 @@ out:
 
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	struct ceph_mds_request *req;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc966552214..c465ae066c6 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || LBDAF)
+	depends on (64BIT || LBDAF)
 	select DLM if GFS2_FS_LOCKING_DLM
 	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
 	select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d841..6b24afb96aa 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
 #include "glops.h"
 
 
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-				   unsigned int from, unsigned int to)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+			    unsigned int from, unsigned int to)
 {
 	struct buffer_head *head = page_buffers(page);
 	unsigned int bsize = head->b_size;
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	int alloc_required;
 	int error = 0;
-	struct gfs2_alloc *al;
+	struct gfs2_alloc *al = NULL;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned to = from + len;
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		rblocks += RES_STATFS + RES_QUOTA;
 	if (&ip->i_inode == sdp->sd_rindex)
 		rblocks += 2 * RES_STATFS;
+	if (alloc_required)
+		rblocks += gfs2_rg_blocks(al);
 
 	error = gfs2_trans_begin(sdp, rblocks,
 				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -696,13 +698,11 @@ out:
 
 	page_cache_release(page);
 
-	/*
-	 * XXX(truncate): the call below should probably be replaced with
-	 * a call to the gfs2-specific truncate blocks helper to actually
-	 * release disk blocks..
-	 */
+	gfs2_trans_end(sdp);
 	if (pos + len > ip->i_inode.i_size)
-		truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
+		gfs2_trim_blocks(&ip->i_inode);
+	goto out_trans_fail;
+
 out_endtrans:
 	gfs2_trans_end(sdp);
 out_trans_fail:
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 	page_cache_release(page);
 
 	if (copied) {
-		if (inode->i_size < to) {
+		if (inode->i_size < to)
 			i_size_write(inode, to);
-			ip->i_disksize = inode->i_size;
-		}
 		gfs2_dinode_out(ip, di);
 		mark_inode_dirty(inode);
 	}
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
 	if (ret > 0) {
-		if (inode->i_size > ip->i_disksize)
-			ip->i_disksize = inode->i_size;
 		gfs2_dinode_out(ip, dibh->b_data);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a..5476c066d4e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
  * @ip: the inode
  * @dibh: the dinode buffer
  * @block: the block number that was allocated
- * @private: any locked page held by the caller process
+ * @page: The (optional) page. This is looked up if @page is NULL
  *
  * Returns: errno
  */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 /**
  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
  * @ip: The GFS2 inode to unstuff
- * @unstuffer: the routine that handles unstuffing a non-zero length file
- * @private: private data for the unstuffer
+ * @page: The (optional) page. This is looked up if the @page is NULL
  *
  * This routine unstuffs a dinode and returns it to a "normal" state such
  * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	if (error)
 		goto out;
 
-	if (ip->i_disksize) {
+	if (i_size_read(&ip->i_inode)) {
 		/* Get a free block, fill it with the stuffed data,
 		   and write it out to disk */
 
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
-	if (ip->i_disksize) {
+	if (i_size_read(&ip->i_inode)) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
 }
 
 /**
- * do_grow - Make a file look bigger than it is
- * @ip: the inode
- * @size: the size to set the file to
- *
- * Called with an exclusive lock on @ip.
- *
- * Returns: errno
- */
-
-static int do_grow(struct gfs2_inode *ip, u64 size)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al;
-	struct buffer_head *dibh;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_lock_check(ip);
-	if (error)
-		goto out;
-
-	al->al_requested = sdp->sd_max_height + RES_DATA;
-
-	error = gfs2_inplace_reserve(ip);
-	if (error)
-		goto out_gunlock_q;
-
-	error = gfs2_trans_begin(sdp,
-			sdp->sd_max_height + al->al_rgd->rd_length +
-			RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
-	if (error)
-		goto out_ipres;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out_end_trans;
-
-	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
-		if (gfs2_is_stuffed(ip)) {
-			error = gfs2_unstuff_dinode(ip, NULL);
-			if (error)
-				goto out_brelse;
-		}
-	}
-
-	ip->i_disksize = size;
-	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-
-out_brelse:
-	brelse(dibh);
-out_end_trans:
-	gfs2_trans_end(sdp);
-out_ipres:
-	gfs2_inplace_release(ip);
-out_gunlock_q:
-	gfs2_quota_unlock(ip);
-out:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-
-/**
  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
  *
  * This is partly borrowed from ext3.
  */
-static int gfs2_block_truncate_page(struct address_space *mapping)
+static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	loff_t from = inode->i_size;
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
 	return err;
 }
 
-static int trunc_start(struct gfs2_inode *ip, u64 size)
+static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *mapping = inode->i_mapping;
 	struct buffer_head *dibh;
 	int journaled = gfs2_is_jdata(ip);
 	int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 	if (error)
 		goto out;
 
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
 	if (gfs2_is_stuffed(ip)) {
-		u64 dsize = size + sizeof(struct gfs2_dinode);
-		ip->i_disksize = size;
-		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		if (dsize > dibh->b_size)
-			dsize = dibh->b_size;
-		gfs2_buffer_clear_tail(dibh, dsize);
-		error = 1;
+		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
 	} else {
-		if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
-			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
-
-		if (!error) {
-			ip->i_disksize = size;
-			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-			ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
-			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-			gfs2_dinode_out(ip, dibh->b_data);
+		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+			error = gfs2_block_truncate_page(mapping, newsize);
+			if (error)
+				goto out_brelse;
 		}
+		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
 	}
 
-	brelse(dibh);
+	i_size_write(inode, newsize);
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_dinode_out(ip, dibh->b_data);
 
+	truncate_pagecache(inode, oldsize, newsize);
+out_brelse:
+	brelse(dibh);
 out:
 	gfs2_trans_end(sdp);
 	return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
 	if (error)
 		goto out;
 
-	if (!ip->i_disksize) {
+	if (!i_size_read(&ip->i_inode)) {
 		ip->i_height = 0;
 		ip->i_goal = ip->i_no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
 
 /**
  * do_shrink - make a file smaller
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * @inode: the inode
+ * @oldsize: the current inode size
+ * @newsize: the size to make the file
  *
- * Called with an exclusive lock on @ip.
+ * Called with an exclusive lock on @inode. The @size must
+ * be equal to or smaller than the current inode size.
  *
  * Returns: errno
  */
 
-static int do_shrink(struct gfs2_inode *ip, u64 size)
+static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	int error;
 
-	error = trunc_start(ip, size);
+	error = trunc_start(inode, oldsize, newsize);
 	if (error < 0)
 		return error;
-	if (error > 0)
+	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	error = trunc_dealloc(ip, size);
-	if (!error)
+	error = trunc_dealloc(ip, newsize);
+	if (error == 0)
 		error = trunc_end(ip);
 
 	return error;
 }
 
-static int do_touch(struct gfs2_inode *ip, u64 size)
+void gfs2_trim_blocks(struct inode *inode)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 size = inode->i_size;
+	int ret;
+
+	ret = do_shrink(inode, size, size);
+	WARN_ON(ret != 0);
+}
+
+/**
+ * do_grow - Touch and update inode size
+ * @inode: The inode
+ * @size: The new size
+ *
+ * This function updates the timestamps on the inode and
+ * may also increase the size of the inode. This function
+ * must not be called with @size any smaller than the current
+ * inode size.
+ *
+ * Although it is not strictly required to unstuff files here,
+ * earlier versions of GFS2 have a bug in the stuffed file reading
+ * code which will result in a buffer overrun if the size is larger
+ * than the max stuffed file size. In order to prevent this from
+ * occuring, such files are unstuffed, but in other cases we can
+ * just update the inode size directly.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int do_grow(struct inode *inode, u64 size)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *dibh;
+	struct gfs2_alloc *al = NULL;
 	int error;
 
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (gfs2_is_stuffed(ip) &&
+	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
+		al = gfs2_alloc_get(ip);
+		if (al == NULL)
+			return -ENOMEM;
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto do_grow_alloc_put;
+
+		al->al_requested = 1;
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto do_grow_qunlock;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
 	if (error)
-		return error;
+		goto do_grow_release;
 
-	down_write(&ip->i_rw_mutex);
+	if (al) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (error)
+			goto do_end_trans;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
-		goto do_touch_out;
+		goto do_end_trans;
 
+	i_size_write(inode, size);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
-do_touch_out:
-	up_write(&ip->i_rw_mutex);
+do_end_trans:
 	gfs2_trans_end(sdp);
+do_grow_release:
+	if (al) {
+		gfs2_inplace_release(ip);
+do_grow_qunlock:
+		gfs2_quota_unlock(ip);
+do_grow_alloc_put:
+		gfs2_alloc_put(ip);
+	}
 	return error;
 }
 
 /**
- * gfs2_truncatei - make a file a given size
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * gfs2_setattr_size - make a file a given size
+ * @inode: the inode
+ * @newsize: the size to make the file
  *
- * The file size can grow, shrink, or stay the same size.
+ * The file size can grow, shrink, or stay the same size. This
+ * is called holding i_mutex and an exclusive glock on the inode
+ * in question.
  *
  * Returns: errno
  */
 
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+int gfs2_setattr_size(struct inode *inode, u64 newsize)
 {
-	int error;
+	int ret;
+	u64 oldsize;
 
-	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
-		return -EINVAL;
+	BUG_ON(!S_ISREG(inode->i_mode));
 
-	if (size > ip->i_disksize)
-		error = do_grow(ip, size);
-	else if (size < ip->i_disksize)
-		error = do_shrink(ip, size);
-	else
-		/* update time stamps */
-		error = do_touch(ip, size);
+	ret = inode_newsize_ok(inode, newsize);
+	if (ret)
+		return ret;
 
-	return error;
+	oldsize = inode->i_size;
+	if (newsize >= oldsize)
+		return do_grow(inode, newsize);
+
+	return do_shrink(inode, oldsize, newsize);
 }
 
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
 	int error;
-	error = trunc_dealloc(ip, ip->i_disksize);
+	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
 	if (!error)
 		error = trunc_end(ip);
 	return error;
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 
 	shift = sdp->sd_sb.sb_bsize_shift;
 	BUG_ON(gfs2_is_dir(ip));
-	end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
 	lblock = offset >> shift;
 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
 	if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135..42fea03e2bd 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
 	}
 }
 
-int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
-int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
-
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
-int gfs2_truncatei_resume(struct gfs2_inode *ip);
-int gfs2_file_dealloc(struct gfs2_inode *ip);
-int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-			      unsigned int len);
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_block_map(struct inode *inode, sector_t lblock,
+			  struct buffer_head *bh, int create);
+extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
+			   u64 *dblock, unsigned *extlen);
+extern int gfs2_setattr_size(struct inode *inode, u64 size);
+extern void gfs2_trim_blocks(struct inode *inode);
+extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
+extern int gfs2_file_dealloc(struct gfs2_inode *ip);
+extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+				     unsigned int len);
 
 #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d..6798755b385 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 		ip = GFS2_I(inode);
 	}
 
-	if (sdp->sd_args.ar_localcaching)
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto valid;
 
 	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd..5c356d09c32 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
 
+struct qstr gfs2_qdot __read_mostly;
+struct qstr gfs2_qdotdot __read_mostly;
+
 typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
 			    u64 leaf_no, void *data);
 typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-	if (ip->i_disksize < offset + size)
-		ip->i_disksize = offset + size;
+	if (ip->i_inode.i_size < offset + size)
+		i_size_write(&ip->i_inode, offset + size);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(ip, dibh->b_data);
 
@@ -225,8 +228,8 @@ out:
 	if (error)
 		return error;
 
-	if (ip->i_disksize < offset + copied)
-		ip->i_disksize = offset + copied;
+	if (ip->i_inode.i_size < offset + copied)
+		i_size_write(&ip->i_inode, offset + copied);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 	unsigned int o;
 	int copied = 0;
 	int error = 0;
+	u64 disksize = i_size_read(&ip->i_inode);
 
-	if (offset >= ip->i_disksize)
+	if (offset >= disksize)
 		return 0;
 
-	if (offset + size > ip->i_disksize)
-		size = ip->i_disksize - offset;
+	if (offset + size > disksize)
+		size = disksize - offset;
 
 	if (!size)
 		return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 		unsigned hsize = 1 << ip->i_depth;
 		unsigned index;
 		u64 ln;
-		if (hsize * sizeof(u64) != ip->i_disksize) {
+		if (hsize * sizeof(u64) != i_size_read(inode)) {
 			gfs2_consist_inode(ip);
 			return ERR_PTR(-EIO);
 		}
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
 	for (x = sdp->sd_hash_ptrs; x--; lp++)
 		*lp = cpu_to_be64(bn);
 
-	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
+	i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
 	gfs2_add_inode_blocks(&dip->i_inode, 1);
 	dip->i_diskflags |= GFS2_DIF_EXHASH;
 
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	u64 *buf;
 	u64 *from, *to;
 	u64 block;
+	u64 disksize = i_size_read(&dip->i_inode);
 	int x;
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	if (!buf)
 		return -ENOMEM;
 
-	for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
+	for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
 		error = gfs2_dir_read_data(dip, (char *)buf,
 					    block * sdp->sd_hash_bsize,
 					    sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != i_size_read(inode)) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3b..a98f644bd3d 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
 struct gfs2_inode;
 struct gfs2_inum;
 
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename);
-int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
-		   const struct gfs2_inode *ip);
-int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-		 const struct gfs2_inode *ip, unsigned int type);
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-		  filldir_t filldir);
-int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
-		   const struct gfs2_inode *nip, unsigned int new_type);
+extern struct inode *gfs2_dir_search(struct inode *dir,
+				     const struct qstr *filename);
+extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+			  const struct gfs2_inode *ip);
+extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+			const struct gfs2_inode *ip, unsigned int type);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+			 filldir_t filldir);
+extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+			  const struct gfs2_inode *nip, unsigned int new_type);
 
-int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
 
-int gfs2_diradd_alloc_required(struct inode *dir,
-			       const struct qstr *filename);
-int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
-			    struct buffer_head **bhp);
+extern int gfs2_diradd_alloc_required(struct inode *dir,
+				      const struct qstr *filename);
+extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+				   struct buffer_head **bhp);
 
 static inline u32 gfs2_disk_hash(const char *data, int len)
 {
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
 	memcpy(dent + 1, name->name, name->len);
 }
 
+extern struct qstr gfs2_qdot;
+extern struct qstr gfs2_qdotdot;
+
 #endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8a..06d582732d3 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 
 static struct dentry *gfs2_get_parent(struct dentry *child)
 {
-	struct qstr dotdot;
 	struct dentry *dentry;
 
-	/*
-	 * XXX(hch): it would be a good idea to keep this around as a
-	 *	     static variable.
-	 */
-	gfs2_str2qstr(&dotdot, "..");
-
-	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &gfs2_dops;
 	return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c823..237ee6a940d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	rblocks = RES_DINODE + ind_blocks;
 	if (gfs2_is_jdata(ip))
 		rblocks += data_blocks ? data_blocks : 1;
-	if (ind_blocks || data_blocks)
+	if (ind_blocks || data_blocks) {
 		rblocks += RES_STATFS + RES_QUOTA;
+		rblocks += gfs2_rg_blocks(al);
+	}
 	ret = gfs2_trans_begin(sdp, rblocks, 0);
 	if (ret)
 		goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
 			goto fail;
 
 		if (!(file->f_flags & O_LARGEFILE) &&
-		    ip->i_disksize > MAX_NON_LFS) {
+		    i_size_read(inode) > MAX_NON_LFS) {
 			error = -EOVERFLOW;
 			goto fail_gunlock;
 		}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e0..87778857f09 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 		else
 			gfs2_glock_put_nolock(gl);
 	}
+	if (held1 && held2 && list_empty(&gl->gl_holders))
+		clear_bit(GLF_QUEUED, &gl->gl_flags);
 
 	gl->gl_state = new_state;
 	gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
 		if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
 			insert_pt = &gh2->gh_list;
 	}
+	set_bit(GLF_QUEUED, &gl->gl_flags);
 	if (likely(insert_pt == NULL)) {
 		list_add_tail(&gh->gh_list, &gl->gl_holders);
 		if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 
 	gfs2_glock_hold(gl);
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
-	if (time_before(now, holdtime))
-		delay = holdtime - now;
-	if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
-		delay = gl->gl_ops->go_min_hold_time;
+	if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
+		if (time_before(now, holdtime))
+			delay = holdtime - now;
+		if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+			delay = gl->gl_ops->go_min_hold_time;
+	}
 
 	spin_lock(&gl->gl_spin);
 	handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
 	spin_unlock(&lru_lock);
 
 	spin_lock(&gl->gl_spin);
-	if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+	if (gl->gl_state != LM_ST_UNLOCKED)
 		handle_callback(gl, LM_ST_UNLOCKED, 0);
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 		*p++ = 'I';
 	if (test_bit(GLF_FROZEN, gflags))
 		*p++ = 'F';
+	if (test_bit(GLF_QUEUED, gflags))
+		*p++ = 'q';
 	*p = 0;
 	return buf;
 }
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
 	}
 #endif
 
-	glock_workqueue = create_workqueue("glock_workqueue");
+	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
 	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
-	gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+						WQ_FREEZEABLE, 0);
 	if (IS_ERR(gfs2_delete_workqueue)) {
 		destroy_workqueue(glock_workqueue);
 		return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b15..db1c26d6d22 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**
- * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
  * @gl: the glock
  * @state: the state we're requesting
  * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb69..0d149dcc04e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_inode *ip = gl->gl_object;
 	if (ip == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
+	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
 		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
 		  (unsigned int)ip->i_diskflags,
-		  (unsigned long long)ip->i_inode.i_size,
-		  (unsigned long long)ip->i_disksize);
+		  (unsigned long long)i_size_read(&ip->i_inode));
 	return 0;
 }
 
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
 	[LM_TYPE_META] = &gfs2_meta_glops,
 	[LM_TYPE_INODE] = &gfs2_inode_glops,
 	[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
-	[LM_TYPE_NONDISK] = &gfs2_trans_glops,
 	[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
 	[LM_TYPE_FLOCK] = &gfs2_flock_glops,
 	[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa..764fbb49efc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
 	GLF_REPLY_PENDING		= 9,
 	GLF_INITIAL			= 10,
 	GLF_FROZEN			= 11,
+	GLF_QUEUED			= 12,
 };
 
 struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
 	u64 i_no_formal_ino;
 	u64 i_generation;
 	u64 i_eattr;
-	loff_t i_disksize;
 	unsigned long i_flags;		/* GIF_... */
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
 	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
 	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
 	unsigned int ar_spectator:1;		/* Don't get a journal */
-	unsigned int ar_ignore_local_fs:1;	/* Ignore optimisations */
 	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
-	unsigned int ar_localcaching:1;		/* Local caching */
 	unsigned int ar_debug:1;		/* Oops on errors */
-	unsigned int ar_upgrade:1;		/* Upgrade ondisk format */
 	unsigned int ar_posix_acl:1;		/* Enable posix acls */
 	unsigned int ar_quota:2;		/* off/account/on */
 	unsigned int ar_suiddir:1;		/* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
  */
 
 struct lm_lockstruct {
-	unsigned int ls_jid;
+	int ls_jid;
 	unsigned int ls_first;
 	unsigned int ls_first_done;
 	unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
 	struct list_head sd_rindex_mru_list;
 	struct gfs2_rgrpd *sd_rindex_forward;
 	unsigned int sd_rgrps;
+	unsigned int sd_max_rg_data;
 
 	/* Journal index stuff */
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a3..06370f8bd8c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	 * to do that.
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-	ip->i_disksize = be64_to_cpu(str->di_size);
-	i_size_write(&ip->i_inode, ip->i_disksize);
+	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-	str->di_size = cpu_to_be64(ip->i_disksize);
+	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	       (unsigned long long)ip->i_no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n",
 	       (unsigned long long)ip->i_no_addr);
-	printk(KERN_INFO "  i_disksize = %llu\n",
-	       (unsigned long long)ip->i_disksize);
+	printk(KERN_INFO "  i_size = %llu\n",
+	       (unsigned long long)i_size_read(&ip->i_inode));
 	printk(KERN_INFO "  blocks = %llu\n",
 	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21d..6720d7d5fbc 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
 extern int gfs2_internal_read(struct gfs2_inode *ip,
 			      struct file_ra_state *ra_state,
 			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int to);
 extern void gfs2_set_aops(struct inode *inode);
 
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 	dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
 }
 
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+						u64 minsize, u64 maxsize)
+{
+	u64 size = i_size_read(inode);
+	if (size < minsize || size > maxsize)
+		goto err;
+	if (size & ((1 << inode->i_blkbits) - 1))
+		goto err;
+	return 0;
+err:
+	gfs2_consist_inode(GFS2_I(inode));
+	return -EIO;
+}
 
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c..1c09425b45f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
 		ret |= LM_OUT_CANCELED;
 		goto out;
 	case -EAGAIN: /* Try lock fails */
+	case -EDEADLK: /* Deadlock detected */
 		goto out;
-	case -EINVAL: /* Invalid */
-	case -ENOMEM: /* Out of memory */
+	case -ETIMEDOUT: /* Canceled due to timeout */
 		ret |= LM_OUT_ERROR;
 		goto out;
 	case 0: /* Success */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46..d7eb1e209aa 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
 #include "glock.h"
 #include "quota.h"
 #include "recovery.h"
+#include "dir.h"
 
 static struct shrinker qd_shrinker = {
 	.shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
 {
 	int error;
 
+	gfs2_str2qstr(&gfs2_qdot, ".");
+	gfs2_str2qstr(&gfs2_qdotdot, "..");
+
 	error = gfs2_sys_init();
 	if (error)
 		return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
 
 	error = -ENOMEM;
 	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-					  WQ_NON_REENTRANT | WQ_RESCUER, 0);
+					  WQ_RESCUER | WQ_FREEZEABLE, 0);
 	if (!gfs_recovery_wq)
 		goto fail_wq;
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64..aeafc233dc8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
 #define DO 0
 #define UNDO 1
 
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-
 /**
  * gfs2_tune_init - Fill a gfs2_tune structure with default values
  * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 {
-	unsigned int x;
-
 	if (sb->sb_magic != GFS2_MAGIC ||
 	    sb->sb_type != GFS2_METATYPE_SB) {
 		if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
 	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
 		return 0;
 
-	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-		for (x = 0; gfs2_old_fs_formats[x]; x++)
-			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-				break;
+	fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
 
-		if (!gfs2_old_fs_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-		for (x = 0; gfs2_old_multihost_formats[x]; x++)
-			if (gfs2_old_multihost_formats[x] ==
-			    sb->sb_multihost_format)
-				break;
-
-		if (!gfs2_old_multihost_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (!sdp->sd_args.ar_upgrade) {
-		printk(KERN_WARNING
-		       "GFS2: code version (%u, %u) is incompatible "
-		       "with ondisk format (%u, %u)\n",
-		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-		       sb->sb_fs_format, sb->sb_multihost_format);
-		printk(KERN_INFO
-		       "GFS2: Use the \"upgrade\" mount option to upgrade "
-		       "the FS\n");
-		printk(KERN_INFO "GFS2: See the manual for more details\n");
-		return -EINVAL;
-	}
-
-	return 0;
+	return -EINVAL;
 }
 
 static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 
 	prev_db = 0;
 
-	for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
+	for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
 		bh.b_state = 0;
 		bh.b_blocknr = 0;
 		bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 	if (!strcmp("lock_nolock", proto)) {
 		lm = &nolock_ops;
 		sdp->sd_args.ar_localflocks = 1;
-		sdp->sd_args.ar_localcaching = 1;
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 	} else if (!strcmp("lock_dlm", proto)) {
 		lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
 
 static int wait_on_journal(struct gfs2_sbd *sdp)
 {
-	if (sdp->sd_args.ar_spectator)
-		return 0;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		return 0;
 
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (error)
 		goto fail_sb;
 
+	/*
+	 * If user space has failed to join the cluster or some similar
+	 * failure has occurred, then the journal id will contain a
+	 * negative (error) number. This will then be returned to the
+	 * caller (of the mount syscall). We do this even for spectator
+	 * mounts (which just write a jid of 0 to indicate "ok" even though
+	 * the jid is unused in the spectator case)
+	 */
+	if (sdp->sd_lockstruct.ls_jid < 0) {
+		error = sdp->sd_lockstruct.ls_jid;
+		sdp->sd_lockstruct.ls_jid = 0;
+		goto fail_sb;
+	}
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c973..0534510200d 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/fiemap.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 al->al_rgd->rd_length +
+					 gfs2_rg_blocks(al) +
 					 2 * RES_DINODE + RES_STATFS +
 					 RES_QUOTA, 0);
 		if (error)
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 	ip = ghs[1].gh_gl->gl_object;
 
-	ip->i_disksize = size;
 	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_inode.i_nlink = 2;
-	ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
 	ip->i_diskflags |= GFS2_DIF_JDATA;
 	ip->i_entries = 2;
 
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!gfs2_assert_withdraw(sdp, !error)) {
 		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
 		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-		struct qstr str;
 
-		gfs2_str2qstr(&str, ".");
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+		gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
 		dent->de_inum = di->di_num; /* already GFS2 endian */
 		dent->de_type = cpu_to_be16(DT_DIR);
 		di->di_entries = cpu_to_be32(1);
 
-		gfs2_str2qstr(&str, "..");
 		dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
-		gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+		gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
 
 		gfs2_inum_out(dip, dent);
 		dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 		       struct gfs2_inode *ip)
 {
-	struct qstr dotname;
 	int error;
 
 	if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, ".");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdot);
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, "..");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdotdot);
 	if (error)
 		return error;
 
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 	struct inode *dir = &to->i_inode;
 	struct super_block *sb = dir->i_sb;
 	struct inode *tmp;
-	struct qstr dotdot;
 	int error = 0;
 
-	gfs2_str2qstr(&dotdot, "..");
-
 	igrab(dir);
 
 	for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 			break;
 		}
 
-		tmp = gfs2_lookupi(dir, &dotdot, 1);
+		tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
 		if (IS_ERR(tmp)) {
 			error = PTR_ERR(tmp);
 			break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			return 0;
 	}
 
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
 
 	if (odip != ndip) {
 		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 		al->al_requested = sdp->sd_max_dirres;
 
-		error = gfs2_inplace_reserve(ndip);
+		error = gfs2_inplace_reserve_ri(ndip);
 		if (error)
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 al->al_rgd->rd_length +
+					 gfs2_rg_blocks(al) +
 					 4 * RES_DINODE + 4 * RES_LEAF +
 					 RES_STATFS + RES_QUOTA + 4, 0);
 		if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	}
 
 	if (dir_rename) {
-		struct qstr name;
-		gfs2_str2qstr(&name, "..");
-
 		error = gfs2_change_nlink(ndip, +1);
 		if (error)
 			goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_end_trans;
 
-		error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
+		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
 		if (error)
 			goto out_end_trans;
 	} else {
@@ -972,6 +964,7 @@ out_gunlock_r:
 	if (r_gh.gh_gl)
 		gfs2_glock_dq_uninit(&r_gh);
 out:
+	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
 
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
 	struct gfs2_holder i_gh;
 	struct buffer_head *dibh;
-	unsigned int x;
+	unsigned int x, size;
 	char *buf;
 	int error;
 
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		return NULL;
 	}
 
-	if (!ip->i_disksize) {
+	size = (unsigned int)i_size_read(&ip->i_inode);
+	if (size == 0) {
 		gfs2_consist_inode(ip);
 		buf = ERR_PTR(-EIO);
 		goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		goto out;
 	}
 
-	x = ip->i_disksize + 1;
+	x = size + 1;
 	buf = kmalloc(x, GFP_NOFS);
 	if (!buf)
 		buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
-/*
- * XXX(truncate): the truncate_setsize calls should be moved to the end.
- */
-static int setattr_size(struct inode *inode, struct iattr *attr)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
-
-	if (attr->ia_size != ip->i_disksize) {
-		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-		if (error)
-			return error;
-		truncate_setsize(inode, attr->ia_size);
-		gfs2_trans_end(sdp);
-	}
-
-	error = gfs2_truncatei(ip, attr->ia_size);
-	if (error && (inode->i_size != ip->i_disksize))
-		i_size_write(inode, ip->i_disksize);
-
-	return error;
-}
-
 static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto out;
 
 	if (attr->ia_valid & ATTR_SIZE)
-		error = setattr_size(inode, attr);
+		error = gfs2_setattr_size(inode, attr->ia_size);
 	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
 		error = setattr_chown(inode, attr);
 	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	return ret;
 }
 
+static void empty_write_end(struct page *page, unsigned from,
+			   unsigned to)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+
+	page_zero_new_buffers(page, from, to);
+	flush_dcache_page(page);
+	mark_page_accessed(page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	block_commit_write(page, from, to);
+}
+
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+	unsigned start, end, next;
+	struct buffer_head *bh, *head;
+	int error;
+
+	if (!page_has_buffers(page)) {
+		error = block_prepare_write(page, from, to, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+
+		empty_write_end(page, from, to);
+		return 0;
+	}
+
+	bh = head = page_buffers(page);
+	next = end = 0;
+	while (next < from) {
+		next += bh->b_size;
+		bh = bh->b_this_page;
+	}
+	start = next;
+	do {
+		next += bh->b_size;
+		if (buffer_mapped(bh)) {
+			if (end) {
+				error = block_prepare_write(page, start, end,
+							    gfs2_block_map);
+				if (unlikely(error))
+					return error;
+				empty_write_end(page, start, end);
+				end = 0;
+			}
+			start = next;
+		}
+		else
+			end = next;
+		bh = bh->b_this_page;
+	} while (next < to);
+
+	if (end) {
+		error = block_prepare_write(page, start, end, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+		empty_write_end(page, start, end);
+	}
+
+	return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+			   int mode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int error;
+	u64 start = offset >> PAGE_CACHE_SHIFT;
+	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t curr;
+	struct page *page;
+	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+	unsigned int from, to;
+
+	if (!end_offset)
+		end_offset = PAGE_CACHE_SIZE;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(error))
+		goto out;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (unlikely(error))
+			goto out;
+	}
+
+	curr = start;
+	offset = start << PAGE_CACHE_SHIFT;
+	from = start_offset;
+	to = PAGE_CACHE_SIZE;
+	while (curr <= end) {
+		page = grab_cache_page_write_begin(inode->i_mapping, curr,
+						   AOP_FLAG_NOFS);
+		if (unlikely(!page)) {
+			error = -ENOMEM;
+			goto out;
+		}
+
+		if (curr == end)
+			to = end_offset;
+		error = write_empty_blocks(page, from, to);
+		if (!error && offset + to > inode->i_size &&
+		    !(mode & FALLOC_FL_KEEP_SIZE)) {
+			i_size_write(inode, offset + to);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+		if (error)
+			goto out;
+		curr++;
+		offset += PAGE_CACHE_SIZE;
+		from = 0;
+	}
+
+	gfs2_dinode_out(ip, dibh->b_data);
+	mark_inode_dirty(inode);
+
+	brelse(dibh);
+
+out:
+	return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		max_data -= tmp;
+	}
+	/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+	   so it might end up with fewer data blocks */
+	if (max_data <= *data_blocks)
+		return;
+	*data_blocks = max_data;
+	*ind_blocks = max_blocks - max_data;
+	*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+	if (*len > max) {
+		*len = max;
+		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+	}
+}
+
+static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+			   loff_t len)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	loff_t bytes, max_bytes;
+	struct gfs2_alloc *al;
+	int error;
+	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+		 sdp->sd_sb.sb_bsize_shift;
+
+	len = next - offset;
+	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+	if (!bytes)
+		bytes = UINT_MAX;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+
+	if (!gfs2_write_alloc_required(ip, offset, len))
+		goto out_unlock;
+
+	while (len > 0) {
+		if (len < bytes)
+			bytes = len;
+		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto out_alloc_put;
+
+retry:
+		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error) {
+			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+				bytes >>= 1;
+				goto retry;
+			}
+			goto out_qunlock;
+		}
+		max_bytes = bytes;
+		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+		al->al_requested = data_blocks + ind_blocks;
+
+		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+			  RES_RG_HDR + gfs2_rg_blocks(al);
+		if (gfs2_is_jdata(ip))
+			rblocks += data_blocks ? data_blocks : 1;
+
+		error = gfs2_trans_begin(sdp, rblocks,
+					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+		if (error)
+			goto out_trans_fail;
+
+		error = fallocate_chunk(inode, offset, max_bytes, mode);
+		gfs2_trans_end(sdp);
+
+		if (error)
+			goto out_trans_fail;
+
+		len -= max_bytes;
+		offset += max_bytes;
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	goto out_unlock;
+
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_qunlock:
+	gfs2_quota_unlock(ip);
+out_alloc_put:
+	gfs2_alloc_put(ip);
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len)
 {
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fallocate = gfs2_fallocate,
 	.fiemap = gfs2_fiemap,
 };
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6..58a9b9998b4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
 		goto out;
 
 	size = loc + sizeof(struct gfs2_quota);
-	if (size > inode->i_size) {
-		ip->i_disksize = size;
+	if (size > inode->i_size)
 		i_size_write(inode, size);
-	}
 	inode->i_mtime = inode->i_atime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out_alloc;
 
 	if (nalloc)
-		blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+		blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
 
 	error = gfs2_trans_begin(sdp, blocks, 0);
 	if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-	unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
+	u64 size = i_size_read(sdp->sd_qc_inode);
+	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
 	unsigned int x, slot = 0;
 	unsigned int found = 0;
 	u64 dblock;
 	u32 extlen = 0;
 	int error;
 
-	if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
-	    ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
 		return -EIO;
-	}
+
 	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
 	sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
 
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		error = gfs2_inplace_reserve(ip);
 		if (error)
 			goto out_alloc;
+		blocks += gfs2_rg_blocks(al);
 	}
 
 	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a..f2a02edcac8 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
 	int ro = 0;
 	unsigned int pass;
 	int error;
+	int jlocked = 0;
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (sdp->sd_args.ar_spectator ||
+	    (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
 		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
 			jd->jd_jid);
-
+		jlocked = 1;
 		/* Acquire the journal lock so we can do recovery */
 
 		error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
 			jd->jd_jid, t);
 	}
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
-		gfs2_glock_dq_uninit(&ji_gh);
-
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+	if (jlocked) {
+		gfs2_glock_dq_uninit(&ji_gh);
 		gfs2_glock_dq_uninit(&j_gh);
+	}
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
 	goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
 fail_gunlock_ji:
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (jlocked) {
 		gfs2_glock_dq_uninit(&ji_gh);
 fail_gunlock_j:
 		gfs2_glock_dq_uninit(&j_gh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e4..fb67f593f40 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	for (rgrps = 0;; rgrps++) {
 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-		if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+		if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
 			break;
 		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
 					   sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
-	u64 rgrp_count = ip->i_disksize;
+	u64 rgrp_count = i_size_read(inode);
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 		}
 	}
 
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
 }
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	file_ra_state_init(&ra_state, inode->i_mapping);
 	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
 		/* Ignore partials */
 		if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-		    ip->i_disksize)
+		    i_size_read(inode))
 			break;
 		error = read_rindex_entry(ip, &ra_state);
 		if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 			return error;
 		}
 	}
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
@@ -1188,7 +1200,8 @@ out:
  * Returns: errno
  */
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+			   char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 		return -EINVAL;
 
 try_again:
-	/* We need to hold the rindex unless the inode we're using is
-	   the rindex itself, in which case it's already held. */
-	if (ip != GFS2_I(sdp->sd_rindex))
-		error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-	else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-		error = gfs2_ri_update_special(ip);
+	if (hold_rindex) {
+		/* We need to hold the rindex unless the inode we're using is
+		   the rindex itself, in which case it's already held. */
+		if (ip != GFS2_I(sdp->sd_rindex))
+			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+		else if (!sdp->sd_rgrps) /* We may not have the rindex read
+					    in, so: */
+			error = gfs2_ri_update_special(ip);
+	}
 
 	if (error)
 		return error;
@@ -1215,7 +1231,7 @@ try_again:
 	   try to free it, and try the allocation again. */
 	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
 	if (error) {
-		if (ip != GFS2_I(sdp->sd_rindex))
+		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
 		if (error != -EAGAIN)
 			return error;
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 	al->al_rgd = NULL;
 	if (al->al_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	if (ip != GFS2_I(sdp->sd_rindex))
+	if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
 
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
+	struct gfs2_rgrpd *rgd;
 	u32 goal, blk;
 	u64 block;
 	int error;
 
+	/* Only happens if there is a bug in gfs2, return something distinctive
+	 * to ensure that it is noticed.
+	 */
+	if (al == NULL)
+		return -ECANCELED;
+
+	rgd = al->al_rgd;
+
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d8955..0e35c0466f9 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-				  unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+				  char *file, unsigned int line);
 #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+	gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+	gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee..047d1176096 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
 	{Opt_locktable, "locktable=%s"},
 	{Opt_hostdata, "hostdata=%s"},
 	{Opt_spectator, "spectator"},
+	{Opt_spectator, "norecovery"},
 	{Opt_ignore_local_fs, "ignore_local_fs"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_spectator = 1;
 			break;
 		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_localflocks:
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			args->ar_localcaching = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_debug:
 			if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			args->ar_upgrade = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u64 size = i_size_read(jd->jd_inode);
 
-	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
 		return -EIO;
-	}
-	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+	if (gfs2_write_alloc_required(ip, 0, size)) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	/* Some flags must not be changed */
 	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
 	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
 	    args_neq(&args, &sdp->sd_args, meta))
 		return -EINVAL;
 
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
 	if (args->ar_localflocks)
 		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
 	if (args->ar_debug)
 		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faa..748ccb557c1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 
 	if (gltype > LM_TYPE_JOURNAL)
 		return -EINVAL;
-	glops = gfs2_glops_list[gltype];
+	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+		glops = &gfs2_trans_glops;
+	else
+		glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
 	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 {
-	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+	return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
 }
 
 static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
-        unsigned jid;
+        int jid;
 	int rv;
 
-	rv = sscanf(buf, "%u", &jid);
+	rv = sscanf(buf, "%d", &jid);
 	if (rv != 1)
 		return -EINVAL;
 
 	spin_lock(&sdp->sd_jindex_spin);
 	rv = -EINVAL;
-	if (sdp->sd_args.ar_spectator)
-		goto out;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto out;
 	rv = -EBUSY;
-	if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
 		goto out;
+	rv = 0;
+	if (sdp->sd_args.ar_spectator && jid > 0)
+		rv = jid = -EINVAL;
 	sdp->sd_lockstruct.ls_jid = jid;
+	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
-	rv = 0;
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
 	return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
 	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
-		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+		add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
 	return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c1417..cedb0bb96d9 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
 	{(1UL << GLF_INVALIDATE_IN_PROGRESS),	"i" },		\
 	{(1UL << GLF_REPLY_PENDING),		"r" },		\
 	{(1UL << GLF_INITIAL),			"I" },		\
-	{(1UL << GLF_FROZEN),			"F" })
+	{(1UL << GLF_FROZEN),			"F" },		\
+	{(1UL << GLF_QUEUED),			"q" })
 
 #ifndef NUMPTY
 #define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908..fb56b783e02 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
 #define RES_JDATA	1
 #define RES_DATA	1
 #define RES_LEAF	1
+#define RES_RG_HDR	1
 #define RES_RG_BIT	2
 #define RES_EATTR	1
 #define RES_STATFS	1
 #define RES_QUOTA	2
 
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+	return (al->al_requested < al->al_rgd->rd_length)?
+	       al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes);
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bc..30b58f07c8a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
+				 blks + gfs2_rg_blocks(al) +
 				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipres;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d..571abe97b42 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7..3ebc437736f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	/* Set the correct compare function */
 	tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21..2a1d712f85d 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be..d182438c7ae 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		rec = (e + b) / 2;
 		len = hfs_brec_lenoff(bnode, rec, &off);
 		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
 		if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
 done:
@@ -75,6 +83,7 @@ done:
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
 	fd->entrylength = len - keylen;
+fail:
 	return res;
 }
 
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
 	len = hfs_brec_lenoff(bnode, fd->record, &off);
 	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03..ad57f5991eb 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
 
 int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	if (IS_ERR(page)) {
 		start = size;
@@ -150,16 +151,17 @@ done:
 	set_page_dirty(page);
 	kunmap(page);
 	*max = offset + (curr - pptr) * 32 + i - start;
-	HFSPLUS_SB(sb).free_blocks -= *max;
+	sbi->free_blocks -= *max;
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 	return start;
 }
 
 int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 
 	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
 	/* are all of the bits in range? */
-	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+	if ((offset + count) > sbi->total_blocks)
 		return -2;
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
 out:
 	set_page_dirty(page);
 	kunmap(page);
-	HFSPLUS_SB(sb).free_blocks += len;
+	sbi->free_blocks += len;
 	sb->s_dirt = 1;
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 
 	return 0;
 }
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a40..2f39d05443e 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS)
-			retval = hfs_bnode_read_u16(node, recoff) + 2;
-		else
-			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+		retval = hfs_bnode_read_u16(node, recoff) + 2;
+		if (retval > node->tree->max_key_len + 2) {
+			printk(KERN_ERR "hfs: keylen %d too large\n",
+				retval);
+			retval = 0;
+		}
 	}
 	return retval;
 }
@@ -216,7 +219,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
 	struct hfs_btree *tree;
-	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode *node, *new_node, *next_node;
 	struct hfs_bnode_desc node_desc;
 	int num_recs, new_rec_off, new_off, old_rec_off;
 	int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	new_node->type = node->type;
 	new_node->height = node->height;
 
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
 	size = tree->node_size / 2 - node->num_recs * 2 - 14;
 	old_rec_off = tree->node_size - 4;
 	num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 		/* panic? */
 		hfs_bnode_put(node);
 		hfs_bnode_put(new_node);
+		if (next_node)
+			hfs_bnode_put(next_node);
 		return ERR_PTR(-ENOSPC);
 	}
 
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
 
 	/* update next bnode header */
-	if (new_node->next) {
-		struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+	if (next_node) {
 		next_node->prev = new_node->this;
 		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
 		node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e29..22e4d4e3299 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	tree->sb = sb;
 	tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 	tree->inode = inode;
 
+	if (!HFSPLUS_I(tree->inode)->first_blocks) {
+		printk(KERN_ERR
+		       "hfs: invalid btree extent records (0 size).\n");
+		goto free_inode;
+	}
+
 	mapping = tree->inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-		goto free_tree;
+		goto free_inode;
 
 	/* Load the header */
 	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	tree->max_key_len = be16_to_cpu(head->max_key_len);
 	tree->depth = be16_to_cpu(head->depth);
 
-	/* Set the correct compare function */
-	if (id == HFSPLUS_EXT_CNID) {
+	/* Verify the tree and set the correct compare function */
+	switch (id) {
+	case HFSPLUS_EXT_CNID:
+		if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+			printk(KERN_ERR "hfs: invalid extent btree flag\n");
+			goto fail_page;
+		}
+
 		tree->keycmp = hfsplus_ext_cmp_key;
-	} else if (id == HFSPLUS_CAT_CNID) {
-		if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+		break;
+	case HFSPLUS_CAT_CNID:
+		if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+			printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+			goto fail_page;
+		}
+
+		if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
 		    (head->key_type == HFSPLUS_KEY_BINARY))
 			tree->keycmp = hfsplus_cat_bin_cmp_key;
 		else {
 			tree->keycmp = hfsplus_cat_case_cmp_key;
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 		}
-	} else {
+		break;
+	default:
 		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
 		goto fail_page;
 	}
 
+	if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+		printk(KERN_ERR "hfs: invalid btree flag\n");
+		goto fail_page;
+	}
+
 	size = tree->node_size;
 	if (!is_power_of_2(size))
 		goto fail_page;
 	if (!tree->node_count)
 		goto fail_page;
+
 	tree->node_size_shift = ffs(size) - 1;
 
 	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	return tree;
 
  fail_page:
-	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	page_cache_release(page);
- free_tree:
+ free_inode:
+	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	iput(tree->inode);
+ free_tree:
 	kfree(tree);
 	return NULL;
 }
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 
 	while (!tree->free_nodes) {
 		struct inode *inode = tree->inode;
+		struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 		u32 count;
 		int res;
 
 		res = hfsplus_file_extend(inode);
 		if (res)
 			return ERR_PTR(res);
-		HFSPLUS_I(inode).phys_size = inode->i_size =
-				(loff_t)HFSPLUS_I(inode).alloc_blocks <<
-				HFSPLUS_SB(tree->sb).alloc_blksz_shift;
-		HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-					     HFSPLUS_SB(tree->sb).fs_shift;
+		hip->phys_size = inode->i_size =
+			(loff_t)hip->alloc_blocks <<
+				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+		hip->fs_blocks =
+			hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
 		inode_set_bytes(inode, inode->i_size);
 		count = inode->i_size >> tree->node_size_shift;
 		tree->free_nodes = count - tree->node_count;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf..8af45fc5b05 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
 	key->key_len = cpu_to_be16(6 + ustrlen);
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 {
 	if (inode->i_flags & S_IMMUTABLE)
 		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->rootflags |= HFSPLUS_FLG_APPEND;
 	else
 		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	HFSPLUS_I(inode).rootflags = perms->rootflags;
-	HFSPLUS_I(inode).userflags = perms->userflags;
+
+	perms->userflags = HFSPLUS_I(inode)->userflags;
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
 	perms->group = cpu_to_be32(inode->i_gid);
+
+	if (S_ISREG(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_nlink);
+	else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_rdev);
+	else
+		perms->dev = 0;
 }
 
 static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
 	if (S_ISDIR(inode->i_mode)) {
 		struct hfsplus_cat_folder *folder;
 
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		memset(folder, 0, sizeof(*folder));
 		folder->type = cpu_to_be16(HFSPLUS_FOLDER);
 		folder->id = cpu_to_be32(inode->i_ino);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			folder->create_date =
 			folder->content_mod_date =
 			folder->attribute_mod_date =
 			folder->access_date = hfsp_now2mt();
-		hfsplus_set_perms(inode, &folder->permissions);
-		if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+		hfsplus_cat_set_perms(inode, &folder->permissions);
+		if (inode == sbi->hidden_dir)
 			/* invisible and namelocked */
 			folder->user_info.frFlags = cpu_to_be16(0x5000);
 		return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		file->type = cpu_to_be16(HFSPLUS_FILE);
 		file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
 		file->id = cpu_to_be32(cnid);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			file->create_date =
 			file->content_mod_date =
 			file->attribute_mod_date =
 			file->access_date = hfsp_now2mt();
 		if (cnid == inode->i_ino) {
-			hfsplus_set_perms(inode, &file->permissions);
+			hfsplus_cat_set_perms(inode, &file->permissions);
 			if (S_ISLNK(inode->i_mode)) {
 				file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
 				file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
 			} else {
-				file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-				file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+				file->user_info.fdType = cpu_to_be32(sbi->type);
+				file->user_info.fdCreator = cpu_to_be32(sbi->creator);
 			}
 			if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 				file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
 			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
 			file->user_info.fdFlags = cpu_to_be16(0x100);
-			file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
-			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+			file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
 		}
 		return sizeof(*file);
 	}
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 
 int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
 {
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
-	struct super_block *sb;
 	hfsplus_cat_entry entry;
 	int entry_size;
 	int err;
 
 	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
 	entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
 
 int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 {
-	struct super_block *sb;
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
 	struct hfsplus_fork_raw fork;
 	struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	u16 type;
 
 	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	if (!str) {
 		int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
 	}
 
-	list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+	list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
 		struct hfsplus_readdir_data *rd =
 			list_entry(pos, struct hfsplus_readdir_data, list);
 		if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
 		       struct inode *src_dir, struct qstr *src_name,
 		       struct inode *dst_dir, struct qstr *dst_name)
 {
-	struct super_block *sb;
+	struct super_block *sb = src_dir->i_sb;
 	struct hfs_find_data src_fd, dst_fd;
 	hfsplus_cat_entry entry;
 	int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
 		dst_dir->i_ino, dst_name->name);
-	sb = src_dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
 	dst_fd = src_fd;
 
 	/* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca8..d236d85ec9d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
 
 	dentry->d_op = &hfsplus_dentry_operations;
 	dentry->d_fsdata = NULL;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
 again:
 	err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
 		cnid = be32_to_cpu(entry.file.id);
 		if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
 		    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
-		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
-		    HFSPLUS_SB(sb).hidden_dir) {
+		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+		    HFSPLUS_SB(sb)->hidden_dir) {
 			struct qstr str;
 			char name[32];
 
@@ -86,7 +86,8 @@ again:
 				linkid = be32_to_cpu(entry.file.permissions.dev);
 				str.len = sprintf(name, "iNode%d", linkid);
 				str.name = name;
-				hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+				hfsplus_cat_build_key(sb, fd.search_key,
+					HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
 				goto again;
 			}
 		} else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (S_ISREG(inode->i_mode))
-		HFSPLUS_I(inode).dev = linkid;
+		HFSPLUS_I(inode)->linkid = linkid;
 out:
 	d_add(dentry, inode);
 	return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (filp->f_pos >= inode->i_size)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
 	err = hfs_brec_find(&fd);
 	if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				err = -EIO;
 				goto out;
 			}
-			if (HFSPLUS_SB(sb).hidden_dir &&
-			    HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+			if (HFSPLUS_SB(sb)->hidden_dir &&
+			    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+					be32_to_cpu(entry.folder.id))
 				goto next;
 			if (filldir(dirent, strbuf, len, filp->f_pos,
 				    be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		filp->private_data = rd;
 		rd->file = filp;
-		list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
 out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 {
 	struct hfsplus_readdir_data *rd = file->private_data;
 	if (rd) {
+		mutex_lock(&inode->i_mutex);
 		list_del(&rd->list);
+		mutex_unlock(&inode->i_mutex);
 		kfree(rd);
 	}
 	return 0;
 }
 
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
-			  struct nameidata *nd)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			struct dentry *dst_dentry)
 {
-	struct super_block *sb = dst_dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
 	struct inode *inode = src_dentry->d_inode;
 	struct inode *src_dir = src_dentry->d_parent->d_inode;
 	struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
+	if (!S_ISREG(inode->i_mode))
+		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
 		for (;;) {
 			get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			str.len = sprintf(name, "iNode%d", id);
 			res = hfsplus_rename_cat(inode->i_ino,
 						 src_dir, &src_dentry->d_name,
-						 HFSPLUS_SB(sb).hidden_dir, &str);
+						 sbi->hidden_dir, &str);
 			if (!res)
 				break;
 			if (res != -EEXIST)
-				return res;
+				goto out;
 		}
-		HFSPLUS_I(inode).dev = id;
-		cnid = HFSPLUS_SB(sb).next_cnid++;
+		HFSPLUS_I(inode)->linkid = id;
+		cnid = sbi->next_cnid++;
 		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
 		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
 		if (res)
 			/* panic? */
-			return res;
-		HFSPLUS_SB(sb).file_count++;
+			goto out;
+		sbi->file_count++;
 	}
-	cnid = HFSPLUS_SB(sb).next_cnid++;
+	cnid = sbi->next_cnid++;
 	res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
 	if (res)
-		return res;
+		goto out;
 
 	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
 	atomic_inc(&inode->i_count);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	HFSPLUS_SB(sb).file_count++;
-	sb->s_dirt = 1;
-
-	return 0;
+	sbi->file_count++;
+	dst_dir->i_sb->s_dirt = 1;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct super_block *sb = dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode = dentry->d_inode;
 	struct qstr str;
 	char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	cnid = (u32)(unsigned long)dentry->d_fsdata;
 	if (inode->i_ino == cnid &&
-	    atomic_read(&HFSPLUS_I(inode).opencnt)) {
+	    atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 		str.name = name;
 		str.len = sprintf(name, "temp%lu", inode->i_ino);
 		res = hfsplus_rename_cat(inode->i_ino,
 					 dir, &dentry->d_name,
-					 HFSPLUS_SB(sb).hidden_dir, &str);
+					 sbi->hidden_dir, &str);
 		if (!res)
 			inode->i_flags |= S_DEAD;
-		return res;
+		goto out;
 	}
 	res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 
 	if (inode->i_nlink > 0)
 		drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		clear_nlink(inode);
 	if (!inode->i_nlink) {
 		if (inode->i_ino != cnid) {
-			HFSPLUS_SB(sb).file_count--;
-			if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+			sbi->file_count--;
+			if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 				res = hfsplus_delete_cat(inode->i_ino,
-							 HFSPLUS_SB(sb).hidden_dir,
+							 sbi->hidden_dir,
 							 NULL);
 				if (!res)
 					hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		} else
 			hfsplus_delete_inode(inode);
 	} else
-		HFSPLUS_SB(sb).file_count--;
+		sbi->file_count--;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
 	int res;
 
-	inode = dentry->d_inode;
 	if (inode->i_size != 2)
 		return -ENOTEMPTY;
+
+	mutex_lock(&sbi->vh_mutex);
 	res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	hfsplus_delete_inode(inode);
 	mark_inode_dirty(inode);
-	return 0;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 			   const char *symname)
 {
-	struct super_block *sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
 
 	res = page_symlink(inode, symname, strlen(symname) + 1);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
+	if (res)
+		goto out_err;
 
-	mark_inode_dirty(inode);
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res)
+		goto out_err;
 
-	if (!res) {
-		hfsplus_instantiate(dentry, inode, inode->i_ino);
-		mark_inode_dirty(inode);
-	}
+	hfsplus_instantiate(dentry, inode, inode->i_ino);
+	mark_inode_dirty(inode);
+	goto out;
 
+out_err:
+	inode->i_nlink = 0;
+	hfsplus_delete_inode(inode);
+	iput(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
 static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 			 int mode, dev_t rdev)
 {
-	struct super_block *sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, mode);
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, mode);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+		init_special_inode(inode, mode, rdev);
 
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
 	if (res) {
 		inode->i_nlink = 0;
 		hfsplus_delete_inode(inode);
 		iput(inode);
-		return res;
+		goto out;
 	}
-	init_special_inode(inode, mode, rdev);
+
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
 
-	return 0;
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+			  struct nameidata *nd)
+{
+	return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
 }
 
 static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		res = hfsplus_unlink(new_dir, new_dentry);
+		if (S_ISDIR(new_dentry->d_inode->i_mode))
+			res = hfsplus_rmdir(new_dir, new_dentry);
+		else
+			res = hfsplus_unlink(new_dir, new_dentry);
 		if (res)
 			return res;
 	}
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cd..0c9cb1820a5 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
 
 static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
-			      HFSPLUS_IS_RSRC(inode) ?  HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+			      HFSPLUS_IS_RSRC(inode) ?
+				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
 	res = hfs_brec_find(fd);
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+	if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
 		if (res != -ENOENT)
 			return;
-		hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hfs_brec_insert(fd, hip->cached_extents,
+				sizeof(hfsplus_extent_rec));
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	} else {
 		if (res)
 			return;
-		hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
-		HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+		hfs_bnode_write(fd->bnode, hip->cached_extents,
+				fd->entryoffset, fd->entrylength);
+		hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
 	}
 }
 
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+	if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
 		struct hfs_find_data fd;
 
-		hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+		hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 		__hfsplus_ext_write_extent(inode, &fd);
 		hfs_find_exit(&fd);
 	}
 }
 
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+	mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+	hfsplus_ext_write_extent_locked(inode);
+	mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
 static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 					    struct hfsplus_extent *extent,
 					    u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 
 static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
 		__hfsplus_ext_write_extent(inode, fd);
 
-	res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
-					block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+					block, HFSPLUS_IS_RSRC(inode) ?
+						HFSPLUS_TYPE_RSRC :
+						HFSPLUS_TYPE_DATA);
 	if (!res) {
-		HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
-		HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+		hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+		hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
 	} else {
-		HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	}
 	return res;
 }
 
 static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfs_find_data fd;
 	int res;
 
-	if (block >= HFSPLUS_I(inode).cached_start &&
-	    block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
+	if (block >= hip->cached_start &&
+	    block < hip->cached_start + hip->cached_blocks)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 	res = __hfsplus_ext_cache_extent(&fd, inode, block);
 	hfs_find_exit(&fd);
 	return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 int hfsplus_get_block(struct inode *inode, sector_t iblock,
 		      struct buffer_head *bh_result, int create)
 {
-	struct super_block *sb;
+	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res = -EIO;
 	u32 ablock, dblock, mask;
 	int shift;
 
-	sb = inode->i_sb;
-
 	/* Convert inode block to disk allocation block */
-	shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits;
-	ablock = iblock >> HFSPLUS_SB(sb).fs_shift;
+	shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
+	ablock = iblock >> sbi->fs_shift;
 
-	if (iblock >= HFSPLUS_I(inode).fs_blocks) {
-		if (iblock > HFSPLUS_I(inode).fs_blocks || !create)
+	if (iblock >= hip->fs_blocks) {
+		if (iblock > hip->fs_blocks || !create)
 			return -EIO;
-		if (ablock >= HFSPLUS_I(inode).alloc_blocks) {
+		if (ablock >= hip->alloc_blocks) {
 			res = hfsplus_file_extend(inode);
 			if (res)
 				return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 	} else
 		create = 0;
 
-	if (ablock < HFSPLUS_I(inode).first_blocks) {
-		dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock);
+	if (ablock < hip->first_blocks) {
+		dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
 		goto done;
 	}
 
 	if (inode->i_ino == HFSPLUS_EXT_CNID)
 		return -EIO;
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
+	mutex_lock(&hip->extents_lock);
 	res = hfsplus_ext_read_extent(inode, ablock);
 	if (!res) {
-		dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
-					     HFSPLUS_I(inode).cached_start);
+		dblock = hfsplus_ext_find_block(hip->cached_extents,
+						ablock - hip->cached_start);
 	} else {
-		mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+		mutex_unlock(&hip->extents_lock);
 		return -EIO;
 	}
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 
 done:
 	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
-	mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1;
-	map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask));
+	mask = (1 << sbi->fs_shift) - 1;
+	map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
 	if (create) {
 		set_buffer_new(bh_result);
-		HFSPLUS_I(inode).phys_size += sb->s_blocksize;
-		HFSPLUS_I(inode).fs_blocks++;
+		hip->phys_size += sb->s_blocksize;
+		hip->fs_blocks++;
 		inode_add_bytes(inode, sb->s_blocksize);
 		mark_inode_dirty(inode);
 	}
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
 	if (total_blocks == blocks)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
 	do {
 		res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
 						total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
 int hfsplus_file_extend(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	u32 start, len, goal;
 	int res;
 
-	if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) {
+	if (sbi->alloc_file->i_size * 8 <
+	    sbi->total_blocks - sbi->free_blocks + 8) {
 		// extend alloc file
-		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
-			HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks);
+		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+				sbi->alloc_file->i_size * 8,
+				sbi->total_blocks, sbi->free_blocks);
 		return -ENOSPC;
 	}
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
-	if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
-		goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
+	mutex_lock(&hip->extents_lock);
+	if (hip->alloc_blocks == hip->first_blocks)
+		goal = hfsplus_ext_lastblock(hip->first_extents);
 	else {
-		res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks);
+		res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
 		if (res)
 			goto out;
-		goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents);
+		goal = hfsplus_ext_lastblock(hip->cached_extents);
 	}
 
-	len = HFSPLUS_I(inode).clump_blocks;
-	start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len);
-	if (start >= HFSPLUS_SB(sb).total_blocks) {
+	len = hip->clump_blocks;
+	start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
+	if (start >= sbi->total_blocks) {
 		start = hfsplus_block_allocate(sb, goal, 0, &len);
 		if (start >= goal) {
 			res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
 	}
 
 	dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
-	if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) {
-		if (!HFSPLUS_I(inode).first_blocks) {
+
+	if (hip->alloc_blocks <= hip->first_blocks) {
+		if (!hip->first_blocks) {
 			dprint(DBG_EXTENT, "first extents\n");
 			/* no extents yet */
-			HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start);
-			HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len);
+			hip->first_extents[0].start_block = cpu_to_be32(start);
+			hip->first_extents[0].block_count = cpu_to_be32(len);
 			res = 0;
 		} else {
 			/* try to append to extents in inode */
-			res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents,
-						 HFSPLUS_I(inode).alloc_blocks,
+			res = hfsplus_add_extent(hip->first_extents,
+						 hip->alloc_blocks,
 						 start, len);
 			if (res == -ENOSPC)
 				goto insert_extent;
 		}
 		if (!res) {
-			hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-			HFSPLUS_I(inode).first_blocks += len;
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks += len;
 		}
 	} else {
-		res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents,
-					 HFSPLUS_I(inode).alloc_blocks -
-					 HFSPLUS_I(inode).cached_start,
+		res = hfsplus_add_extent(hip->cached_extents,
+					 hip->alloc_blocks - hip->cached_start,
 					 start, len);
 		if (!res) {
-			hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-			HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
-			HFSPLUS_I(inode).cached_blocks += len;
+			hfsplus_dump_extent(hip->cached_extents);
+			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->cached_blocks += len;
 		} else if (res == -ENOSPC)
 			goto insert_extent;
 	}
 out:
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 	if (!res) {
-		HFSPLUS_I(inode).alloc_blocks += len;
+		hip->alloc_blocks += len;
 		mark_inode_dirty(inode);
 	}
 	return res;
 
 insert_extent:
 	dprint(DBG_EXTENT, "insert new extent\n");
-	hfsplus_ext_write_extent(inode);
+	hfsplus_ext_write_extent_locked(inode);
 
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start);
-	HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len);
-	hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-	HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
-	HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks;
-	HFSPLUS_I(inode).cached_blocks = len;
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_extents[0].start_block = cpu_to_be32(start);
+	hip->cached_extents[0].block_count = cpu_to_be32(len);
+	hfsplus_dump_extent(hip->cached_extents);
+	hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+	hip->cached_start = hip->alloc_blocks;
+	hip->cached_blocks = len;
 
 	res = 0;
 	goto out;
@@ -437,13 +461,15 @@ insert_extent:
 void hfsplus_file_truncate(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfs_find_data fd;
 	u32 alloc_cnt, blk_cnt, start;
 	int res;
 
-	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
-	       (long long)HFSPLUS_I(inode).phys_size, inode->i_size);
-	if (inode->i_size > HFSPLUS_I(inode).phys_size) {
+	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+		inode->i_ino, (long long)hip->phys_size, inode->i_size);
+
+	if (inode->i_size > hip->phys_size) {
 		struct address_space *mapping = inode->i_mapping;
 		struct page *page;
 		void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
 			return;
 		mark_inode_dirty(inode);
 		return;
-	} else if (inode->i_size == HFSPLUS_I(inode).phys_size)
+	} else if (inode->i_size == hip->phys_size)
 		return;
 
-	blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	alloc_cnt = HFSPLUS_I(inode).alloc_blocks;
+	blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
+			HFSPLUS_SB(sb)->alloc_blksz_shift;
+	alloc_cnt = hip->alloc_blocks;
 	if (blk_cnt == alloc_cnt)
 		goto out;
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
-	hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+	mutex_lock(&hip->extents_lock);
+	hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
 	while (1) {
-		if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
-			hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents,
+		if (alloc_cnt == hip->first_blocks) {
+			hfsplus_free_extents(sb, hip->first_extents,
 					     alloc_cnt, alloc_cnt - blk_cnt);
-			hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-			HFSPLUS_I(inode).first_blocks = blk_cnt;
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks = blk_cnt;
 			break;
 		}
 		res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
 		if (res)
 			break;
-		start = HFSPLUS_I(inode).cached_start;
-		hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents,
+		start = hip->cached_start;
+		hfsplus_free_extents(sb, hip->cached_extents,
 				     alloc_cnt - start, alloc_cnt - blk_cnt);
-		hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
+		hfsplus_dump_extent(hip->cached_extents);
 		if (blk_cnt > start) {
-			HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
 			break;
 		}
 		alloc_cnt = start;
-		HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 		hfs_brec_remove(&fd);
 	}
 	hfs_find_exit(&fd);
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 
-	HFSPLUS_I(inode).alloc_blocks = blk_cnt;
+	hip->alloc_blocks = blk_cnt;
 out:
-	HFSPLUS_I(inode).phys_size = inode->i_size;
-	HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-	inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
+	hip->phys_size = inode->i_size;
+	hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
 	mark_inode_dirty(inode);
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b..cb3653efb57 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
 	u32 sect_count;
 	int fs_shift;
 
-	/* Stuff in host order from Vol Header */
+	/* immutable data from the volume header */
 	u32 alloc_blksz;
 	int alloc_blksz_shift;
 	u32 total_blocks;
+	u32 data_clump_blocks, rsrc_clump_blocks;
+
+	/* mutable data from the volume header, protected by alloc_mutex */
 	u32 free_blocks;
-	u32 next_alloc;
+	struct mutex alloc_mutex;
+
+	/* mutable data from the volume header, protected by vh_mutex */
 	u32 next_cnid;
 	u32 file_count;
 	u32 folder_count;
-	u32 data_clump_blocks, rsrc_clump_blocks;
+	struct mutex vh_mutex;
 
 	/* Config options */
 	u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
 	int part, session;
 
 	unsigned long flags;
-
-	struct hlist_head rsrc_inodes;
 };
 
-#define HFSPLUS_SB_WRITEBACKUP	0x0001
-#define HFSPLUS_SB_NODECOMPOSE	0x0002
-#define HFSPLUS_SB_FORCE	0x0004
-#define HFSPLUS_SB_HFSX		0x0008
-#define HFSPLUS_SB_CASEFOLD	0x0010
+#define HFSPLUS_SB_WRITEBACKUP	0
+#define HFSPLUS_SB_NODECOMPOSE	1
+#define HFSPLUS_SB_FORCE	2
+#define HFSPLUS_SB_HFSX		3
+#define HFSPLUS_SB_CASEFOLD	4
 
 
 struct hfsplus_inode_info {
-	struct mutex extents_lock;
-	u32 clump_blocks, alloc_blocks;
-	sector_t fs_blocks;
-	/* Allocation extents from catalog record or volume header */
-	hfsplus_extent_rec first_extents;
-	u32 first_blocks;
-	hfsplus_extent_rec cached_extents;
-	u32 cached_start, cached_blocks;
 	atomic_t opencnt;
 
-	struct inode *rsrc_inode;
+	/*
+	 * Extent allocation information, protected by extents_lock.
+	 */
+	u32 first_blocks;
+	u32 clump_blocks;
+	u32 alloc_blocks;
+	u32 cached_start;
+	u32 cached_blocks;
+	hfsplus_extent_rec first_extents;
+	hfsplus_extent_rec cached_extents;
 	unsigned long flags;
+	struct mutex extents_lock;
 
+	/*
+	 * Immutable data.
+	 */
+	struct inode *rsrc_inode;
 	__be32 create_date;
-	/* Device number in hfsplus_permissions in catalog */
-	u32 dev;
-	/* BSD system and user file flags */
-	u8 rootflags;
-	u8 userflags;
 
+	/*
+	 * Protected by sbi->vh_mutex.
+	 */
+	u32 linkid;
+
+	/*
+	 * Protected by i_mutex.
+	 */
+	sector_t fs_blocks;
+	u8 userflags;		/* BSD user file flags */
 	struct list_head open_dir_list;
 	loff_t phys_size;
+
 	struct inode vfs_inode;
 };
 
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
 #define HFSPLUS_FLG_EXT_DIRTY	0x0002
 #define HFSPLUS_FLG_EXT_NEW	0x0004
 
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC))
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)
+#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
 
 struct hfs_find_data {
 	/* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
 int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
 int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
 		       struct inode *, struct qstr *);
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
 
 /* dir.c */
 extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
 int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 
 /* access macros */
-/*
 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
+
 static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 {
 	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
 }
-*/
-#define HFSPLUS_SB(super)	(*(struct hfsplus_sb_info *)(super)->s_fs_info)
-#define HFSPLUS_I(inode)	(*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
-
-#if 1
-#define hfsplus_kmap(p)		({ struct page *__p = (p); kmap(__p); })
-#define hfsplus_kunmap(p)	({ struct page *__p = (p); kunmap(__p); __p; })
-#else
-#define hfsplus_kmap(p)		kmap(p)
-#define hfsplus_kunmap(p)	kunmap(p)
-#endif
 
 #define sb_bread512(sb, sec, data) ({			\
 	struct buffer_head *__bh;			\
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 #define hfsp_ut2mt(t)		__hfsp_ut2mt((t).tv_sec)
 #define hfsp_now2mt()		__hfsp_ut2mt(get_seconds())
 
-#define kdev_t_to_nr(x)		(x)
-
 #endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61..6892899fd6f 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
 	struct hfsplus_unistr name;
 } __packed;
 
+#define HFSPLUS_CAT_KEYLEN	(sizeof(struct hfsplus_cat_key))
 
 /* Structs from hfs.h */
 struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
 	__be32 start_block;
 } __packed;
 
-#define HFSPLUS_EXT_KEYLEN 12
+#define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key)
 
 /* HFS+ generic BTree key */
 typedef union {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c6..78449280dae 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
 	*pagep = NULL;
 	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfsplus_get_block,
-				&HFSPLUS_I(mapping->host).phys_size);
+				&HFSPLUS_I(mapping->host)->phys_size);
 	if (unlikely(ret)) {
 		loff_t isize = mapping->host->i_size;
 		if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 
 	switch (inode->i_ino) {
 	case HFSPLUS_EXT_CNID:
-		tree = HFSPLUS_SB(sb).ext_tree;
+		tree = HFSPLUS_SB(sb)->ext_tree;
 		break;
 	case HFSPLUS_CAT_CNID:
-		tree = HFSPLUS_SB(sb).cat_tree;
+		tree = HFSPLUS_SB(sb)->cat_tree;
 		break;
 	case HFSPLUS_ATTR_CNID:
-		tree = HFSPLUS_SB(sb).attr_tree;
+		tree = HFSPLUS_SB(sb)->attr_tree;
 		break;
 	default:
 		BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	struct hfs_find_data fd;
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = NULL;
+	struct hfsplus_inode_info *hip;
 	int err;
 
 	if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
 		goto out;
 
-	inode = HFSPLUS_I(dir).rsrc_inode;
+	inode = HFSPLUS_I(dir)->rsrc_inode;
 	if (inode)
 		goto out;
 
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	hip = HFSPLUS_I(inode);
 	inode->i_ino = dir->i_ino;
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
+	INIT_LIST_HEAD(&hip->open_dir_list);
+	mutex_init(&hip->extents_lock);
+	hip->flags = HFSPLUS_FLG_RSRC;
 
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	err = hfsplus_find_cat(sb, dir->i_ino, &fd);
 	if (!err)
 		err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 		iput(inode);
 		return ERR_PTR(err);
 	}
-	HFSPLUS_I(inode).rsrc_inode = dir;
-	HFSPLUS_I(dir).rsrc_inode = inode;
+	hip->rsrc_inode = dir;
+	HFSPLUS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes);
+
+	/*
+	 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+	 * want resource fork inodes in the regular inode space, we make them
+	 * appear hashed, but do not put on any lists.  hlist_del()
+	 * will work fine and require no locking.
+	 */
+	inode->i_hash.pprev = &inode->i_hash.next;
+
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
@@ -211,30 +221,27 @@ out:
 
 static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
 {
-	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
 	u16 mode;
 
 	mode = be16_to_cpu(perms->mode);
 
 	inode->i_uid = be32_to_cpu(perms->owner);
 	if (!inode->i_uid && !mode)
-		inode->i_uid = HFSPLUS_SB(sb).uid;
+		inode->i_uid = sbi->uid;
 
 	inode->i_gid = be32_to_cpu(perms->group);
 	if (!inode->i_gid && !mode)
-		inode->i_gid = HFSPLUS_SB(sb).gid;
+		inode->i_gid = sbi->gid;
 
 	if (dir) {
-		mode = mode ? (mode & S_IALLUGO) :
-			(S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
+		mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
 		mode |= S_IFDIR;
 	} else if (!mode)
-		mode = S_IFREG | ((S_IRUGO|S_IWUGO) &
-			~(HFSPLUS_SB(sb).umask));
+		mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
 	inode->i_mode = mode;
 
-	HFSPLUS_I(inode).rootflags = perms->rootflags;
-	HFSPLUS_I(inode).userflags = perms->userflags;
+	HFSPLUS_I(inode)->userflags = perms->userflags;
 	if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 	else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
 		inode->i_flags &= ~S_APPEND;
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
-{
-	if (inode->i_flags & S_IMMUTABLE)
-		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
-	else
-		perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-	if (inode->i_flags & S_APPEND)
-		perms->rootflags |= HFSPLUS_FLG_APPEND;
-	else
-		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	perms->userflags = HFSPLUS_I(inode).userflags;
-	perms->mode = cpu_to_be16(inode->i_mode);
-	perms->owner = cpu_to_be32(inode->i_uid);
-	perms->group = cpu_to_be32(inode->i_gid);
-	perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
-}
-
 static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
-		inode = HFSPLUS_I(inode).rsrc_inode;
+		inode = HFSPLUS_I(inode)->rsrc_inode;
 	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
 		return -EOVERFLOW;
-	atomic_inc(&HFSPLUS_I(inode).opencnt);
+	atomic_inc(&HFSPLUS_I(inode)->opencnt);
 	return 0;
 }
 
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	struct super_block *sb = inode->i_sb;
 
 	if (HFSPLUS_IS_RSRC(inode))
-		inode = HFSPLUS_I(inode).rsrc_inode;
-	if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
+		inode = HFSPLUS_I(inode)->rsrc_inode;
+	if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
 		mutex_lock(&inode->i_mutex);
 		hfsplus_file_truncate(inode);
 		if (inode->i_flags & S_DEAD) {
-			hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+			hfsplus_delete_cat(inode->i_ino,
+					   HFSPLUS_SB(sb)->hidden_dir, NULL);
 			hfsplus_delete_inode(inode);
 		}
 		mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
 
 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct inode *inode = new_inode(sb);
+	struct hfsplus_inode_info *hip;
+
 	if (!inode)
 		return NULL;
 
-	inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
+	inode->i_ino = sbi->next_cnid++;
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-	HFSPLUS_I(inode).flags = 0;
-	memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).alloc_blocks = 0;
-	HFSPLUS_I(inode).first_blocks = 0;
-	HFSPLUS_I(inode).cached_start = 0;
-	HFSPLUS_I(inode).cached_blocks = 0;
-	HFSPLUS_I(inode).phys_size = 0;
-	HFSPLUS_I(inode).fs_blocks = 0;
-	HFSPLUS_I(inode).rsrc_inode = NULL;
+
+	hip = HFSPLUS_I(inode);
+	INIT_LIST_HEAD(&hip->open_dir_list);
+	mutex_init(&hip->extents_lock);
+	atomic_set(&hip->opencnt, 0);
+	hip->flags = 0;
+	memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->alloc_blocks = 0;
+	hip->first_blocks = 0;
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+	hip->phys_size = 0;
+	hip->fs_blocks = 0;
+	hip->rsrc_inode = NULL;
 	if (S_ISDIR(inode->i_mode)) {
 		inode->i_size = 2;
-		HFSPLUS_SB(sb).folder_count++;
+		sbi->folder_count++;
 		inode->i_op = &hfsplus_dir_inode_operations;
 		inode->i_fop = &hfsplus_dir_operations;
 	} else if (S_ISREG(inode->i_mode)) {
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 		inode->i_op = &hfsplus_file_inode_operations;
 		inode->i_fop = &hfsplus_file_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
-		HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks;
+		hip->clump_blocks = sbi->data_clump_blocks;
 	} else if (S_ISLNK(inode->i_mode)) {
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 		inode->i_op = &page_symlink_inode_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
-		HFSPLUS_I(inode).clump_blocks = 1;
+		hip->clump_blocks = 1;
 	} else
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 	sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 
 	if (S_ISDIR(inode->i_mode)) {
-		HFSPLUS_SB(sb).folder_count--;
+		HFSPLUS_SB(sb)->folder_count--;
 		sb->s_dirt = 1;
 		return;
 	}
-	HFSPLUS_SB(sb).file_count--;
+	HFSPLUS_SB(sb)->file_count--;
 	if (S_ISREG(inode->i_mode)) {
 		if (!inode->i_nlink) {
 			inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
 void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	u32 count;
 	int i;
 
-	memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents,
-	       sizeof(hfsplus_extent_rec));
+	memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
 	for (count = 0, i = 0; i < 8; i++)
 		count += be32_to_cpu(fork->extents[i].block_count);
-	HFSPLUS_I(inode).first_blocks = count;
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).cached_start = 0;
-	HFSPLUS_I(inode).cached_blocks = 0;
-
-	HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks);
-	inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size);
-	HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-	inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
-	HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_I(inode).clump_blocks)
-		HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks :
-				HFSPLUS_SB(sb).data_clump_blocks;
+	hip->first_blocks = count;
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+
+	hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
+	hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
+	hip->fs_blocks =
+		(inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+	hip->clump_blocks =
+		be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
+	if (!hip->clump_blocks) {
+		hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+			sbi->rsrc_clump_blocks :
+			sbi->data_clump_blocks;
+	}
 }
 
 void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 {
-	memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents,
+	memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
 	       sizeof(hfsplus_extent_rec));
 	fork->total_size = cpu_to_be64(inode->i_size);
-	fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks);
+	fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
 }
 
 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 
 	type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
 
-	HFSPLUS_I(inode).dev = 0;
+	HFSPLUS_I(inode)->linkid = 0;
 	if (type == HFSPLUS_FOLDER) {
 		struct hfsplus_cat_folder *folder = &entry.folder;
 
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_atime = hfsp_mt2ut(folder->access_date);
 		inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
 		inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
-		HFSPLUS_I(inode).create_date = folder->create_date;
-		HFSPLUS_I(inode).fs_blocks = 0;
+		HFSPLUS_I(inode)->create_date = folder->create_date;
+		HFSPLUS_I(inode)->fs_blocks = 0;
 		inode->i_op = &hfsplus_dir_inode_operations;
 		inode->i_fop = &hfsplus_dir_operations;
 	} else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_atime = hfsp_mt2ut(file->access_date);
 		inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
 		inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
-		HFSPLUS_I(inode).create_date = file->create_date;
+		HFSPLUS_I(inode)->create_date = file->create_date;
 	} else {
 		printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
 		res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
 	hfsplus_cat_entry entry;
 
 	if (HFSPLUS_IS_RSRC(inode))
-		main_inode = HFSPLUS_I(inode).rsrc_inode;
+		main_inode = HFSPLUS_I(inode)->rsrc_inode;
 
 	if (!main_inode->i_nlink)
 		return 0;
 
-	if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd))
+	if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
 		/* panic? */
 		return -EIO;
 
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 					sizeof(struct hfsplus_cat_folder));
 		/* simple node checks? */
-		hfsplus_set_perms(inode, &folder->permissions);
+		hfsplus_cat_set_perms(inode, &folder->permissions);
 		folder->access_date = hfsp_ut2mt(inode->i_atime);
 		folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
 		folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 					sizeof(struct hfsplus_cat_file));
 		hfsplus_inode_write_fork(inode, &file->data_fork);
-		if (S_ISREG(inode->i_mode))
-			HFSPLUS_I(inode).dev = inode->i_nlink;
-		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-			HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
-		hfsplus_set_perms(inode, &file->permissions);
+		hfsplus_cat_set_perms(inode, &file->permissions);
 		if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
 		else
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f09902..5b4667e08ef 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
 
-long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	unsigned int flags = 0;
+
+	if (inode->i_flags & S_IMMUTABLE)
+		flags |= FS_IMMUTABLE_FL;
+	if (inode->i_flags |= S_APPEND)
+		flags |= FS_APPEND_FL;
+	if (hip->userflags & HFSPLUS_FLG_NODUMP)
+		flags |= FS_NODUMP_FL;
+
+	return put_user(flags, user_flags);
+}
+
+static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags;
+	int err = 0;
 
-	lock_kernel();
-	switch (cmd) {
-	case HFSPLUS_IOC_EXT2_GETFLAGS:
-		flags = 0;
-		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
-			flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
-		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
-			flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
-		if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
-			flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
-		return put_user(flags, (int __user *)arg);
-	case HFSPLUS_IOC_EXT2_SETFLAGS: {
-		int err = 0;
-		err = mnt_want_write(filp->f_path.mnt);
-		if (err) {
-			unlock_kernel();
-			return err;
-		}
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
+		goto out;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto setflags_out;
-		}
-		if (get_user(flags, (int __user *)arg)) {
-			err = -EFAULT;
-			goto setflags_out;
-		}
-		if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
-		    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				err = -EPERM;
-				goto setflags_out;
-			}
-		}
+	if (!is_owner_or_cap(inode)) {
+		err = -EACCES;
+		goto out_drop_write;
+	}
 
-		/* don't silently ignore unsupported ext2 flags */
-		if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
-			err = -EOPNOTSUPP;
-			goto setflags_out;
-		}
-		if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
-			inode->i_flags |= S_IMMUTABLE;
-			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
-		} else {
-			inode->i_flags &= ~S_IMMUTABLE;
-			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-		}
-		if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
-			inode->i_flags |= S_APPEND;
-			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
-		} else {
-			inode->i_flags &= ~S_APPEND;
-			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+	if (get_user(flags, user_flags)) {
+		err = -EFAULT;
+		goto out_drop_write;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
+	    inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			err = -EPERM;
+			goto out_unlock_inode;
 		}
-		if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
-			HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
-		else
-			HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
-
-		inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(inode);
-setflags_out:
-		mnt_drop_write(filp->f_path.mnt);
-		unlock_kernel();
-		return err;
 	}
+
+	/* don't silently ignore unsupported ext2 flags */
+	if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+		err = -EOPNOTSUPP;
+		goto out_unlock_inode;
+	}
+
+	if (flags & FS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+
+	if (flags & FS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+
+	if (flags & FS_NODUMP_FL)
+		hip->userflags |= HFSPLUS_FLG_NODUMP;
+	else
+		hip->userflags &= ~HFSPLUS_FLG_NODUMP;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+
+out_unlock_inode:
+	mutex_lock(&inode->i_mutex);
+out_drop_write:
+	mnt_drop_write(file->f_path.mnt);
+out:
+	return err;
+}
+
+long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case HFSPLUS_IOC_EXT2_GETFLAGS:
+		return hfsplus_ioctl_getflags(file, argp);
+	case HFSPLUS_IOC_EXT2_SETFLAGS:
+		return hfsplus_ioctl_setflags(file, argp);
 	default:
-		unlock_kernel();
 		return -ENOTTY;
 	}
 }
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
 		return -EOPNOTSUPP;
 
-	res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 	if (res)
 		return res;
 	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 		return -EOPNOTSUPP;
 
 	if (size) {
-		res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 		if (res)
 			return res;
 		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 		} else
 			res = size ? -ERANGE : 4;
 	} else
-		res = -ENODATA;
+		res = -EOPNOTSUPP;
 out:
 	if (size)
 		hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07..f9ab276a4d8 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 			kfree(p);
 			break;
 		case opt_decompose:
-			sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE;
+			clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
 			break;
 		case opt_nodecompose:
-			sbi->flags |= HFSPLUS_SB_NODECOMPOSE;
+			set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
 			break;
 		case opt_force:
-			sbi->flags |= HFSPLUS_SB_FORCE;
+			set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
 			break;
 		default:
 			return 0;
@@ -171,7 +171,7 @@ done:
 
 int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 {
-	struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb);
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
 
 	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
 		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 		seq_printf(seq, ",session=%u", sbi->session);
 	if (sbi->nls)
 		seq_printf(seq, ",nls=%s", sbi->nls->charset);
-	if (sbi->flags & HFSPLUS_SB_NODECOMPOSE)
+	if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
 		seq_printf(seq, ",nodecompose");
 	return 0;
 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd029..208b16c645c 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
 int hfs_part_find(struct super_block *sb,
 		  sector_t *part_start, sector_t *part_size)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct buffer_head *bh;
 	__be16 *data;
 	int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
 		for (i = 0; i < size; p++, i++) {
 			if (p->pdStart && p->pdSize &&
 			    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-			    (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+			    (sbi->part < 0 || sbi->part == i)) {
 				*part_start += be32_to_cpu(p->pdStart);
 				*part_size = be32_to_cpu(p->pdSize);
 				res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
 		size = be32_to_cpu(pm->pmMapBlkCnt);
 		for (i = 0; i < size;) {
 			if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-			    (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+			    (sbi->part < 0 || sbi->part == i)) {
 				*part_start += be32_to_cpu(pm->pmPyPartStart);
 				*part_size = be32_to_cpu(pm->pmPartBlkCnt);
 				res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c74..9a88d753610 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/nls.h>
 
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
 
 #include "hfsplus_fs.h"
 
-struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+static int hfsplus_system_read_inode(struct inode *inode)
 {
-	struct hfs_find_data fd;
-	struct hfsplus_vh *vhdr;
-	struct inode *inode;
-	long err = -EIO;
-
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
-		return inode;
+	struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
 
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	HFSPLUS_I(inode).flags = 0;
-	HFSPLUS_I(inode).rsrc_inode = NULL;
-	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-
-	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-	read_inode:
-		hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
-		err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
-		if (!err)
-			err = hfsplus_cat_read_inode(inode, &fd);
-		hfs_find_exit(&fd);
-		if (err)
-			goto bad_inode;
-		goto done;
-	}
-	vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
-	switch(inode->i_ino) {
-	case HFSPLUS_ROOT_CNID:
-		goto read_inode;
+	switch (inode->i_ino) {
 	case HFSPLUS_EXT_CNID:
 		hfsplus_inode_read_fork(inode, &vhdr->ext_file);
 		inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 		inode->i_mapping->a_ops = &hfsplus_btree_aops;
 		break;
 	default:
-		goto bad_inode;
+		return -EIO;
+	}
+
+	return 0;
+}
+
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+{
+	struct hfs_find_data fd;
+	struct inode *inode;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+	mutex_init(&HFSPLUS_I(inode)->extents_lock);
+	HFSPLUS_I(inode)->flags = 0;
+	HFSPLUS_I(inode)->rsrc_inode = NULL;
+	atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID) {
+		hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+		err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+		if (!err)
+			err = hfsplus_cat_read_inode(inode, &fd);
+		hfs_find_exit(&fd);
+	} else {
+		err = hfsplus_system_read_inode(inode);
+	}
+
+	if (err) {
+		iget_failed(inode);
+		return ERR_PTR(err);
 	}
 
-done:
 	unlock_new_inode(inode);
 	return inode;
-
-bad_inode:
-	iget_failed(inode);
-	return ERR_PTR(err);
 }
 
-static int hfsplus_write_inode(struct inode *inode,
-		struct writeback_control *wbc)
+static int hfsplus_system_write_inode(struct inode *inode)
 {
-	struct hfsplus_vh *vhdr;
-	int ret = 0;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
+	struct hfsplus_fork_raw *fork;
+	struct hfs_btree *tree = NULL;
 
-	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
-	hfsplus_ext_write_extent(inode);
-	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-		return hfsplus_cat_write_inode(inode);
-	}
-	vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
 	switch (inode->i_ino) {
-	case HFSPLUS_ROOT_CNID:
-		ret = hfsplus_cat_write_inode(inode);
-		break;
 	case HFSPLUS_EXT_CNID:
-		if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->ext_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
+		fork = &vhdr->ext_file;
+		tree = sbi->ext_tree;
 		break;
 	case HFSPLUS_CAT_CNID:
-		if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->cat_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
+		fork = &vhdr->cat_file;
+		tree = sbi->cat_tree;
 		break;
 	case HFSPLUS_ALLOC_CNID:
-		if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
+		fork = &vhdr->alloc_file;
 		break;
 	case HFSPLUS_START_CNID:
-		if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->start_file);
+		fork = &vhdr->start_file;
 		break;
 	case HFSPLUS_ATTR_CNID:
-		if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->attr_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree);
-		break;
+		fork = &vhdr->attr_file;
+		tree = sbi->attr_tree;
+	default:
+		return -EIO;
+	}
+
+	if (fork->total_size != cpu_to_be64(inode->i_size)) {
+		set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
+		inode->i_sb->s_dirt = 1;
 	}
-	return ret;
+	hfsplus_inode_write_fork(inode, fork);
+	if (tree)
+		hfs_btree_write(tree);
+	return 0;
+}
+
+static int hfsplus_write_inode(struct inode *inode,
+		struct writeback_control *wbc)
+{
+	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+	hfsplus_ext_write_extent(inode);
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID)
+		return hfsplus_cat_write_inode(inode);
+	else
+		return hfsplus_system_write_inode(inode);
 }
 
 static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
 	if (HFSPLUS_IS_RSRC(inode)) {
-		HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
-		iput(HFSPLUS_I(inode).rsrc_inode);
+		HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+		iput(HFSPLUS_I(inode)->rsrc_inode);
 	}
 }
 
 int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
-	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
 
-	lock_super(sb);
+	mutex_lock(&sbi->vh_mutex);
+	mutex_lock(&sbi->alloc_mutex);
 	sb->s_dirt = 0;
 
-	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
-	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
-	vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
-	vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
-	vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+	vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+	vhdr->file_count = cpu_to_be32(sbi->file_count);
 
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
-		if (HFSPLUS_SB(sb).sect_count) {
+	mark_buffer_dirty(sbi->s_vhbh);
+	if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+		if (sbi->sect_count) {
 			struct buffer_head *bh;
 			u32 block, offset;
 
-			block = HFSPLUS_SB(sb).blockoffset;
-			block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
-			offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
-			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
-				HFSPLUS_SB(sb).sect_count, block, offset);
+			block = sbi->blockoffset;
+			block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+			offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+					  sbi->blockoffset, sbi->sect_count,
+					  block, offset);
 			bh = sb_bread(sb, block);
 			if (bh) {
 				vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
 				if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-					memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+					memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
 					mark_buffer_dirty(bh);
 					brelse(bh);
 				} else
 					printk(KERN_WARNING "hfs: backup not found!\n");
 			}
 		}
-		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
-	unlock_super(sb);
+	mutex_unlock(&sbi->alloc_mutex);
+	mutex_unlock(&sbi->vh_mutex);
 	return 0;
 }
 
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
 
 static void hfsplus_put_super(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
+
 	if (!sb->s_fs_info)
 		return;
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfsplus_write_super(sb);
-	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+		struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 		vhdr->modify_date = hfsp_now2mt();
 		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
 		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-		mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-		sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+		mark_buffer_dirty(sbi->s_vhbh);
+		sync_dirty_buffer(sbi->s_vhbh);
 	}
 
-	hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
-	hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
-	iput(HFSPLUS_SB(sb).alloc_file);
-	iput(HFSPLUS_SB(sb).hidden_dir);
-	brelse(HFSPLUS_SB(sb).s_vhbh);
-	unload_nls(HFSPLUS_SB(sb).nls);
+	hfs_btree_close(sbi->cat_tree);
+	hfs_btree_close(sbi->ext_tree);
+	iput(sbi->alloc_file);
+	iput(sbi->hidden_dir);
+	brelse(sbi->s_vhbh);
+	unload_nls(sbi->nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
-	buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+	buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+	buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
-	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (!(*flags & MS_RDONLY)) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
 		struct hfsplus_sb_info sbi;
 
 		memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-		sbi.nls = HFSPLUS_SB(sb).nls;
+		sbi.nls = HFSPLUS_SB(sb)->nls;
 		if (!hfsplus_parse_options(data, &sbi))
 			return -EINVAL;
 
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 			       "running fsck.hfsplus is recommended.  leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
-		} else if (sbi.flags & HFSPLUS_SB_FORCE) {
+		} else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
 			/* nothing */
 		} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 			printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+	mutex_init(&sbi->alloc_mutex);
+	mutex_init(&sbi->vh_mutex);
 	hfsplus_fill_defaults(sbi);
 	if (!hfsplus_parse_options(data, sbi)) {
 		printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = -EINVAL;
 		goto cleanup;
 	}
-	vhdr = HFSPLUS_SB(sb).s_vhdr;
+	vhdr = sbi->s_vhdr;
 
 	/* Copy parts of the volume header into the superblock */
 	sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "hfs: wrong filesystem version\n");
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
-	HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
-	HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
-	HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
-	HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
-	HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
-	HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).data_clump_blocks)
-		HFSPLUS_SB(sb).data_clump_blocks = 1;
-	HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
-		HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+	sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+	sbi->file_count = be32_to_cpu(vhdr->file_count);
+	sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+	sbi->data_clump_blocks =
+		be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->data_clump_blocks)
+		sbi->data_clump_blocks = 1;
+	sbi->rsrc_clump_blocks =
+		be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->rsrc_clump_blocks)
+		sbi->rsrc_clump_blocks = 1;
 
 	/* Set up operations so we can load metadata */
 	sb->s_op = &hfsplus_sops;
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
 		       "running fsck.hfsplus is recommended.  mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
-	} else if (sbi->flags & HFSPLUS_SB_FORCE) {
+	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
 		/* nothing */
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		       "use the force option at your own risk, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
-	sbi->flags &= ~HFSPLUS_SB_FORCE;
 
 	/* Load metadata objects (B*Trees) */
-	HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
-	if (!HFSPLUS_SB(sb).ext_tree) {
+	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+	if (!sbi->ext_tree) {
 		printk(KERN_ERR "hfs: failed to load extents file\n");
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
-	if (!HFSPLUS_SB(sb).cat_tree) {
+	sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+	if (!sbi->cat_tree) {
 		printk(KERN_ERR "hfs: failed to load catalog file\n");
 		goto cleanup;
 	}
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = PTR_ERR(inode);
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).alloc_file = inode;
+	sbi->alloc_file = inode;
 
 	/* Load the root directory */
 	root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
 	str.name = HFSP_HIDDENDIR_NAME;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(sbi->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
 	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
 		hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 			err = PTR_ERR(inode);
 			goto cleanup;
 		}
-		HFSPLUS_SB(sb).hidden_dir = inode;
+		sbi->hidden_dir = inode;
 	} else
 		hfs_find_exit(&fd);
 
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	be32_add_cpu(&vhdr->write_count, 1);
 	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
 	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+	mark_buffer_dirty(sbi->s_vhbh);
+	sync_dirty_buffer(sbi->s_vhbh);
 
-	if (!HFSPLUS_SB(sb).hidden_dir) {
+	if (!sbi->hidden_dir) {
 		printk(KERN_DEBUG "hfs: create hidden dir...\n");
-		HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
-				   &str, HFSPLUS_SB(sb).hidden_dir);
-		mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+
+		mutex_lock(&sbi->vh_mutex);
+		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+				   &str, sbi->hidden_dir);
+		mutex_unlock(&sbi->vh_mutex);
+
+		mark_inode_dirty(sbi->hidden_dir);
 	}
 out:
 	unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 
 static void hfsplus_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+	kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa40..b66d67de882 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
 int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
 {
 	const hfsplus_unichr *ip;
-	struct nls_table *nls = HFSPLUS_SB(sb).nls;
+	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
 	u8 *op;
 	u16 cc, c0, c1;
 	u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
 	ustrlen = be16_to_cpu(ustr->length);
 	len = *len_p;
 	ce1 = NULL;
-	compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 
 	while (ustrlen > 0) {
 		c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
 static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
 			      wchar_t *uc)
 {
-	int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+	int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
 	if (size <= 0) {
 		*uc = '?';
 		size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 	u16 *dstr, outlen = 0;
 	wchar_t c;
 
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
 		size = asc2unichar(sb, astr, len, &c);
 
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
 	wchar_t c;
 	u16 c2;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	hash = init_name_hash();
 	astr = str->name;
 	len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 	u16 c1, c2;
 	wchar_t c;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	astr1 = s1->name;
 	len1 = s1->len;
 	astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d..8972c20b321 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
 	*start = 0;
 	*size = sb->s_bdev->bd_inode->i_size >> 9;
 
-	if (HFSPLUS_SB(sb).session >= 0) {
-		te.cdte_track = HFSPLUS_SB(sb).session;
+	if (HFSPLUS_SB(sb)->session >= 0) {
+		te.cdte_track = HFSPLUS_SB(sb)->session;
 		te.cdte_format = CDROM_LBA;
 		res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
 		if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 /* Takes in super block, returns true if good data read */
 int hfsplus_read_wrapper(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct buffer_head *bh;
 	struct hfsplus_vh *vhdr;
 	struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
 			break;
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+			set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
 			break;
 		}
 		brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
 	if (blocksize < HFSPLUS_SECTOR_SIZE ||
 	    ((blocksize - 1) & blocksize))
 		return -EINVAL;
-	HFSPLUS_SB(sb).alloc_blksz = blocksize;
-	HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+	sbi->alloc_blksz = blocksize;
+	sbi->alloc_blksz_shift = 0;
 	while ((blocksize >>= 1) != 0)
-		HFSPLUS_SB(sb).alloc_blksz_shift++;
-	blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+		sbi->alloc_blksz_shift++;
+	blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
 
 	/* align block size to block offset */
 	while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		return -EINVAL;
 	}
 
-	HFSPLUS_SB(sb).blockoffset = part_start >>
-			(sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
-	HFSPLUS_SB(sb).sect_count = part_size;
-	HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
-			sb->s_blocksize_bits;
+	sbi->blockoffset =
+		part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+	sbi->sect_count = part_size;
+	sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
 
 	bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
 	if (!bh)
 		return -EIO;
 
 	/* should still be the same... */
-	if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
-		goto error;
-	HFSPLUS_SB(sb).s_vhbh = bh;
-	HFSPLUS_SB(sb).s_vhdr = vhdr;
+	if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+			goto error;
+	} else {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+			goto error;
+	}
+
+	sbi->s_vhbh = bh;
+	sbi->s_vhdr = vhdr;
 
 	return 0;
  error:
diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h
index 62f59080e5c..04d0a977cd4 100644
--- a/include/asm-generic/hardirq.h
+++ b/include/asm-generic/hardirq.h
@@ -3,13 +3,13 @@
 
 #include <linux/cache.h>
 #include <linux/threads.h>
-#include <linux/irq.h>
 
 typedef struct {
 	unsigned int __softirq_pending;
 } ____cacheline_aligned irq_cpustat_t;
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
+#include <linux/irq.h>
 
 #ifndef ack_bad_irq
 static inline void ack_bad_irq(unsigned int irq)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2bd73e8f9c..f4d4120e512 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -129,6 +129,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 #define move_pte(pte, prot, old_addr, new_addr)	(pte)
 #endif
 
+#ifndef flush_tlb_fix_spurious_fault
+#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
+#endif
+
 #ifndef pgprot_noncached
 #define pgprot_noncached(prot)	(prot)
 #endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8a92a170fb7..f4229fb315e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -220,6 +220,8 @@
 									\
 	BUG_TABLE							\
 									\
+	JUMP_TABLE							\
+									\
 	/* PCI quirks */						\
 	.pci_fixup        : AT(ADDR(.pci_fixup) - LOAD_OFFSET) {	\
 		VMLINUX_SYMBOL(__start_pci_fixups_early) = .;		\
@@ -563,6 +565,14 @@
 #define BUG_TABLE
 #endif
 
+#define JUMP_TABLE							\
+	. = ALIGN(8);							\
+	__jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) {		\
+		VMLINUX_SYMBOL(__start___jump_table) = .;		\
+		*(__jump_table)						\
+		VMLINUX_SYMBOL(__stop___jump_table) = .;		\
+	}
+
 #ifdef CONFIG_PM_TRACE
 #define TRACEDATA							\
 	. = ALIGN(4);							\
@@ -677,7 +687,9 @@
 				- LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
 		*(.data..percpu..first)					\
+		. = ALIGN(PAGE_SIZE);					\
 		*(.data..percpu..page_aligned)				\
+		*(.data..percpu..readmostly)				\
 		*(.data..percpu)					\
 		*(.data..percpu..shared_aligned)			\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
@@ -703,7 +715,9 @@
 		VMLINUX_SYMBOL(__per_cpu_load) = .;			\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
 		*(.data..percpu..first)					\
+		. = ALIGN(PAGE_SIZE);					\
 		*(.data..percpu..page_aligned)				\
+		*(.data..percpu..readmostly)				\
 		*(.data..percpu)					\
 		*(.data..percpu..shared_aligned)			\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h
index 7e3d2859be5..1d0ef1ae803 100644
--- a/include/linux/acpi_pmtmr.h
+++ b/include/linux/acpi_pmtmr.h
@@ -25,8 +25,6 @@ static inline u32 acpi_pm_read_early(void)
 	return acpi_pm_read_verified() & ACPI_PM_MASK;
 }
 
-extern void pmtimer_wait(unsigned);
-
 #else
 
 static inline u32 acpi_pm_read_early(void)
diff --git a/fs/ceph/auth.h b/include/linux/ceph/auth.h
index d38a2fb4a13..7fff521d7eb 100644
--- a/fs/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -1,8 +1,8 @@
 #ifndef _FS_CEPH_AUTH_H
 #define _FS_CEPH_AUTH_H
 
-#include "types.h"
-#include "buffer.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
 
 /*
  * Abstract interface for communicating with the authenticate module.
diff --git a/fs/ceph/buffer.h b/include/linux/ceph/buffer.h
index 58d19014068..58d19014068 100644
--- a/fs/ceph/buffer.h
+++ b/include/linux/ceph/buffer.h
diff --git a/fs/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
index 1818c230561..aa2e19182d9 100644
--- a/fs/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -3,7 +3,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
 
 /*
  * wrap pr_debug to include a filename:lineno prefix on each line.
@@ -14,7 +14,8 @@
 # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
 extern const char *ceph_file_part(const char *s, int len);
 #  define dout(fmt, ...)						\
-	pr_debug(" %12.12s:%-4d : " fmt,				\
+	pr_debug("%.*s %12.12s:%-4d : " fmt,				\
+		 8 - (int)sizeof(KBUILD_MODNAME), "    ",		\
 		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
 		 __LINE__, ##__VA_ARGS__)
 # else
diff --git a/fs/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index 5babb8e9535..5babb8e9535 100644
--- a/fs/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
diff --git a/fs/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index d5619ac8671..c3c74aef289 100644
--- a/fs/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -299,6 +299,7 @@ enum {
 	CEPH_MDS_OP_SETATTR    = 0x01108,
 	CEPH_MDS_OP_SETFILELOCK= 0x01109,
 	CEPH_MDS_OP_GETFILELOCK= 0x00110,
+	CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
 
 	CEPH_MDS_OP_MKNOD      = 0x01201,
 	CEPH_MDS_OP_LINK       = 0x01202,
diff --git a/fs/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h
index d099c3f9023..d099c3f9023 100644
--- a/fs/ceph/ceph_hash.h
+++ b/include/linux/ceph/ceph_hash.h
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
new file mode 100644
index 00000000000..2a79702e092
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,33 @@
+#ifndef _FS_CEPH_DEBUGFS_H
+#define _FS_CEPH_DEBUGFS_H
+
+#include "ceph_debug.h"
+#include "types.h"
+
+#define CEPH_DEFINE_SHOW_FUNC(name)					\
+static int name##_open(struct inode *inode, struct file *file)		\
+{									\
+	struct seq_file *sf;						\
+	int ret;							\
+									\
+	ret = single_open(file, name, NULL);				\
+	sf = file->private_data;					\
+	sf->private = inode->i_private;					\
+	return ret;							\
+}									\
+									\
+static const struct file_operations name##_fops = {			\
+	.open		= name##_open,					\
+	.read		= seq_read,					\
+	.llseek		= seq_lseek,					\
+	.release	= single_release,				\
+};
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+#endif
+
diff --git a/fs/ceph/decode.h b/include/linux/ceph/decode.h
index 3d25415afe6..c5b6939fb32 100644
--- a/fs/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end,
 		ceph_encode_need(p, end, n, bad);		\
 		ceph_encode_copy(p, pv, n);			\
 	} while (0)
+#define ceph_encode_string_safe(p, end, s, n, bad)		\
+	do {							\
+		ceph_encode_need(p, end, n, bad);		\
+		ceph_encode_string(p, end, s, n);		\
+	} while (0)
 
 
 #endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
new file mode 100644
index 00000000000..f22b2e94168
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,249 @@
+#ifndef _FS_CEPH_LIBCEPH_H
+#define _FS_CEPH_LIBCEPH_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/*
+ * Supported features
+ */
+#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID             (1<<0)
+#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
+#define CEPH_OPT_NOCRC            (1<<3) /* no data crc on writes */
+
+#define CEPH_OPT_DEFAULT   (0);
+
+#define ceph_set_opt(client, opt) \
+	(client)->options->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+	(!!((client)->options->flags & CEPH_OPT_##opt))
+
+struct ceph_options {
+	int flags;
+	struct ceph_fsid fsid;
+	struct ceph_entity_addr my_addr;
+	int mount_timeout;
+	int osd_idle_ttl;
+	int osd_timeout;
+	int osd_keepalive_timeout;
+
+	/*
+	 * any type that can't be simply compared or doesn't need need
+	 * to be compared should go beyond this point,
+	 * ceph_compare_options() should be updated accordingly
+	 */
+
+	struct ceph_entity_addr *mon_addr; /* should be the first
+					      pointer type of args */
+	int num_mon;
+	char *name;
+	char *secret;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT  5
+#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
+
+#define CEPH_AUTH_NAME_DEFAULT   "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file.  Delay a minimum amount of time, even if we send a cap
+ * message for some other reason.  Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
+
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
+
+/* mount state */
+enum {
+	CEPH_MOUNT_MOUNTING,
+	CEPH_MOUNT_MOUNTED,
+	CEPH_MOUNT_UNMOUNTING,
+	CEPH_MOUNT_UNMOUNTED,
+	CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+	BUG_ON(time_after(b, a));
+	return (long)a - (long)b;
+}
+
+struct ceph_mds_client;
+
+/*
+ * per client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+	struct ceph_fsid fsid;
+	bool have_fsid;
+
+	void *private;
+
+	struct ceph_options *options;
+
+	struct mutex mount_mutex;      /* serialize mount attempts */
+	wait_queue_head_t auth_wq;
+	int auth_err;
+
+	int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
+
+	u32 supported_features;
+	u32 required_features;
+
+	struct ceph_messenger *msgr;   /* messenger instance */
+	struct ceph_mon_client monc;
+	struct ceph_osd_client osdc;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_monmap;
+	struct dentry *debugfs_osdmap;
+#endif
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data.  It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+	atomic_t nref;
+	u64 seq;
+	int num_snaps;
+	u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+	/*
+	printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)+1);
+	*/
+	if (sc)
+		atomic_inc(&sc->nref);
+	return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+	if (!sc)
+		return;
+	/*
+	printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)-1);
+	*/
+	if (atomic_dec_and_test(&sc->nref)) {
+		/*printk(" deleting snap_context %p\n", sc);*/
+		kfree(sc);
+	}
+}
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+		(off >> PAGE_CACHE_SHIFT);
+}
+
+/* ceph_common.c */
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern int ceph_parse_options(struct ceph_options **popt, char *options,
+			      const char *dev_name, const char *dev_name_end,
+			      int (*parse_extra_token)(char *c, void *private),
+			      void *private);
+extern void ceph_destroy_options(struct ceph_options *opt);
+extern int ceph_compare_options(struct ceph_options *new_opt,
+				struct ceph_client *client);
+extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
+					      void *private);
+extern u64 ceph_client_id(struct ceph_client *client);
+extern void ceph_destroy_client(struct ceph_client *client);
+extern int __ceph_open_session(struct ceph_client *client,
+			       unsigned long started);
+extern int ceph_open_session(struct ceph_client *client);
+
+/* pagevec.c */
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+extern struct page **ceph_get_direct_page_vector(const char __user *data,
+					    int num_pages,
+					    loff_t off, size_t len);
+extern void ceph_put_page_vector(struct page **pages, int num_pages);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+extern int ceph_copy_user_to_page_vector(struct page **pages,
+					 const char __user *data,
+					 loff_t off, size_t len);
+extern int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len);
+extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
+				    loff_t off, size_t len);
+extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
+
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 4c5cb0880bb..4c5cb0880bb 100644
--- a/fs/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
diff --git a/fs/ceph/messenger.h b/include/linux/ceph/messenger.h
index 76fbc957bc1..5956d62c305 100644
--- a/fs/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -65,6 +65,9 @@ struct ceph_messenger {
 	 */
 	u32 global_seq;
 	spinlock_t global_seq_lock;
+
+	u32 supported_features;
+	u32 required_features;
 };
 
 /*
@@ -82,6 +85,10 @@ struct ceph_msg {
 	struct ceph_pagelist *pagelist; /* instead of pages */
 	struct list_head list_head;
 	struct kref kref;
+	struct bio  *bio;		/* instead of pages/pagelist */
+	struct bio  *bio_iter;		/* bio iterator */
+	int bio_seg;			/* current bio segment */
+	struct ceph_pagelist *trail;	/* the trailing part of the data */
 	bool front_is_vmalloc;
 	bool more_to_follow;
 	bool needs_out_seq;
@@ -205,7 +212,7 @@ struct ceph_connection {
 };
 
 
-extern const char *pr_addr(const struct sockaddr_storage *ss);
+extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
 extern int ceph_parse_ips(const char *c, const char *end,
 			  struct ceph_entity_addr *addr,
 			  int max_count, int *count);
@@ -216,7 +223,8 @@ extern void ceph_msgr_exit(void);
 extern void ceph_msgr_flush(void);
 
 extern struct ceph_messenger *ceph_messenger_create(
-	struct ceph_entity_addr *myaddr);
+	struct ceph_entity_addr *myaddr,
+	u32 features, u32 required);
 extern void ceph_messenger_destroy(struct ceph_messenger *);
 
 extern void ceph_con_init(struct ceph_messenger *msgr,
diff --git a/fs/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 8e396f2c096..545f8591778 100644
--- a/fs/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -79,6 +79,7 @@ struct ceph_mon_client {
 	u64 last_tid;
 
 	/* mds/osd map */
+	int want_mdsmap;
 	int want_next_osdmap; /* 1 = want, 2 = want+asked */
 	u32 have_osdmap, have_mdsmap;
 
diff --git a/fs/ceph/msgpool.h b/include/linux/ceph/msgpool.h
index a362605f936..a362605f936 100644
--- a/fs/ceph/msgpool.h
+++ b/include/linux/ceph/msgpool.h
diff --git a/fs/ceph/msgr.h b/include/linux/ceph/msgr.h
index 680d3d648ca..680d3d648ca 100644
--- a/fs/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
diff --git a/fs/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ce776989ef6..6c91fb032c3 100644
--- a/fs/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -15,6 +15,7 @@ struct ceph_snap_context;
 struct ceph_osd_request;
 struct ceph_osd_client;
 struct ceph_authorizer;
+struct ceph_pagelist;
 
 /*
  * completion callback for async writepages
@@ -68,6 +69,7 @@ struct ceph_osd_request {
 	struct list_head  r_unsafe_item;
 
 	struct inode *r_inode;         	      /* for use by callbacks */
+	void *r_priv;			      /* ditto */
 
 	char              r_oid[40];          /* object name */
 	int               r_oid_len;
@@ -80,6 +82,11 @@ struct ceph_osd_request {
 	struct page     **r_pages;            /* pages for data payload */
 	int               r_pages_from_pool;
 	int               r_own_pages;        /* if true, i own page list */
+#ifdef CONFIG_BLOCK
+	struct bio       *r_bio;	      /* instead of pages */
+#endif
+
+	struct ceph_pagelist *r_trail;	      /* trailing part of the data */
 };
 
 struct ceph_osd_client {
@@ -110,6 +117,42 @@ struct ceph_osd_client {
 	struct ceph_msgpool	msgpool_op_reply;
 };
 
+struct ceph_osd_req_op {
+	u16 op;           /* CEPH_OSD_OP_* */
+	u32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			u64 offset, length;
+			u64 truncate_size;
+			u32 truncate_seq;
+		} extent;
+		struct {
+			const char *name;
+			u32 name_len;
+			const char  *val;
+			u32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+		} xattr;
+		struct {
+			const char *class_name;
+			__u8 class_len;
+			const char *method_name;
+			__u8 method_len;
+			__u8 argc;
+			const char *indata;
+			u32 indata_len;
+		} cls;
+		struct {
+			u64 cookie, count;
+		} pgls;
+	        struct {
+		        u64 snapid;
+	        } snap;
+	};
+	u32 payload_len;
+};
+
 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
 			  struct ceph_client *client);
 extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
@@ -119,6 +162,30 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
+extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op);
+
+extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
+					       struct ceph_snap_context *snapc,
+					       struct ceph_osd_req_op *ops,
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages,
+					       struct bio *bio);
+
+extern void ceph_osdc_build_request(struct ceph_osd_request *req,
+				    u64 off, u64 *plen,
+				    struct ceph_osd_req_op *src_ops,
+				    struct ceph_snap_context *snapc,
+				    struct timespec *mtime,
+				    const char *oid,
+				    int oid_len);
+
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      struct ceph_file_layout *layout,
 				      struct ceph_vino vino,
diff --git a/fs/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 970b547e510..ba4c205cbb0 100644
--- a/fs/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -4,7 +4,7 @@
 #include <linux/rbtree.h>
 #include "types.h"
 #include "ceph_fs.h"
-#include "crush/crush.h"
+#include <linux/crush/crush.h>
 
 /*
  * The osd map describes the current membership of the osd cluster and
@@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
 				struct ceph_pg pgid);
 
+extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+
 #endif
diff --git a/fs/ceph/pagelist.h b/include/linux/ceph/pagelist.h
index e8a4187e108..9660d6b0a35 100644
--- a/fs/ceph/pagelist.h
+++ b/include/linux/ceph/pagelist.h
@@ -8,6 +8,14 @@ struct ceph_pagelist {
 	void *mapped_tail;
 	size_t length;
 	size_t room;
+	struct list_head free_list;
+	size_t num_pages_free;
+};
+
+struct ceph_pagelist_cursor {
+	struct ceph_pagelist *pl;   /* pagelist, for error checking */
+	struct list_head *page_lru; /* page in list */
+	size_t room;		    /* room remaining to reset to */
 };
 
 static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
@@ -16,10 +24,23 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
 	pl->mapped_tail = NULL;
 	pl->length = 0;
 	pl->room = 0;
+	INIT_LIST_HEAD(&pl->free_list);
+	pl->num_pages_free = 0;
 }
+
 extern int ceph_pagelist_release(struct ceph_pagelist *pl);
 
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
+
+extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
+
+extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
+
+extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+				     struct ceph_pagelist_cursor *c);
+
+extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+				  struct ceph_pagelist_cursor *c);
 
 static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
 {
diff --git a/fs/ceph/rados.h b/include/linux/ceph/rados.h
index 6d5247f2e81..6d5247f2e81 100644
--- a/fs/ceph/rados.h
+++ b/include/linux/ceph/rados.h
diff --git a/fs/ceph/types.h b/include/linux/ceph/types.h
index 28b35a005ec..28b35a005ec 100644
--- a/fs/ceph/types.h
+++ b/include/linux/ceph/types.h
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0c991023ee4..709dfb901d1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -75,7 +75,7 @@ struct cgroup_subsys_state {
 
 	unsigned long flags;
 	/* ID for this css, if possible */
-	struct css_id *id;
+	struct css_id __rcu *id;
 };
 
 /* bits in struct cgroup_subsys_state flags field */
@@ -205,7 +205,7 @@ struct cgroup {
 	struct list_head children;	/* my children */
 
 	struct cgroup *parent;		/* my parent */
-	struct dentry *dentry;	  	/* cgroup fs entry, RCU protected */
+	struct dentry __rcu *dentry;	/* cgroup fs entry, RCU protected */
 
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c1a62c56a66..320d6c94ff8 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -16,7 +16,11 @@
 # define __release(x)	__context__(x,-1)
 # define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 # define __percpu	__attribute__((noderef, address_space(3)))
+#ifdef CONFIG_SPARSE_RCU_POINTER
+# define __rcu		__attribute__((noderef, address_space(4)))
+#else
 # define __rcu
+#endif
 extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
 #else
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4d2c39573f3..4aaeab37644 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,7 +84,7 @@ struct thread_group_cred {
 	atomic_t	usage;
 	pid_t		tgid;			/* thread group process ID */
 	spinlock_t	lock;
-	struct key	*session_keyring;	/* keyring inherited over fork */
+	struct key __rcu *session_keyring;	/* keyring inherited over fork */
 	struct key	*process_keyring;	/* keyring private to this process */
 	struct rcu_head	rcu;			/* RCU deletion hook */
 };
diff --git a/fs/ceph/crush/crush.h b/include/linux/crush/crush.h
index 97e435b191f..97e435b191f 100644
--- a/fs/ceph/crush/crush.h
+++ b/include/linux/crush/crush.h
diff --git a/fs/ceph/crush/hash.h b/include/linux/crush/hash.h
index 91e884230d5..91e884230d5 100644
--- a/fs/ceph/crush/hash.h
+++ b/include/linux/crush/hash.h
diff --git a/fs/ceph/crush/mapper.h b/include/linux/crush/mapper.h
index c46b99c18bb..c46b99c18bb 100644
--- a/fs/ceph/crush/mapper.h
+++ b/include/linux/crush/mapper.h
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 29b3ce3f2a1..2833452ea01 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -49,7 +49,6 @@ struct task_struct;
 
 #ifdef CONFIG_LOCKDEP
 extern void debug_show_all_locks(void);
-extern void __debug_show_held_locks(struct task_struct *task);
 extern void debug_show_held_locks(struct task_struct *task);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
 extern void debug_check_no_locks_held(struct task_struct *task);
@@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void)
 {
 }
 
-static inline void __debug_show_held_locks(struct task_struct *task)
-{
-}
-
 static inline void debug_show_held_locks(struct task_struct *task)
 {
 }
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 52c0da4bdd1..bef3cda44c4 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -1,6 +1,8 @@
 #ifndef _DYNAMIC_DEBUG_H
 #define _DYNAMIC_DEBUG_H
 
+#include <linux/jump_label.h>
+
 /* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which
  * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
  * use independent hash functions, to reduce the chance of false positives.
@@ -22,8 +24,6 @@ struct _ddebug {
 	const char *function;
 	const char *filename;
 	const char *format;
-	char primary_hash;
-	char secondary_hash;
 	unsigned int lineno:24;
 	/*
  	 * The flags field controls the behaviour at the callsite.
@@ -33,6 +33,7 @@ struct _ddebug {
 #define _DPRINTK_FLAGS_PRINT   (1<<0)  /* printk() a message using the format */
 #define _DPRINTK_FLAGS_DEFAULT 0
 	unsigned int flags:8;
+	char enabled;
 } __attribute__((aligned(8)));
 
 
@@ -42,33 +43,35 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
 #if defined(CONFIG_DYNAMIC_DEBUG)
 extern int ddebug_remove_module(const char *mod_name);
 
-#define __dynamic_dbg_enabled(dd)  ({	     \
-	int __ret = 0;							     \
-	if (unlikely((dynamic_debug_enabled & (1LL << DEBUG_HASH)) &&	     \
-			(dynamic_debug_enabled2 & (1LL << DEBUG_HASH2))))   \
-				if (unlikely(dd.flags))			     \
-					__ret = 1;			     \
-	__ret; })
-
 #define dynamic_pr_debug(fmt, ...) do {					\
+	__label__ do_printk;						\
+	__label__ out;							\
 	static struct _ddebug descriptor				\
 	__used								\
 	__attribute__((section("__verbose"), aligned(8))) =		\
-	{ KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH,	\
-		DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT };	\
-	if (__dynamic_dbg_enabled(descriptor))				\
-		printk(KERN_DEBUG pr_fmt(fmt),	##__VA_ARGS__);		\
+	{ KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__,		\
+		_DPRINTK_FLAGS_DEFAULT };				\
+	JUMP_LABEL(&descriptor.enabled, do_printk);			\
+	goto out;							\
+do_printk:								\
+	printk(KERN_DEBUG pr_fmt(fmt),	##__VA_ARGS__);			\
+out:	;								\
 	} while (0)
 
 
 #define dynamic_dev_dbg(dev, fmt, ...) do {				\
+	__label__ do_printk;						\
+	__label__ out;							\
 	static struct _ddebug descriptor				\
 	__used								\
 	__attribute__((section("__verbose"), aligned(8))) =		\
-	{ KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH,	\
-		DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT };	\
-	if (__dynamic_dbg_enabled(descriptor))				\
-		dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);	\
+	{ KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__,		\
+		_DPRINTK_FLAGS_DEFAULT };				\
+	JUMP_LABEL(&descriptor.enabled, do_printk);			\
+	goto out;							\
+do_printk:								\
+	dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);		\
+out:	;								\
 	} while (0)
 
 #else
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f59ed297b66..133c0ba25e3 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,7 +31,7 @@ struct embedded_fd_set {
 
 struct fdtable {
 	unsigned int max_fds;
-	struct file ** fd;      /* current fd array */
+	struct file __rcu **fd;      /* current fd array */
 	fd_set *close_on_exec;
 	fd_set *open_fds;
 	struct rcu_head rcu;
@@ -46,7 +46,7 @@ struct files_struct {
    * read mostly part
    */
 	atomic_t count;
-	struct fdtable *fdt;
+	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
   /*
    * written part on a separate cache line in SMP
@@ -55,7 +55,7 @@ struct files_struct {
 	int next_fd;
 	struct embedded_fd_set close_on_exec_init;
 	struct embedded_fd_set open_fds_init;
-	struct file * fd_array[NR_OPEN_DEFAULT];
+	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
 };
 
 #define rcu_dereference_check_fdtable(files, fdtfd) \
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069bd80b..3168dcfb94f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1384,7 +1384,7 @@ struct super_block {
 	 * Saved mount options for lazy filesystems using
 	 * generic_show_options()
 	 */
-	char *s_options;
+	char __rcu *s_options;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 02b8b24f8f5..8beabb958f6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -191,8 +191,8 @@ struct ftrace_event_call {
 	unsigned int		flags;
 
 #ifdef CONFIG_PERF_EVENTS
-	int			perf_refcount;
-	struct hlist_head	*perf_events;
+	int				perf_refcount;
+	struct hlist_head __percpu	*perf_events;
 #endif
 };
 
@@ -252,8 +252,8 @@ DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
-extern int  perf_trace_enable(struct perf_event *event);
-extern void perf_trace_disable(struct perf_event *event);
+extern int  perf_trace_add(struct perf_event *event, int flags);
+extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5f2f4c4d8fb..af3f06b41dc 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter {
 struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
-	struct hd_struct *last_lookup;
-	struct hd_struct *part[];
+	struct hd_struct __rcu *last_lookup;
+	struct hd_struct __rcu *part[];
 };
 
 struct gendisk {
@@ -149,7 +149,7 @@ struct gendisk {
 	 * non-critical accesses use RCU.  Always access through
 	 * helpers.
 	 */
-	struct disk_part_tbl *part_tbl;
+	struct disk_part_tbl __rcu *part_tbl;
 	struct hd_struct part0;
 
 	const struct block_device_operations *fops;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d5b387669da..96c323ac44d 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,8 @@
 #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
 #define NMI_OFFSET	(1UL << NMI_SHIFT)
 
+#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)
+
 #ifndef PREEMPT_ACTIVE
 #define PREEMPT_ACTIVE_BITS	1
 #define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)
@@ -82,10 +84,13 @@
 /*
  * Are we doing bottom half or hardware interrupt processing?
  * Are we in a softirq context? Interrupt context?
+ * in_softirq - Are we currently processing softirq or have bh disabled?
+ * in_serving_softirq - Are we currently processing softirq?
  */
 #define in_irq()		(hardirq_count())
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
+#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
 
 /*
  * Are we in NMI context?
@@ -132,14 +137,16 @@ extern void synchronize_irq(unsigned int irq);
 
 struct task_struct;
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
 static inline void account_system_vtime(struct task_struct *tsk)
 {
 }
+#else
+extern void account_system_vtime(struct task_struct *tsk);
 #endif
 
 #if defined(CONFIG_NO_HZ)
-#if defined(CONFIG_TINY_RCU)
+#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
 extern void rcu_enter_nohz(void);
 extern void rcu_exit_nohz(void);
 
diff --git a/include/linux/idr.h b/include/linux/idr.h
index e968db71e33..cdb715e58e3 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -50,14 +50,14 @@
 
 struct idr_layer {
 	unsigned long		 bitmap; /* A zero bit means "space here" */
-	struct idr_layer	*ary[1<<IDR_BITS];
+	struct idr_layer __rcu	*ary[1<<IDR_BITS];
 	int			 count;	 /* When zero, we can release it */
 	int			 layer;	 /* distance from leaf */
 	struct rcu_head		 rcu_head;
 };
 
 struct idr {
-	struct idr_layer *top;
+	struct idr_layer __rcu *top;
 	struct idr_layer *id_free;
 	int		  layers; /* only valid without concurrent changes */
 	int		  id_free_cnt;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f43fa56f60..2fea6c8ef6b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -82,11 +82,17 @@ extern struct group_info init_groups;
 # define CAP_INIT_BSET  CAP_FULL_SET
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
+#define INIT_TASK_RCU_TREE_PREEMPT()					\
+	.rcu_blocked_node = NULL,
+#else
+#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
 #define INIT_TASK_RCU_PREEMPT(tsk)					\
 	.rcu_read_lock_nesting = 0,					\
 	.rcu_read_unlock_special = 0,					\
-	.rcu_blocked_node = NULL,					\
-	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
+	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),		\
+	INIT_TASK_RCU_TREE_PREEMPT()
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
@@ -137,8 +143,8 @@ extern struct cred init_cred;
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
 	.group_leader	= &tsk,						\
-	.real_cred	= &init_cred,					\
-	.cred		= &init_cred,					\
+	RCU_INIT_POINTER(.real_cred, &init_cred),			\
+	RCU_INIT_POINTER(.cred, &init_cred),				\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(tsk.cred_guard_mutex),		\
 	.comm		= "swapper",					\
diff --git a/include/linux/input.h b/include/linux/input.h
index 896a92227bc..d6ae1761be9 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1196,7 +1196,7 @@ struct input_dev {
 	int (*flush)(struct input_dev *dev, struct file *file);
 	int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value);
 
-	struct input_handle *grab;
+	struct input_handle __rcu *grab;
 
 	spinlock_t event_lock;
 	struct mutex mutex;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a0384a4d1e6..531495db170 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
+#include <trace/events/irq.h>
 
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
@@ -407,7 +408,12 @@ asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
+static inline void __raise_softirq_irqoff(unsigned int nr)
+{
+	trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL);
+	or_softirq_pending(1UL << nr);
+}
+
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 extern void wakeup_softirqd(void);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 64d52913303..3e70b21884a 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -53,7 +53,7 @@ struct io_context {
 
 	struct radix_tree_root radix_root;
 	struct hlist_head cic_list;
-	void *ioc_data;
+	void __rcu *ioc_data;
 };
 
 static inline struct io_context *ioc_task_link(struct io_context *ioc)
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
new file mode 100644
index 00000000000..4fa09d4d0b7
--- /dev/null
+++ b/include/linux/irq_work.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_IRQ_WORK_H
+#define _LINUX_IRQ_WORK_H
+
+struct irq_work {
+	struct irq_work *next;
+	void (*func)(struct irq_work *);
+};
+
+static inline
+void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+{
+	entry->next = NULL;
+	entry->func = func;
+}
+
+bool irq_work_queue(struct irq_work *entry);
+void irq_work_run(void);
+void irq_work_sync(struct irq_work *entry);
+
+#endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
new file mode 100644
index 00000000000..b67cb180e6e
--- /dev/null
+++ b/include/linux/jump_label.h
@@ -0,0 +1,74 @@
+#ifndef _LINUX_JUMP_LABEL_H
+#define _LINUX_JUMP_LABEL_H
+
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_HAVE_ARCH_JUMP_LABEL)
+# include <asm/jump_label.h>
+# define HAVE_JUMP_LABEL
+#endif
+
+enum jump_label_type {
+	JUMP_LABEL_ENABLE,
+	JUMP_LABEL_DISABLE
+};
+
+struct module;
+
+#ifdef HAVE_JUMP_LABEL
+
+extern struct jump_entry __start___jump_table[];
+extern struct jump_entry __stop___jump_table[];
+
+extern void arch_jump_label_transform(struct jump_entry *entry,
+				 enum jump_label_type type);
+extern void arch_jump_label_text_poke_early(jump_label_t addr);
+extern void jump_label_update(unsigned long key, enum jump_label_type type);
+extern void jump_label_apply_nops(struct module *mod);
+extern int jump_label_text_reserved(void *start, void *end);
+
+#define jump_label_enable(key) \
+	jump_label_update((unsigned long)key, JUMP_LABEL_ENABLE);
+
+#define jump_label_disable(key) \
+	jump_label_update((unsigned long)key, JUMP_LABEL_DISABLE);
+
+#else
+
+#define JUMP_LABEL(key, label)			\
+do {						\
+	if (unlikely(*key))			\
+		goto label;			\
+} while (0)
+
+#define jump_label_enable(cond_var)	\
+do {					\
+       *(cond_var) = 1;			\
+} while (0)
+
+#define jump_label_disable(cond_var)	\
+do {					\
+       *(cond_var) = 0;			\
+} while (0)
+
+static inline int jump_label_apply_nops(struct module *mod)
+{
+	return 0;
+}
+
+static inline int jump_label_text_reserved(void *start, void *end)
+{
+	return 0;
+}
+
+#endif
+
+#define COND_STMT(key, stmt)					\
+do {								\
+	__label__ jl_enabled;					\
+	JUMP_LABEL(key, jl_enabled);				\
+	if (0) {						\
+jl_enabled:							\
+		stmt;						\
+	}							\
+} while (0)
+
+#endif
diff --git a/include/linux/jump_label_ref.h b/include/linux/jump_label_ref.h
new file mode 100644
index 00000000000..e5d012ad92c
--- /dev/null
+++ b/include/linux/jump_label_ref.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_JUMP_LABEL_REF_H
+#define _LINUX_JUMP_LABEL_REF_H
+
+#include <linux/jump_label.h>
+#include <asm/atomic.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+static inline void jump_label_inc(atomic_t *key)
+{
+	if (atomic_add_return(1, key) == 1)
+		jump_label_enable(key);
+}
+
+static inline void jump_label_dec(atomic_t *key)
+{
+	if (atomic_dec_and_test(key))
+		jump_label_disable(key);
+}
+
+#else /* !HAVE_JUMP_LABEL */
+
+static inline void jump_label_inc(atomic_t *key)
+{
+	atomic_inc(key);
+}
+
+static inline void jump_label_dec(atomic_t *key)
+{
+	atomic_dec(key);
+}
+
+#undef JUMP_LABEL
+#define JUMP_LABEL(key, label)						\
+do {									\
+	if (unlikely(__builtin_choose_expr(				\
+	      __builtin_types_compatible_p(typeof(key), atomic_t *),	\
+	      atomic_read((atomic_t *)(key)), *(key))))			\
+		goto label;						\
+} while (0)
+
+#endif /* HAVE_JUMP_LABEL */
+
+#endif /* _LINUX_JUMP_LABEL_REF_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b0a35e6bc6..1759ba5adce 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,7 +58,18 @@ extern const char linux_proc_banner[];
 
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
 #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#define roundup(x, y) (					\
+{							\
+	typeof(y) __y = y;				\
+	(((x) + (__y - 1)) / __y) * __y;		\
+}							\
+)
+#define rounddown(x, y) (				\
+{							\
+	typeof(x) __x = (x);				\
+	__x - (__x % (y));				\
+}							\
+)
 #define DIV_ROUND_CLOSEST(x, divisor)(			\
 {							\
 	typeof(divisor) __divisor = divisor;		\
diff --git a/include/linux/key.h b/include/linux/key.h
index cd50dfa1d4c..3db0adce1fd 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -178,8 +178,9 @@ struct key {
 	 */
 	union {
 		unsigned long		value;
+		void __rcu		*rcudata;
 		void			*data;
-		struct keyring_list	*subscriptions;
+		struct keyring_list __rcu *subscriptions;
 	} payload;
 };
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c13cc48697a..ac740b26eb1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,7 +205,7 @@ struct kvm {
 
 	struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-	struct kvm_irq_routing_table *irq_routing;
+	struct kvm_irq_routing_table __rcu *irq_routing;
 	struct hlist_head mask_notifier_list;
 	struct hlist_head irq_ack_notifier_list;
 #endif
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 06aed8305bf..2186a64ee4b 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -32,6 +32,17 @@ extern int lock_stat;
 #define MAX_LOCKDEP_SUBCLASSES		8UL
 
 /*
+ * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
+ * cached in the instance of lockdep_map
+ *
+ * Currently main class (subclass == 0) and signle depth subclass
+ * are cached in lockdep_map. This optimization is mainly targeting
+ * on rq->lock. double_rq_lock() acquires this highly competitive with
+ * single depth.
+ */
+#define NR_LOCKDEP_CACHING_CLASSES	2
+
+/*
  * Lock-classes are keyed via unique addresses, by embedding the
  * lockclass-key into the kernel (or module) .data section. (For
  * static locks we use the lock address itself as the key.)
@@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class);
  */
 struct lockdep_map {
 	struct lock_class_key		*key;
-	struct lock_class		*class_cache;
+	struct lock_class		*class_cache[NR_LOCKDEP_CACHING_CLASSES];
 	const char			*name;
 #ifdef CONFIG_LOCK_STAT
 	int				cpu;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ee7e258627f..cb57d657ce4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -299,7 +299,7 @@ struct mm_struct {
 	 * new_owner->mm == mm
 	 * new_owner->alloc_lock is held
 	 */
-	struct task_struct *owner;
+	struct task_struct __rcu *owner;
 #endif
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/linux/module.h b/include/linux/module.h
index aace066bad8..b29e7458b96 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -350,7 +350,10 @@ struct module
 	struct tracepoint *tracepoints;
 	unsigned int num_tracepoints;
 #endif
-
+#ifdef HAVE_JUMP_LABEL
+	struct jump_entry *jump_entries;
+	unsigned int num_jump_entries;
+#endif
 #ifdef CONFIG_TRACING
 	const char **trace_bprintk_fmt_start;
 	unsigned int num_trace_bprintk_fmt;
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 9ed534c991b..70cd0603911 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -39,8 +39,9 @@ enum ctattr_type {
 	CTA_TUPLE_MASTER,
 	CTA_NAT_SEQ_ADJ_ORIG,
 	CTA_NAT_SEQ_ADJ_REPLY,
-	CTA_SECMARK,
+	CTA_SECMARK,		/* obsolete */
 	CTA_ZONE,
+	CTA_SECCTX,
 	__CTA_MAX
 };
 #define CTA_MAX (__CTA_MAX - 1)
@@ -172,4 +173,11 @@ enum ctattr_help {
 };
 #define CTA_HELP_MAX (__CTA_HELP_MAX - 1)
 
+enum ctattr_secctx {
+	CTA_SECCTX_UNSPEC,
+	CTA_SECCTX_NAME,
+	__CTA_SECCTX_MAX
+};
+#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1)
+
 #endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h
index 6fcd3448b18..989092bd627 100644
--- a/include/linux/netfilter/xt_SECMARK.h
+++ b/include/linux/netfilter/xt_SECMARK.h
@@ -11,18 +11,12 @@
  * packets are being marked for.
  */
 #define SECMARK_MODE_SEL	0x01		/* SELinux */
-#define SECMARK_SELCTX_MAX	256
-
-struct xt_secmark_target_selinux_info {
-	__u32 selsid;
-	char selctx[SECMARK_SELCTX_MAX];
-};
+#define SECMARK_SECCTX_MAX	256
 
 struct xt_secmark_target_info {
 	__u8 mode;
-	union {
-		struct xt_secmark_target_selinux_info sel;
-	} u;
+	__u32 secid;
+	char secctx[SECMARK_SECCTX_MAX];
 };
 
 #endif /*_XT_SECMARK_H_target */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 508f8cf6da3..d0edf7d823a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -185,7 +185,7 @@ struct nfs_inode {
 	struct nfs4_cached_acl	*nfs4_acl;
         /* NFSv4 state */
 	struct list_head	open_states;
-	struct nfs_delegation	*delegation;
+	struct nfs_delegation __rcu *delegation;
 	fmode_t			 delegation_state;
 	struct rw_semaphore	rwsem;
 #endif /* CONFIG_NFS_V4*/
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index b2f1a4d8355..2026f9e1ceb 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -49,28 +49,28 @@
 
 struct notifier_block {
 	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
-	struct notifier_block *next;
+	struct notifier_block __rcu *next;
 	int priority;
 };
 
 struct atomic_notifier_head {
 	spinlock_t lock;
-	struct notifier_block *head;
+	struct notifier_block __rcu *head;
 };
 
 struct blocking_notifier_head {
 	struct rw_semaphore rwsem;
-	struct notifier_block *head;
+	struct notifier_block __rcu *head;
 };
 
 struct raw_notifier_head {
-	struct notifier_block *head;
+	struct notifier_block __rcu *head;
 };
 
 struct srcu_notifier_head {
 	struct mutex mutex;
 	struct srcu_struct srcu;
-	struct notifier_block *head;
+	struct notifier_block __rcu *head;
 };
 
 #define ATOMIC_INIT_NOTIFIER_HEAD(name) do {	\
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 5171639ecf0..32fb81212fd 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -15,6 +15,7 @@
 
 #include <linux/types.h>
 #include <linux/spinlock.h>
+#include <linux/init.h>
 #include <asm/atomic.h>
  
 /* Each escaped entry is prefixed by ESCAPE_CODE
@@ -185,4 +186,10 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val);
 int oprofile_add_data64(struct op_entry *entry, u64 val);
 int oprofile_write_commit(struct op_entry *entry);
 
+#ifdef CONFIG_PERF_EVENTS
+int __init oprofile_perf_init(struct oprofile_operations *ops);
+void oprofile_perf_exit(void);
+char *op_name_from_perf_id(void);
+#endif /* CONFIG_PERF_EVENTS */
+
 #endif /* OPROFILE_H */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 570fddeb038..2615c37c8fe 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -517,6 +517,7 @@
 #define PCI_DEVICE_ID_AMD_11H_NB_DRAM	0x1302
 #define PCI_DEVICE_ID_AMD_11H_NB_MISC	0x1303
 #define PCI_DEVICE_ID_AMD_11H_NB_LINK	0x1304
+#define PCI_DEVICE_ID_AMD_15H_NB_MISC	0x1603
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
 #define PCI_DEVICE_ID_AMD_SCSI		0x2020
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index ce2dc655cd1..27ef6b190ea 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -139,6 +139,15 @@
 	__aligned(PAGE_SIZE)
 
 /*
+ * Declaration/definition used for per-CPU variables that must be read mostly.
+ */
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)			\
+	DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
+
+#define DEFINE_PER_CPU_READ_MOSTLY(type, name)				\
+	DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
+
+/*
  * Intermodule exports for per-CPU variables.  sparse forgets about
  * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to
  * noop if __CHECKER__.
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 49466b13c5c..0eb50832aa0 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -39,6 +39,15 @@
 	preempt_enable();				\
 } while (0)
 
+#define get_cpu_ptr(var) ({				\
+	preempt_disable();				\
+	this_cpu_ptr(var); })
+
+#define put_cpu_ptr(var) do {				\
+	(void)(var);					\
+	preempt_enable();				\
+} while (0)
+
 #ifdef CONFIG_SMP
 
 /* minimum unit size, also is the maximum supported allocation size */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 716f99b682c..057bf22a832 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -486,6 +486,8 @@ struct perf_guest_info_callbacks {
 #include <linux/workqueue.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/irq_work.h>
+#include <linux/jump_label_ref.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
 
@@ -529,16 +531,22 @@ struct hw_perf_event {
 			int		last_cpu;
 		};
 		struct { /* software */
-			s64		remaining;
 			struct hrtimer	hrtimer;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			struct arch_hw_breakpoint	info;
 			struct list_head		bp_list;
+			/*
+			 * Crufty hack to avoid the chicken and egg
+			 * problem hw_breakpoint has with context
+			 * creation and event initalization.
+			 */
+			struct task_struct		*bp_target;
 		};
 #endif
 	};
+	int				state;
 	local64_t			prev_count;
 	u64				sample_period;
 	u64				last_period;
@@ -550,6 +558,13 @@ struct hw_perf_event {
 #endif
 };
 
+/*
+ * hw_perf_event::state flags
+ */
+#define PERF_HES_STOPPED	0x01 /* the counter is stopped */
+#define PERF_HES_UPTODATE	0x02 /* event->count up-to-date */
+#define PERF_HES_ARCH		0x04
+
 struct perf_event;
 
 /*
@@ -561,36 +576,70 @@ struct perf_event;
  * struct pmu - generic performance monitoring unit
  */
 struct pmu {
-	int (*enable)			(struct perf_event *event);
-	void (*disable)			(struct perf_event *event);
-	int (*start)			(struct perf_event *event);
-	void (*stop)			(struct perf_event *event);
-	void (*read)			(struct perf_event *event);
-	void (*unthrottle)		(struct perf_event *event);
+	struct list_head		entry;
+
+	int * __percpu			pmu_disable_count;
+	struct perf_cpu_context * __percpu pmu_cpu_context;
+	int				task_ctx_nr;
+
+	/*
+	 * Fully disable/enable this PMU, can be used to protect from the PMI
+	 * as well as for lazy/batch writing of the MSRs.
+	 */
+	void (*pmu_enable)		(struct pmu *pmu); /* optional */
+	void (*pmu_disable)		(struct pmu *pmu); /* optional */
 
 	/*
-	 * Group events scheduling is treated as a transaction, add group
-	 * events as a whole and perform one schedulability test. If the test
-	 * fails, roll back the whole group
+	 * Try and initialize the event for this PMU.
+	 * Should return -ENOENT when the @event doesn't match this PMU.
 	 */
+	int (*event_init)		(struct perf_event *event);
+
+#define PERF_EF_START	0x01		/* start the counter when adding    */
+#define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
+#define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
 
 	/*
-	 * Start the transaction, after this ->enable() doesn't need
-	 * to do schedulability tests.
+	 * Adds/Removes a counter to/from the PMU, can be done inside
+	 * a transaction, see the ->*_txn() methods.
 	 */
-	void (*start_txn)	(const struct pmu *pmu);
+	int  (*add)			(struct perf_event *event, int flags);
+	void (*del)			(struct perf_event *event, int flags);
+
 	/*
-	 * If ->start_txn() disabled the ->enable() schedulability test
+	 * Starts/Stops a counter present on the PMU. The PMI handler
+	 * should stop the counter when perf_event_overflow() returns
+	 * !0. ->start() will be used to continue.
+	 */
+	void (*start)			(struct perf_event *event, int flags);
+	void (*stop)			(struct perf_event *event, int flags);
+
+	/*
+	 * Updates the counter value of the event.
+	 */
+	void (*read)			(struct perf_event *event);
+
+	/*
+	 * Group events scheduling is treated as a transaction, add
+	 * group events as a whole and perform one schedulability test.
+	 * If the test fails, roll back the whole group
+	 *
+	 * Start the transaction, after this ->add() doesn't need to
+	 * do schedulability tests.
+	 */
+	void (*start_txn)	(struct pmu *pmu); /* optional */
+	/*
+	 * If ->start_txn() disabled the ->add() schedulability test
 	 * then ->commit_txn() is required to perform one. On success
 	 * the transaction is closed. On error the transaction is kept
 	 * open until ->cancel_txn() is called.
 	 */
-	int  (*commit_txn)	(const struct pmu *pmu);
+	int  (*commit_txn)	(struct pmu *pmu); /* optional */
 	/*
-	 * Will cancel the transaction, assumes ->disable() is called for
-	 * each successfull ->enable() during the transaction.
+	 * Will cancel the transaction, assumes ->del() is called
+	 * for each successfull ->add() during the transaction.
 	 */
-	void (*cancel_txn)	(const struct pmu *pmu);
+	void (*cancel_txn)	(struct pmu *pmu); /* optional */
 };
 
 /**
@@ -631,11 +680,6 @@ struct perf_buffer {
 	void				*data_pages[0];
 };
 
-struct perf_pending_entry {
-	struct perf_pending_entry *next;
-	void (*func)(struct perf_pending_entry *);
-};
-
 struct perf_sample_data;
 
 typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -656,6 +700,7 @@ struct swevent_hlist {
 
 #define PERF_ATTACH_CONTEXT	0x01
 #define PERF_ATTACH_GROUP	0x02
+#define PERF_ATTACH_TASK	0x04
 
 /**
  * struct perf_event - performance event kernel representation:
@@ -669,7 +714,7 @@ struct perf_event {
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
-	const struct pmu		*pmu;
+	struct pmu			*pmu;
 
 	enum perf_event_active_state	state;
 	unsigned int			attach_state;
@@ -743,7 +788,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
-	struct perf_pending_entry	pending;
+	struct irq_work			pending;
 
 	atomic_t			event_limit;
 
@@ -763,12 +808,19 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
+enum perf_event_context_type {
+	task_context,
+	cpu_context,
+};
+
 /**
  * struct perf_event_context - event context structure
  *
  * Used as a container for task events and CPU events as well:
  */
 struct perf_event_context {
+	enum perf_event_context_type	type;
+	struct pmu			*pmu;
 	/*
 	 * Protect the states of the events in the list,
 	 * nr_active, and the list:
@@ -808,6 +860,12 @@ struct perf_event_context {
 	struct rcu_head			rcu_head;
 };
 
+/*
+ * Number of contexts where an event can trigger:
+ * 	task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS	4
+
 /**
  * struct perf_event_cpu_context - per cpu event context structure
  */
@@ -815,18 +873,9 @@ struct perf_cpu_context {
 	struct perf_event_context	ctx;
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
-	int				max_pertask;
 	int				exclusive;
-	struct swevent_hlist		*swevent_hlist;
-	struct mutex			hlist_mutex;
-	int				hlist_refcount;
-
-	/*
-	 * Recursion avoidance:
-	 *
-	 * task, softirq, irq, nmi context
-	 */
-	int				recursion[4];
+	struct list_head		rotation_list;
+	int				jiffies_interval;
 };
 
 struct perf_output_handle {
@@ -842,26 +891,34 @@ struct perf_output_handle {
 
 #ifdef CONFIG_PERF_EVENTS
 
-/*
- * Set by architecture code:
- */
-extern int perf_max_events;
+extern int perf_pmu_register(struct pmu *pmu);
+extern void perf_pmu_unregister(struct pmu *pmu);
+
+extern int perf_num_counters(void);
+extern const char *perf_pmu_name(void);
+extern void __perf_event_task_sched_in(struct task_struct *task);
+extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
 
-extern const struct pmu *hw_perf_event_init(struct perf_event *event);
+extern atomic_t perf_task_events;
+
+static inline void perf_event_task_sched_in(struct task_struct *task)
+{
+	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
+}
+
+static inline
+void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
+{
+	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
+}
 
-extern void perf_event_task_sched_in(struct task_struct *task);
-extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
-extern void perf_event_task_tick(struct task_struct *task);
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
+extern void perf_event_delayed_put(struct task_struct *task);
 extern void perf_event_print_debug(void);
-extern void __perf_disable(void);
-extern bool __perf_enable(void);
-extern void perf_disable(void);
-extern void perf_enable(void);
+extern void perf_pmu_disable(struct pmu *pmu);
+extern void perf_pmu_enable(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 extern void perf_event_update_userpage(struct perf_event *event);
@@ -869,7 +926,7 @@ extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
-				pid_t pid,
+				struct task_struct *task,
 				perf_overflow_handler_t callback);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
@@ -920,14 +977,7 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
  */
 static inline int is_software_event(struct perf_event *event)
 {
-	switch (event->attr.type) {
-	case PERF_TYPE_SOFTWARE:
-	case PERF_TYPE_TRACEPOINT:
-	/* for now the breakpoint stuff also works as software event */
-	case PERF_TYPE_BREAKPOINT:
-		return 1;
-	}
-	return 0;
+	return event->pmu->task_ctx_nr == perf_sw_context;
 }
 
 extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -954,18 +1004,20 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
-static inline void
+static __always_inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 {
-	if (atomic_read(&perf_swevent_enabled[event_id])) {
-		struct pt_regs hot_regs;
-
-		if (!regs) {
-			perf_fetch_caller_regs(&hot_regs);
-			regs = &hot_regs;
-		}
-		__perf_sw_event(event_id, nr, nmi, regs, addr);
+	struct pt_regs hot_regs;
+
+	JUMP_LABEL(&perf_swevent_enabled[event_id], have_event);
+	return;
+
+have_event:
+	if (!regs) {
+		perf_fetch_caller_regs(&hot_regs);
+		regs = &hot_regs;
 	}
+	__perf_sw_event(event_id, nr, nmi, regs, addr);
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -976,7 +1028,21 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
 
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+/* Callchains */
+DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
+
+extern void perf_callchain_user(struct perf_callchain_entry *entry,
+				struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
+				  struct pt_regs *regs);
+
+
+static inline void
+perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+	if (entry->nr < PERF_MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
 
 extern int sysctl_perf_event_paranoid;
 extern int sysctl_perf_event_mlock;
@@ -1019,21 +1085,18 @@ extern int perf_swevent_get_recursion_context(void);
 extern void perf_swevent_put_recursion_context(int rctx);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
+extern void perf_event_task_tick(void);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task)			{ }
 static inline void
 perf_event_task_sched_out(struct task_struct *task,
 			    struct task_struct *next)			{ }
-static inline void
-perf_event_task_tick(struct task_struct *task)				{ }
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
-static inline void perf_event_do_pending(void)				{ }
+static inline void perf_event_delayed_put(struct task_struct *task)	{ }
 static inline void perf_event_print_debug(void)				{ }
-static inline void perf_disable(void)					{ }
-static inline void perf_enable(void)					{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
 
@@ -1056,6 +1119,7 @@ static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
 static inline void perf_swevent_put_recursion_context(int rctx)		{ }
 static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)		{ }
+static inline void perf_event_task_tick(void)				{ }
 #endif
 
 #define perf_output_put(handle, x) \
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 634b8e674ac..a39cbed9ee1 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr)
 {
 	return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
 }
+#define radix_tree_indirect_to_ptr(ptr) \
+	radix_tree_indirect_to_ptr((void __force *)(ptr))
 
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
@@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 struct radix_tree_root {
 	unsigned int		height;
 	gfp_t			gfp_mask;
-	struct radix_tree_node	*rnode;
+	struct radix_tree_node	__rcu *rnode;
 };
 
 #define RADIX_TREE_INIT(mask)	{					\
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4ec3b38ce9c..f31ef61f1c6 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -10,6 +10,21 @@
 #include <linux/rcupdate.h>
 
 /*
+ * Why is there no list_empty_rcu()?  Because list_empty() serves this
+ * purpose.  The list_empty() function fetches the RCU-protected pointer
+ * and compares it to the address of the list head, but neither dereferences
+ * this pointer itself nor provides this pointer to the caller.  Therefore,
+ * it is not necessary to use rcu_dereference(), so that list_empty() can
+ * be used anywhere you would want to use a list_empty_rcu().
+ */
+
+/*
+ * return the ->next pointer of a list_head in an rcu safe
+ * way, we must not access it directly
+ */
+#define list_next_rcu(list)	(*((struct list_head __rcu **)(&(list)->next)))
+
+/*
  * Insert a new entry between two known consecutive entries.
  *
  * This is only for internal list manipulation where we know
@@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new,
 {
 	new->next = next;
 	new->prev = prev;
-	rcu_assign_pointer(prev->next, new);
+	rcu_assign_pointer(list_next_rcu(prev), new);
 	next->prev = new;
 }
 
@@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old,
 {
 	new->next = old->next;
 	new->prev = old->prev;
-	rcu_assign_pointer(new->prev->next, new);
+	rcu_assign_pointer(list_next_rcu(new->prev), new);
 	new->next->prev = new;
 	old->prev = LIST_POISON2;
 }
@@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
 	 */
 
 	last->next = at;
-	rcu_assign_pointer(head->next, first);
+	rcu_assign_pointer(list_next_rcu(head), first);
 	first->prev = head;
 	at->prev = last;
 }
@@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
  * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
  */
 #define list_entry_rcu(ptr, type, member) \
-	container_of(rcu_dereference_raw(ptr), type, member)
+	({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \
+	 container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \
+	})
 
 /**
  * list_first_entry_rcu - get the first element from a list
@@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
 	list_entry_rcu((ptr)->next, type, member)
 
 #define __list_for_each_rcu(pos, head) \
-	for (pos = rcu_dereference_raw((head)->next); \
+	for (pos = rcu_dereference_raw(list_next_rcu(head)); \
 		pos != (head); \
-		pos = rcu_dereference_raw(pos->next))
+		pos = rcu_dereference_raw(list_next_rcu((pos)))
 
 /**
  * list_for_each_entry_rcu	-	iterate over rcu list of given type
@@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_continue_rcu(pos, head) \
-	for ((pos) = rcu_dereference_raw((pos)->next); \
+	for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
 		prefetch((pos)->next), (pos) != (head); \
-		(pos) = rcu_dereference_raw((pos)->next))
+		(pos) = rcu_dereference_raw(list_next_rcu(pos)))
 
 /**
  * list_for_each_entry_continue_rcu - continue iteration over list of given type
@@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
 
 	new->next = next;
 	new->pprev = old->pprev;
-	rcu_assign_pointer(*new->pprev, new);
+	rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
 	if (next)
 		new->next->pprev = &new->next;
 	old->pprev = LIST_POISON2;
 }
 
+/*
+ * return the first or the next element in an RCU protected hlist
+ */
+#define hlist_first_rcu(head)	(*((struct hlist_node __rcu **)(&(head)->first)))
+#define hlist_next_rcu(node)	(*((struct hlist_node __rcu **)(&(node)->next)))
+#define hlist_pprev_rcu(node)	(*((struct hlist_node __rcu **)((node)->pprev)))
+
 /**
  * hlist_add_head_rcu
  * @n: the element to add to the hash list.
@@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n,
 
 	n->next = first;
 	n->pprev = &h->first;
-	rcu_assign_pointer(h->first, n);
+	rcu_assign_pointer(hlist_first_rcu(h), n);
 	if (first)
 		first->pprev = &n->next;
 }
@@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n,
 {
 	n->pprev = next->pprev;
 	n->next = next;
-	rcu_assign_pointer(*(n->pprev), n);
+	rcu_assign_pointer(hlist_pprev_rcu(n), n);
 	next->pprev = &n->next;
 }
 
@@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 {
 	n->next = prev->next;
 	n->pprev = &prev->next;
-	rcu_assign_pointer(prev->next, n);
+	rcu_assign_pointer(hlist_next_rcu(prev), n);
 	if (n->next)
 		n->next->pprev = &n->next;
 }
 
-#define __hlist_for_each_rcu(pos, head)			\
-	for (pos = rcu_dereference((head)->first);	\
-	     pos && ({ prefetch(pos->next); 1; });	\
-	     pos = rcu_dereference(pos->next))
+#define __hlist_for_each_rcu(pos, head)				\
+	for (pos = rcu_dereference(hlist_first_rcu(head));	\
+	     pos && ({ prefetch(pos->next); 1; });		\
+	     pos = rcu_dereference(hlist_next_rcu(pos)))
 
 /**
  * hlist_for_each_entry_rcu - iterate over rcu list of given type
@@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
  * the _rcu list-mutation primitives such as hlist_add_head_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define hlist_for_each_entry_rcu(tpos, pos, head, member)		 \
-	for (pos = rcu_dereference_raw((head)->first);			 \
+#define hlist_for_each_entry_rcu(tpos, pos, head, member)		\
+	for (pos = rcu_dereference_raw(hlist_first_rcu(head));		\
 		pos && ({ prefetch(pos->next); 1; }) &&			 \
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
-		pos = rcu_dereference_raw(pos->next))
+		pos = rcu_dereference_raw(hlist_next_rcu(pos)))
 
 /**
  * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index b70ffe53cb9..2ae13714828 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
 	}
 }
 
+#define hlist_nulls_first_rcu(head) \
+	(*((struct hlist_nulls_node __rcu __force **)&(head)->first))
+
+#define hlist_nulls_next_rcu(node) \
+	(*((struct hlist_nulls_node __rcu __force **)&(node)->next))
+
 /**
  * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
  * @n: the element to delete from the hash list.
@@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
 
 	n->next = first;
 	n->pprev = &h->first;
-	rcu_assign_pointer(h->first, n);
+	rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
 	if (!is_a_nulls(first))
 		first->pprev = &n->next;
 }
@@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
  * @member:	the name of the hlist_nulls_node within the struct.
  *
  */
-#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
-	for (pos = rcu_dereference_raw((head)->first);			 \
-		(!is_a_nulls(pos)) &&			\
+#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)			\
+	for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));		\
+		(!is_a_nulls(pos)) &&						\
 		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
-		pos = rcu_dereference_raw(pos->next))
+		pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
 
 #endif
 #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 83af1f8d8b7..03cda7bed98 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,11 +41,15 @@
 #include <linux/lockdep.h>
 #include <linux/completion.h>
 #include <linux/debugobjects.h>
+#include <linux/compiler.h>
 
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable; /* for sysctl */
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
+#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
+#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
+
 /**
  * struct rcu_head - callback structure for use with RCU
  * @next: next update requests in a list
@@ -57,29 +61,94 @@ struct rcu_head {
 };
 
 /* Exported common interfaces */
-extern void rcu_barrier(void);
+extern void call_rcu_sched(struct rcu_head *head,
+			   void (*func)(struct rcu_head *rcu));
+extern void synchronize_sched(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
 extern void synchronize_sched_expedited(void);
 extern int sched_expedited_torture_stats(char *page);
 
+static inline void __rcu_read_lock_bh(void)
+{
+	local_bh_disable();
+}
+
+static inline void __rcu_read_unlock_bh(void)
+{
+	local_bh_enable();
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+void synchronize_rcu(void);
+
+/*
+ * Defined as a macro as it is a very low level header included from
+ * areas that don't even know about current.  This gives the rcu_read_lock()
+ * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
+ * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
+ */
+#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+static inline void __rcu_read_lock(void)
+{
+	preempt_disable();
+}
+
+static inline void __rcu_read_unlock(void)
+{
+	preempt_enable();
+}
+
+static inline void synchronize_rcu(void)
+{
+	synchronize_sched();
+}
+
+static inline int rcu_preempt_depth(void)
+{
+	return 0;
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
 /* Internal to kernel */
 extern void rcu_init(void);
+extern void rcu_sched_qs(int cpu);
+extern void rcu_bh_qs(int cpu);
+extern void rcu_check_callbacks(int cpu, int user);
+struct notifier_block;
+
+#ifdef CONFIG_NO_HZ
+
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static inline void rcu_enter_nohz(void)
+{
+}
+
+static inline void rcu_exit_nohz(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
-#elif defined(CONFIG_TINY_RCU)
+#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
 #include <linux/rcutiny.h>
 #else
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
 
-#define RCU_HEAD_INIT	{ .next = NULL, .func = NULL }
-#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
-#define INIT_RCU_HEAD(ptr) do { \
-       (ptr)->next = NULL; (ptr)->func = NULL; \
-} while (0)
-
 /*
  * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
  * initialization and destruction of rcu_head on the stack. rcu_head structures
@@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map;
 extern int debug_lockdep_rcu_enabled(void);
 
 /**
- * rcu_read_lock_held - might we be in RCU read-side critical section?
+ * rcu_read_lock_held() - might we be in RCU read-side critical section?
  *
  * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
  * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
  * this assumes we are in an RCU read-side critical section unless it can
- * prove otherwise.
+ * prove otherwise.  This is useful for debug checks in functions that
+ * require that they be called within an RCU read-side critical section.
  *
- * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
  * and while lockdep is disabled.
  */
 static inline int rcu_read_lock_held(void)
@@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void)
 extern int rcu_read_lock_bh_held(void);
 
 /**
- * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
  *
  * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
  * RCU-sched read-side critical section.  In absence of
  * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
  * critical section unless it can prove otherwise.  Note that disabling
  * of preemption (including disabling irqs) counts as an RCU-sched
- * read-side critical section.
+ * read-side critical section.  This is useful for debug checks in functions
+ * that required that they be called within an RCU-sched read-side
+ * critical section.
  *
  * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
  * and while lockdep is disabled.
@@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void)
 
 extern int rcu_my_thread_group_empty(void);
 
-#define __do_rcu_dereference_check(c)					\
+/**
+ * rcu_lockdep_assert - emit lockdep splat if specified condition not met
+ * @c: condition to check
+ */
+#define rcu_lockdep_assert(c)						\
 	do {								\
 		static bool __warned;					\
 		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
@@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void);
 		}							\
 	} while (0)
 
+#else /* #ifdef CONFIG_PROVE_RCU */
+
+#define rcu_lockdep_assert(c) do { } while (0)
+
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
+/*
+ * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
+ * and rcu_assign_pointer().  Some of these could be folded into their
+ * callers, but they are left separate in order to ease introduction of
+ * multiple flavors of pointers to match the multiple flavors of RCU
+ * (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in
+ * the future.
+ */
+
+#ifdef __CHECKER__
+#define rcu_dereference_sparse(p, space) \
+	((void)(((typeof(*p) space *)p) == p))
+#else /* #ifdef __CHECKER__ */
+#define rcu_dereference_sparse(p, space)
+#endif /* #else #ifdef __CHECKER__ */
+
+#define __rcu_access_pointer(p, space) \
+	({ \
+		typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+		rcu_dereference_sparse(p, space); \
+		((typeof(*p) __force __kernel *)(_________p1)); \
+	})
+#define __rcu_dereference_check(p, c, space) \
+	({ \
+		typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+		rcu_lockdep_assert(c); \
+		rcu_dereference_sparse(p, space); \
+		smp_read_barrier_depends(); \
+		((typeof(*p) __force __kernel *)(_________p1)); \
+	})
+#define __rcu_dereference_protected(p, c, space) \
+	({ \
+		rcu_lockdep_assert(c); \
+		rcu_dereference_sparse(p, space); \
+		((typeof(*p) __force __kernel *)(p)); \
+	})
+
+#define __rcu_dereference_index_check(p, c) \
+	({ \
+		typeof(p) _________p1 = ACCESS_ONCE(p); \
+		rcu_lockdep_assert(c); \
+		smp_read_barrier_depends(); \
+		(_________p1); \
+	})
+#define __rcu_assign_pointer(p, v, space) \
+	({ \
+		if (!__builtin_constant_p(v) || \
+		    ((v) != NULL)) \
+			smp_wmb(); \
+		(p) = (typeof(*v) __force space *)(v); \
+	})
+
+
+/**
+ * rcu_access_pointer() - fetch RCU pointer with no dereferencing
+ * @p: The pointer to read
+ *
+ * Return the value of the specified RCU-protected pointer, but omit the
+ * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
+ * when the value of this pointer is accessed, but the pointer is not
+ * dereferenced, for example, when testing an RCU-protected pointer against
+ * NULL.  Although rcu_access_pointer() may also be used in cases where
+ * update-side locks prevent the value of the pointer from changing, you
+ * should instead use rcu_dereference_protected() for this use case.
+ */
+#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
+
 /**
- * rcu_dereference_check - rcu_dereference with debug checking
+ * rcu_dereference_check() - rcu_dereference with debug checking
  * @p: The pointer to read, prior to dereferencing
  * @c: The conditions under which the dereference will take place
  *
  * Do an rcu_dereference(), but check that the conditions under which the
- * dereference will take place are correct.  Typically the conditions indicate
- * the various locking conditions that should be held at that point.  The check
- * should return true if the conditions are satisfied.
+ * dereference will take place are correct.  Typically the conditions
+ * indicate the various locking conditions that should be held at that
+ * point.  The check should return true if the conditions are satisfied.
+ * An implicit check for being in an RCU read-side critical section
+ * (rcu_read_lock()) is included.
  *
  * For example:
  *
- *	bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- *					      lockdep_is_held(&foo->lock));
+ *	bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
  *
  * could be used to indicate to lockdep that foo->bar may only be dereferenced
- * if either the RCU read lock is held, or that the lock required to replace
+ * if either rcu_read_lock() is held, or that the lock required to replace
  * the bar struct at foo->bar is held.
  *
  * Note that the list of conditions may also include indications of when a lock
  * need not be held, for example during initialisation or destruction of the
  * target struct:
  *
- *	bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- *					      lockdep_is_held(&foo->lock) ||
+ *	bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
  *					      atomic_read(&foo->usage) == 0);
+ *
+ * Inserts memory barriers on architectures that require them
+ * (currently only the Alpha), prevents the compiler from refetching
+ * (and from merging fetches), and, more importantly, documents exactly
+ * which pointers are protected by RCU and checks that the pointer is
+ * annotated as __rcu.
  */
 #define rcu_dereference_check(p, c) \
-	({ \
-		__do_rcu_dereference_check(c); \
-		rcu_dereference_raw(p); \
-	})
+	__rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)
+
+/**
+ * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_bh_check(p, c) \
+	__rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu)
 
 /**
- * rcu_dereference_protected - fetch RCU pointer when updates prevented
+ * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_sched_check(p, c) \
+	__rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \
+				__rcu)
+
+#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
+
+/**
+ * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * Similar to rcu_dereference_check(), but omits the sparse checking.
+ * This allows rcu_dereference_index_check() to be used on integers,
+ * which can then be used as array indices.  Attempting to use
+ * rcu_dereference_check() on an integer will give compiler warnings
+ * because the sparse address-space mechanism relies on dereferencing
+ * the RCU-protected pointer.  Dereferencing integers is not something
+ * that even gcc will put up with.
+ *
+ * Note that this function does not implicitly check for RCU read-side
+ * critical sections.  If this function gains lots of uses, it might
+ * make sense to provide versions for each flavor of RCU, but it does
+ * not make sense as of early 2010.
+ */
+#define rcu_dereference_index_check(p, c) \
+	__rcu_dereference_index_check((p), (c))
+
+/**
+ * rcu_dereference_protected() - fetch RCU pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
  *
  * Return the value of the specified RCU-protected pointer, but omit
  * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
@@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void);
  * prevent the compiler from repeating this reference or combining it
  * with other references, so it should not be used without protection
  * of appropriate locks.
+ *
+ * This function is only for update-side use.  Using this function
+ * when protected only by rcu_read_lock() will result in infrequent
+ * but very ugly failures.
  */
 #define rcu_dereference_protected(p, c) \
-	({ \
-		__do_rcu_dereference_check(c); \
-		(p); \
-	})
+	__rcu_dereference_protected((p), (c), __rcu)
 
-#else /* #ifdef CONFIG_PROVE_RCU */
+/**
+ * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_bh_protected(p, c) \
+	__rcu_dereference_protected((p), (c), __rcu)
 
-#define rcu_dereference_check(p, c)	rcu_dereference_raw(p)
-#define rcu_dereference_protected(p, c) (p)
+/**
+ * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_sched_protected(p, c) \
+	__rcu_dereference_protected((p), (c), __rcu)
 
-#endif /* #else #ifdef CONFIG_PROVE_RCU */
 
 /**
- * rcu_access_pointer - fetch RCU pointer with no dereferencing
+ * rcu_dereference() - fetch RCU-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
  *
- * Return the value of the specified RCU-protected pointer, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
- * when the value of this pointer is accessed, but the pointer is not
- * dereferenced, for example, when testing an RCU-protected pointer against
- * NULL.  This may also be used in cases where update-side locks prevent
- * the value of the pointer from changing, but rcu_dereference_protected()
- * is a lighter-weight primitive for this use case.
+ * This is a simple wrapper around rcu_dereference_check().
+ */
+#define rcu_dereference(p) rcu_dereference_check(p, 0)
+
+/**
+ * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_check() do the dirty work.
+ */
+#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)
+
+/**
+ * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_check() do the dirty work.
  */
-#define rcu_access_pointer(p)	ACCESS_ONCE(p)
+#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
 
 /**
- * rcu_read_lock - mark the beginning of an RCU read-side critical section.
+ * rcu_read_lock() - mark the beginning of an RCU read-side critical section
  *
  * When synchronize_rcu() is invoked on one CPU while other CPUs
  * are within RCU read-side critical sections, then the
@@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void);
  * until after the all the other CPUs exit their critical sections.
  *
  * Note, however, that RCU callbacks are permitted to run concurrently
- * with RCU read-side critical sections.  One way that this can happen
+ * with new RCU read-side critical sections.  One way that this can happen
  * is via the following sequence of events: (1) CPU 0 enters an RCU
  * read-side critical section, (2) CPU 1 invokes call_rcu() to register
  * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
@@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void);
  * will be deferred until the outermost RCU read-side critical section
  * completes.
  *
- * It is illegal to block while in an RCU read-side critical section.
+ * You can avoid reading and understanding the next paragraph by
+ * following this rule: don't put anything in an rcu_read_lock() RCU
+ * read-side critical section that would block in a !PREEMPT kernel.
+ * But if you want the full story, read on!
+ *
+ * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
+ * is illegal to block while in an RCU read-side critical section.  In
+ * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
+ * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
+ * be preempted, but explicit blocking is illegal.  Finally, in preemptible
+ * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
+ * RCU read-side critical sections may be preempted and they may also
+ * block, but only when acquiring spinlocks that are subject to priority
+ * inheritance.
  */
 static inline void rcu_read_lock(void)
 {
@@ -337,7 +574,7 @@ static inline void rcu_read_lock(void)
  */
 
 /**
- * rcu_read_unlock - marks the end of an RCU read-side critical section.
+ * rcu_read_unlock() - marks the end of an RCU read-side critical section.
  *
  * See rcu_read_lock() for more information.
  */
@@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void)
 }
 
 /**
- * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
+ * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
  *
  * This is equivalent of rcu_read_lock(), but to be used when updates
- * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks
- * consider completion of a softirq handler to be a quiescent state,
- * a process in RCU read-side critical section must be protected by
- * disabling softirqs. Read-side critical sections in interrupt context
- * can use just rcu_read_lock().
- *
+ * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since
+ * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a
+ * softirq handler to be a quiescent state, a process in RCU read-side
+ * critical section must be protected by disabling softirqs. Read-side
+ * critical sections in interrupt context can use just rcu_read_lock(),
+ * though this should at least be commented to avoid confusing people
+ * reading the code.
  */
 static inline void rcu_read_lock_bh(void)
 {
@@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void)
 }
 
 /**
- * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section
+ * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
  *
- * Should be used with either
- * - synchronize_sched()
- * or
- * - call_rcu_sched() and rcu_barrier_sched()
- * on the write-side to insure proper synchronization.
+ * This is equivalent of rcu_read_lock(), but to be used when updates
+ * are being done using call_rcu_sched() or synchronize_rcu_sched().
+ * Read-side critical sections can also be introduced by anything that
+ * disables preemption, including local_irq_disable() and friends.
  */
 static inline void rcu_read_lock_sched(void)
 {
@@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 	preempt_enable_notrace();
 }
 
-
 /**
- * rcu_dereference_raw - fetch an RCU-protected pointer
+ * rcu_assign_pointer() - assign to RCU-protected pointer
+ * @p: pointer to assign to
+ * @v: value to assign (publish)
  *
- * The caller must be within some flavor of RCU read-side critical
- * section, or must be otherwise preventing the pointer from changing,
- * for example, by holding an appropriate lock.  This pointer may later
- * be safely dereferenced.  It is the caller's responsibility to have
- * done the right thing, as this primitive does no checking of any kind.
- *
- * Inserts memory barriers on architectures that require them
- * (currently only the Alpha), and, more importantly, documents
- * exactly which pointers are protected by RCU.
- */
-#define rcu_dereference_raw(p)	({ \
-				typeof(p) _________p1 = ACCESS_ONCE(p); \
-				smp_read_barrier_depends(); \
-				(_________p1); \
-				})
-
-/**
- * rcu_dereference - fetch an RCU-protected pointer, checking for RCU
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference(p) \
-	rcu_dereference_check(p, rcu_read_lock_held())
-
-/**
- * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_bh(p) \
-		rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled())
-
-/**
- * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_sched(p) \
-		rcu_dereference_check(p, rcu_read_lock_sched_held())
-
-/**
- * rcu_assign_pointer - assign (publicize) a pointer to a newly
- * initialized structure that will be dereferenced by RCU read-side
- * critical sections.  Returns the value assigned.
+ * Assigns the specified value to the specified RCU-protected
+ * pointer, ensuring that any concurrent RCU readers will see
+ * any prior initialization.  Returns the value assigned.
  *
  * Inserts memory barriers on architectures that require them
  * (pretty much all of them other than x86), and also prevents
@@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * call documents which pointers will be dereferenced by RCU read-side
  * code.
  */
-
 #define rcu_assign_pointer(p, v) \
-	({ \
-		if (!__builtin_constant_p(v) || \
-		    ((v) != NULL)) \
-			smp_wmb(); \
-		(p) = (v); \
-	})
+	__rcu_assign_pointer((p), (v), __rcu)
+
+/**
+ * RCU_INIT_POINTER() - initialize an RCU protected pointer
+ *
+ * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep
+ * splats.
+ */
+#define RCU_INIT_POINTER(p, v) \
+		p = (typeof(*v) __force __rcu *)(v)
 
 /* Infrastructure to implement the synchronize_() primitives. */
 
@@ -494,26 +694,37 @@ struct rcu_synchronize {
 
 extern void wakeme_after_rcu(struct rcu_head  *head);
 
+#ifdef CONFIG_PREEMPT_RCU
+
 /**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
  *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.  RCU read-side critical
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * and may be nested.
  */
 extern void call_rcu(struct rcu_head *head,
 			      void (*func)(struct rcu_head *head));
 
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/* In classic RCU, call_rcu() is just call_rcu_sched(). */
+#define	call_rcu	call_rcu_sched
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
 /**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
  * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
  *
- * The update function will be invoked some time after a full grace
+ * The callback function will be invoked some time after a full grace
  * period elapses, in other words after all currently executing RCU
  * read-side critical sections have completed. call_rcu_bh() assumes
  * that the read-side critical sections end on completion of a softirq
@@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
 }
 #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
-#ifndef CONFIG_PROVE_RCU
-#define __do_rcu_dereference_check(c) do { } while (0)
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
-#define __rcu_dereference_index_check(p, c) \
-	({ \
-		typeof(p) _________p1 = ACCESS_ONCE(p); \
-		__do_rcu_dereference_check(c); \
-		smp_read_barrier_depends(); \
-		(_________p1); \
-	})
-
-/**
- * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * Similar to rcu_dereference_check(), but omits the sparse checking.
- * This allows rcu_dereference_index_check() to be used on integers,
- * which can then be used as array indices.  Attempting to use
- * rcu_dereference_check() on an integer will give compiler warnings
- * because the sparse address-space mechanism relies on dereferencing
- * the RCU-protected pointer.  Dereferencing integers is not something
- * that even gcc will put up with.
- *
- * Note that this function does not implicitly check for RCU read-side
- * critical sections.  If this function gains lots of uses, it might
- * make sense to provide versions for each flavor of RCU, but it does
- * not make sense as of early 2010.
- */
-#define rcu_dereference_index_check(p, c) \
-	__rcu_dereference_index_check((p), (c))
-
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index e2e893144a8..13877cb93a6 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,103 +27,101 @@
 
 #include <linux/cache.h>
 
-void rcu_sched_qs(int cpu);
-void rcu_bh_qs(int cpu);
-static inline void rcu_note_context_switch(int cpu)
-{
-	rcu_sched_qs(cpu);
-}
+#define rcu_init_sched()	do { } while (0)
 
-#define __rcu_read_lock()	preempt_disable()
-#define __rcu_read_unlock()	preempt_enable()
-#define __rcu_read_lock_bh()	local_bh_disable()
-#define __rcu_read_unlock_bh()	local_bh_enable()
-#define call_rcu_sched		call_rcu
+#ifdef CONFIG_TINY_RCU
 
-#define rcu_init_sched()	do { } while (0)
-extern void rcu_check_callbacks(int cpu, int user);
+static inline void synchronize_rcu_expedited(void)
+{
+	synchronize_sched();	/* Only one CPU, so pretty fast anyway!!! */
+}
 
-static inline int rcu_needs_cpu(int cpu)
+static inline void rcu_barrier(void)
 {
-	return 0;
+	rcu_barrier_sched();  /* Only one CPU, so only one list of callbacks! */
 }
 
-/*
- * Return the number of grace periods.
- */
-static inline long rcu_batches_completed(void)
+#else /* #ifdef CONFIG_TINY_RCU */
+
+void rcu_barrier(void);
+void synchronize_rcu_expedited(void);
+
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+static inline void synchronize_rcu_bh(void)
 {
-	return 0;
+	synchronize_sched();
 }
 
-/*
- * Return the number of bottom-half grace periods.
- */
-static inline long rcu_batches_completed_bh(void)
+static inline void synchronize_rcu_bh_expedited(void)
 {
-	return 0;
+	synchronize_sched();
 }
 
-static inline void rcu_force_quiescent_state(void)
+#ifdef CONFIG_TINY_RCU
+
+static inline void rcu_preempt_note_context_switch(void)
 {
 }
 
-static inline void rcu_bh_force_quiescent_state(void)
+static inline void exit_rcu(void)
 {
 }
 
-static inline void rcu_sched_force_quiescent_state(void)
+static inline int rcu_needs_cpu(int cpu)
 {
+	return 0;
 }
 
-extern void synchronize_sched(void);
+#else /* #ifdef CONFIG_TINY_RCU */
+
+void rcu_preempt_note_context_switch(void);
+extern void exit_rcu(void);
+int rcu_preempt_needs_cpu(void);
 
-static inline void synchronize_rcu(void)
+static inline int rcu_needs_cpu(int cpu)
 {
-	synchronize_sched();
+	return rcu_preempt_needs_cpu();
 }
 
-static inline void synchronize_rcu_bh(void)
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+static inline void rcu_note_context_switch(int cpu)
 {
-	synchronize_sched();
+	rcu_sched_qs(cpu);
+	rcu_preempt_note_context_switch();
 }
 
-static inline void synchronize_rcu_expedited(void)
+/*
+ * Return the number of grace periods.
+ */
+static inline long rcu_batches_completed(void)
 {
-	synchronize_sched();
+	return 0;
 }
 
-static inline void synchronize_rcu_bh_expedited(void)
+/*
+ * Return the number of bottom-half grace periods.
+ */
+static inline long rcu_batches_completed_bh(void)
 {
-	synchronize_sched();
+	return 0;
 }
 
-struct notifier_block;
-
-#ifdef CONFIG_NO_HZ
-
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
-
-#else /* #ifdef CONFIG_NO_HZ */
-
-static inline void rcu_enter_nohz(void)
+static inline void rcu_force_quiescent_state(void)
 {
 }
 
-static inline void rcu_exit_nohz(void)
+static inline void rcu_bh_force_quiescent_state(void)
 {
 }
 
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
-static inline void exit_rcu(void)
+static inline void rcu_sched_force_quiescent_state(void)
 {
 }
 
-static inline int rcu_preempt_depth(void)
+static inline void rcu_cpu_stall_reset(void)
 {
-	return 0;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index c0ed1c056f2..95518e62879 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,64 +30,23 @@
 #ifndef __LINUX_RCUTREE_H
 #define __LINUX_RCUTREE_H
 
-struct notifier_block;
-
-extern void rcu_sched_qs(int cpu);
-extern void rcu_bh_qs(int cpu);
 extern void rcu_note_context_switch(int cpu);
 extern int rcu_needs_cpu(int cpu);
+extern void rcu_cpu_stall_reset(void);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
-extern void __rcu_read_lock(void);
-extern void __rcu_read_unlock(void);
-extern void synchronize_rcu(void);
 extern void exit_rcu(void);
 
-/*
- * Defined as macro as it is a very low level header
- * included from areas that don't even know about current
- */
-#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
-
 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
-static inline void __rcu_read_lock(void)
-{
-	preempt_disable();
-}
-
-static inline void __rcu_read_unlock(void)
-{
-	preempt_enable();
-}
-
-#define synchronize_rcu synchronize_sched
-
 static inline void exit_rcu(void)
 {
 }
 
-static inline int rcu_preempt_depth(void)
-{
-	return 0;
-}
-
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
-static inline void __rcu_read_lock_bh(void)
-{
-	local_bh_disable();
-}
-static inline void __rcu_read_unlock_bh(void)
-{
-	local_bh_enable();
-}
-
-extern void call_rcu_sched(struct rcu_head *head,
-			   void (*func)(struct rcu_head *rcu));
 extern void synchronize_rcu_bh(void);
-extern void synchronize_sched(void);
 extern void synchronize_rcu_expedited(void);
 
 static inline void synchronize_rcu_bh_expedited(void)
@@ -95,7 +54,7 @@ static inline void synchronize_rcu_bh_expedited(void)
 	synchronize_sched_expedited();
 }
 
-extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_barrier(void);
 
 extern long rcu_batches_completed(void);
 extern long rcu_batches_completed_bh(void);
@@ -104,18 +63,6 @@ extern void rcu_force_quiescent_state(void);
 extern void rcu_bh_force_quiescent_state(void);
 extern void rcu_sched_force_quiescent_state(void);
 
-#ifdef CONFIG_NO_HZ
-void rcu_enter_nohz(void);
-void rcu_exit_nohz(void);
-#else /* CONFIG_NO_HZ */
-static inline void rcu_enter_nohz(void)
-{
-}
-static inline void rcu_exit_nohz(void)
-{
-}
-#endif /* CONFIG_NO_HZ */
-
 /* A context switch is a grace period for RCU-sched and RCU-bh. */
 static inline int rcu_blocking_is_gp(void)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db2d7d..0383601a927 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ enum sched_domain_level {
 	SD_LV_NONE = 0,
 	SD_LV_SIBLING,
 	SD_LV_MC,
+	SD_LV_BOOK,
 	SD_LV_CPU,
 	SD_LV_NODE,
 	SD_LV_ALLNODES,
@@ -1160,6 +1161,13 @@ struct sched_rt_entity {
 
 struct rcu_node;
 
+enum perf_event_task_context {
+	perf_invalid_context = -1,
+	perf_hw_context = 0,
+	perf_sw_context,
+	perf_nr_task_contexts,
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1202,11 +1210,13 @@ struct task_struct {
 	unsigned int policy;
 	cpumask_t cpus_allowed;
 
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
 	int rcu_read_lock_nesting;
 	char rcu_read_unlock_special;
-	struct rcu_node *rcu_blocked_node;
 	struct list_head rcu_node_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -1288,9 +1298,9 @@ struct task_struct {
 	struct list_head cpu_timers[3];
 
 /* process credentials */
-	const struct cred *real_cred;	/* objective and real subjective task
+	const struct cred __rcu *real_cred; /* objective and real subjective task
 					 * credentials (COW) */
-	const struct cred *cred;	/* effective (overridable) subjective task
+	const struct cred __rcu *cred;	/* effective (overridable) subjective task
 					 * credentials (COW) */
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
@@ -1418,7 +1428,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_CGROUPS
 	/* Control Group info protected by css_set_lock */
-	struct css_set *cgroups;
+	struct css_set __rcu *cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock */
 	struct list_head cg_list;
 #endif
@@ -1431,7 +1441,7 @@ struct task_struct {
 	struct futex_pi_state *pi_state_cache;
 #endif
 #ifdef CONFIG_PERF_EVENTS
-	struct perf_event_context *perf_event_ctxp;
+	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
 	struct mutex perf_event_mutex;
 	struct list_head perf_event_list;
 #endif
@@ -1681,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 /*
  * Per process flags
  */
-#define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
-					/* Not implemented yet, only for 486*/
+#define PF_KSOFTIRQD	0x00000001	/* I am ksoftirqd */
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
@@ -1740,7 +1749,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
 #define used_math() tsk_used_math(current)
 
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
 
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
@@ -1749,7 +1758,9 @@ static inline void rcu_copy_process(struct task_struct *p)
 {
 	p->rcu_read_lock_nesting = 0;
 	p->rcu_read_unlock_special = 0;
+#ifdef CONFIG_TREE_PREEMPT_RCU
 	p->rcu_blocked_node = NULL;
+#endif
 	INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
@@ -1826,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
+ * The reason for this explicit opt-in is not to have perf penalty with
+ * slow sched_clocks.
+ */
+extern void enable_sched_clock_irqtime(void);
+extern void disable_sched_clock_irqtime(void);
+#else
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
+#endif
+
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@ -2367,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
 
 extern int __cond_resched_softirq(void);
 
-#define cond_resched_softirq() ({				\
-	__might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);	\
-	__cond_resched_softirq();				\
+#define cond_resched_softirq() ({					\
+	__might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);	\
+	__cond_resched_softirq();					\
 })
 
 /*
diff --git a/include/linux/security.h b/include/linux/security.h
index a22219afff0..b8246a8df7d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot,
 extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
 extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5);
-extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp);
+extern int cap_task_setscheduler(struct task_struct *p);
 extern int cap_task_setioprio(struct task_struct *p, int ioprio);
 extern int cap_task_setnice(struct task_struct *p, int nice);
 extern int cap_syslog(int type, bool from_file);
@@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Sets the new child socket's sid to the openreq sid.
  * @inet_conn_established:
  *	Sets the connection's peersid to the secmark on skb.
+ * @secmark_relabel_packet:
+ *	check if the process should be allowed to relabel packets to the given secid
+ * @security_secmark_refcount_inc
+ *	tells the LSM to increment the number of secmark labeling rules loaded
+ * @security_secmark_refcount_dec
+ *	tells the LSM to decrement the number of secmark labeling rules loaded
  * @req_classify_flow:
  *	Sets the flow's sid to the openreq sid.
  * @tun_dev_create:
@@ -1279,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Return 0 if permission is granted.
  *
  * @secid_to_secctx:
- *	Convert secid to security context.
+ *	Convert secid to security context.  If secdata is NULL the length of
+ *	the result will be returned in seclen, but no secdata will be returned.
+ *	This does mean that the length could change between calls to check the
+ *	length and the next call which actually allocates and returns the secdata.
  *	@secid contains the security ID.
  *	@secdata contains the pointer that stores the converted security context.
+ *	@seclen pointer which contains the length of the data
  * @secctx_to_secid:
  *	Convert security context to secid.
  *	@secid contains the pointer to the generated security ID.
@@ -1501,8 +1511,7 @@ struct security_operations {
 	int (*task_getioprio) (struct task_struct *p);
 	int (*task_setrlimit) (struct task_struct *p, unsigned int resource,
 			struct rlimit *new_rlim);
-	int (*task_setscheduler) (struct task_struct *p, int policy,
-				  struct sched_param *lp);
+	int (*task_setscheduler) (struct task_struct *p);
 	int (*task_getscheduler) (struct task_struct *p);
 	int (*task_movememory) (struct task_struct *p);
 	int (*task_kill) (struct task_struct *p,
@@ -1594,6 +1603,9 @@ struct security_operations {
 				  struct request_sock *req);
 	void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req);
 	void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb);
+	int (*secmark_relabel_packet) (u32 secid);
+	void (*secmark_refcount_inc) (void);
+	void (*secmark_refcount_dec) (void);
 	void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl);
 	int (*tun_dev_create)(void);
 	void (*tun_dev_post_create)(struct sock *sk);
@@ -1752,8 +1764,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio);
 int security_task_getioprio(struct task_struct *p);
 int security_task_setrlimit(struct task_struct *p, unsigned int resource,
 		struct rlimit *new_rlim);
-int security_task_setscheduler(struct task_struct *p,
-				int policy, struct sched_param *lp);
+int security_task_setscheduler(struct task_struct *p);
 int security_task_getscheduler(struct task_struct *p);
 int security_task_movememory(struct task_struct *p);
 int security_task_kill(struct task_struct *p, struct siginfo *info,
@@ -2320,11 +2331,9 @@ static inline int security_task_setrlimit(struct task_struct *p,
 	return 0;
 }
 
-static inline int security_task_setscheduler(struct task_struct *p,
-					     int policy,
-					     struct sched_param *lp)
+static inline int security_task_setscheduler(struct task_struct *p)
 {
-	return cap_task_setscheduler(p, policy, lp);
+	return cap_task_setscheduler(p);
 }
 
 static inline int security_task_getscheduler(struct task_struct *p)
@@ -2551,6 +2560,9 @@ void security_inet_csk_clone(struct sock *newsk,
 			const struct request_sock *req);
 void security_inet_conn_established(struct sock *sk,
 			struct sk_buff *skb);
+int security_secmark_relabel_packet(u32 secid);
+void security_secmark_refcount_inc(void);
+void security_secmark_refcount_dec(void);
 int security_tun_dev_create(void);
 void security_tun_dev_post_create(struct sock *sk);
 int security_tun_dev_attach(struct sock *sk);
@@ -2705,6 +2717,19 @@ static inline void security_inet_conn_established(struct sock *sk,
 {
 }
 
+static inline int security_secmark_relabel_packet(u32 secid)
+{
+	return 0;
+}
+
+static inline void security_secmark_refcount_inc(void)
+{
+}
+
+static inline void security_secmark_refcount_dec(void)
+{
+}
+
 static inline int security_tun_dev_create(void)
 {
 	return 0;
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 82e0f26a129..44f45961269 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -21,74 +21,11 @@ struct kern_ipc_perm;
 #ifdef CONFIG_SECURITY_SELINUX
 
 /**
- *     selinux_string_to_sid - map a security context string to a security ID
- *     @str: the security context string to be mapped
- *     @sid: ID value returned via this.
- *
- *     Returns 0 if successful, with the SID stored in sid.  A value
- *     of zero for sid indicates no SID could be determined (but no error
- *     occurred).
- */
-int selinux_string_to_sid(char *str, u32 *sid);
-
-/**
- *     selinux_secmark_relabel_packet_permission - secmark permission check
- *     @sid: SECMARK ID value to be applied to network packet
- *
- *     Returns 0 if the current task is allowed to set the SECMARK label of
- *     packets with the supplied security ID.  Note that it is implicit that
- *     the packet is always being relabeled from the default unlabeled value,
- *     and that the access control decision is made in the AVC.
- */
-int selinux_secmark_relabel_packet_permission(u32 sid);
-
-/**
- *     selinux_secmark_refcount_inc - increments the secmark use counter
- *
- *     SELinux keeps track of the current SECMARK targets in use so it knows
- *     when to apply SECMARK label access checks to network packets.  This
- *     function incements this reference count to indicate that a new SECMARK
- *     target has been configured.
- */
-void selinux_secmark_refcount_inc(void);
-
-/**
- *     selinux_secmark_refcount_dec - decrements the secmark use counter
- *
- *     SELinux keeps track of the current SECMARK targets in use so it knows
- *     when to apply SECMARK label access checks to network packets.  This
- *     function decements this reference count to indicate that one of the
- *     existing SECMARK targets has been removed/flushed.
- */
-void selinux_secmark_refcount_dec(void);
-
-/**
  * selinux_is_enabled - is SELinux enabled?
  */
 bool selinux_is_enabled(void);
 #else
 
-static inline int selinux_string_to_sid(const char *str, u32 *sid)
-{
-       *sid = 0;
-       return 0;
-}
-
-static inline int selinux_secmark_relabel_packet_permission(u32 sid)
-{
-	return 0;
-}
-
-static inline void selinux_secmark_refcount_inc(void)
-{
-	return;
-}
-
-static inline void selinux_secmark_refcount_dec(void)
-{
-	return;
-}
-
 static inline bool selinux_is_enabled(void)
 {
 	return false;
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 4d5d2f546db..58971e891f4 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /**
- * srcu_dereference - fetch SRCU-protected pointer with checking
+ * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ *	really are in an SRCU read-side critical section.
+ * @c: condition to check for update-side use
  *
- * Makes rcu_dereference_check() do the dirty work.
+ * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
+ * critical section will result in an RCU-lockdep splat, unless @c evaluates
+ * to 1.  The @c argument will normally be a logical expression containing
+ * lockdep_is_held() calls.
  */
-#define srcu_dereference(p, sp) \
-		rcu_dereference_check(p, srcu_read_lock_held(sp))
+#define srcu_dereference_check(p, sp, c) \
+	__rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu)
+
+/**
+ * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ *	really are in an SRCU read-side critical section.
+ *
+ * Makes rcu_dereference_check() do the dirty work.  If PROVE_RCU
+ * is enabled, invoking this outside of an RCU read-side critical
+ * section will result in an RCU-lockdep splat.
+ */
+#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0)
 
 /**
  * srcu_read_lock - register a new reader for an SRCU-protected structure.
  * @sp: srcu_struct in which to register the new reader.
  *
  * Enter an SRCU read-side critical section.  Note that SRCU read-side
- * critical sections may be nested.
+ * critical sections may be nested.  However, it is illegal to
+ * call anything that waits on an SRCU grace period for the same
+ * srcu_struct, whether directly or indirectly.  Please note that
+ * one way to indirectly wait on an SRCU grace period is to acquire
+ * a mutex that is held elsewhere while calling synchronize_srcu() or
+ * synchronize_srcu_expedited().
  */
 static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
 {
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 6b524a0d02e..1808960c505 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -126,8 +126,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
 #else	 /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
-static inline int stop_machine(int (*fn)(void *), void *data,
-			       const struct cpumask *cpus)
+static inline int __stop_machine(int (*fn)(void *), void *data,
+				 const struct cpumask *cpus)
 {
 	int ret;
 	local_irq_disable();
@@ -136,5 +136,11 @@ static inline int stop_machine(int (*fn)(void *), void *data,
 	return ret;
 }
 
+static inline int stop_machine(int (*fn)(void *), void *data,
+			       const struct cpumask *cpus)
+{
+	return __stop_machine(fn, data, cpus);
+}
+
 #endif	/* CONFIG_STOP_MACHINE && CONFIG_SMP */
 #endif	/* _LINUX_STOP_MACHINE */
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 671538d25bc..8eee9dbbfe7 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,7 +69,7 @@ struct gss_cl_ctx {
 	enum rpc_gss_proc	gc_proc;
 	u32			gc_seq;
 	spinlock_t		gc_seq_lock;
-	struct gss_ctx		*gc_gss_ctx;
+	struct gss_ctx __rcu	*gc_gss_ctx;
 	struct xdr_netobj	gc_wire_ctx;
 	u32			gc_win;
 	unsigned long		gc_expiry;
@@ -80,7 +80,7 @@ struct gss_upcall_msg;
 struct gss_cred {
 	struct rpc_cred		gc_base;
 	enum rpc_gss_svc	gc_service;
-	struct gss_cl_ctx	*gc_ctx;
+	struct gss_cl_ctx __rcu	*gc_ctx;
 	struct gss_upcall_msg	*gc_upcall;
 	unsigned long		gc_upcall_timestamp;
 	unsigned char		gc_machine_cred : 1;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 64e084ff5e5..b91a40e847d 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
 	.balance_interval	= 64,					\
 }
 
+#ifdef CONFIG_SCHED_BOOK
+#ifndef SD_BOOK_INIT
+#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
+#endif
+#endif /* CONFIG_SCHED_BOOK */
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 103d1b61aac..a4a90b6726c 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/rcupdate.h>
+#include <linux/jump_label.h>
 
 struct module;
 struct tracepoint;
@@ -145,7 +146,9 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 	extern struct tracepoint __tracepoint_##name;			\
 	static inline void trace_##name(proto)				\
 	{								\
-		if (unlikely(__tracepoint_##name.state))		\
+		JUMP_LABEL(&__tracepoint_##name.state, do_trace);	\
+		return;							\
+do_trace:								\
 			__DO_TRACE(&__tracepoint_##name,		\
 				TP_PROTO(data_proto),			\
 				TP_ARGS(data_args));			\
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index ef6c24a529e..a4dc5b027bd 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
 		return 0;
 
 	rcu_read_lock();
-	id = rcu_dereference(net_cls_subsys_id);
+	id = rcu_dereference_index_check(net_cls_subsys_id,
+					 rcu_read_lock_held());
 	if (id >= 0)
 		classid = container_of(task_subsys_state(p, id),
 				       struct cgroup_cls_state, css)->classid;
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e624dae54fa..caf17db87db 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -75,7 +75,7 @@ struct nf_conntrack_helper;
 /* nf_conn feature for connections that have a helper */
 struct nf_conn_help {
 	/* Helper. if any */
-	struct nf_conntrack_helper *helper;
+	struct nf_conntrack_helper __rcu *helper;
 
 	union nf_conntrack_help help;
 
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 0e4cfb694fe..6fa7cbab7d9 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -5,7 +5,9 @@
 #define _TRACE_IRQ_H
 
 #include <linux/tracepoint.h>
-#include <linux/interrupt.h>
+
+struct irqaction;
+struct softirq_action;
 
 #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
 #define show_softirq_name(val)				\
@@ -93,7 +95,10 @@ DECLARE_EVENT_CLASS(softirq,
 	),
 
 	TP_fast_assign(
-		__entry->vec = (int)(h - vec);
+		if (vec)
+			__entry->vec = (int)(h - vec);
+		else
+			__entry->vec = (int)(long)h;
 	),
 
 	TP_printk("vec=%d [action=%s]", __entry->vec,
@@ -136,6 +141,23 @@ DEFINE_EVENT(softirq, softirq_exit,
 	TP_ARGS(h, vec)
 );
 
+/**
+ * softirq_raise - called immediately when a softirq is raised
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the softirq vector number which is
+ * raised. @vec is NULL and it means @h includes vector number not
+ * softirq_action. When used in combination with the softirq_entry tracepoint
+ * we can determine the softirq raise latency.
+ */
+DEFINE_EVENT(softirq, softirq_raise,
+
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+	TP_ARGS(h, vec)
+);
+
 #endif /*  _TRACE_IRQ_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index 188deca2f3c..8fe1e93f531 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -6,10 +6,31 @@
 
 #include <linux/netdevice.h>
 #include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+
+#define NO_DEV "(no_device)"
+
+TRACE_EVENT(napi_poll,
 
-DECLARE_TRACE(napi_poll,
 	TP_PROTO(struct napi_struct *napi),
-	TP_ARGS(napi));
+
+	TP_ARGS(napi),
+
+	TP_STRUCT__entry(
+		__field(	struct napi_struct *,	napi)
+		__string(	dev_name, napi->dev ? napi->dev->name : NO_DEV)
+	),
+
+	TP_fast_assign(
+		__entry->napi = napi;
+		__assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
+	),
+
+	TP_printk("napi poll on napi struct %p for device %s",
+		__entry->napi, __get_str(dev_name))
+);
+
+#undef NO_DEV
 
 #endif /* _TRACE_NAPI_H_ */
 
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
new file mode 100644
index 00000000000..5f247f5ffc5
--- /dev/null
+++ b/include/trace/events/net.h
@@ -0,0 +1,82 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM net
+
+#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(net_dev_xmit,
+
+	TP_PROTO(struct sk_buff *skb,
+		 int rc),
+
+	TP_ARGS(skb, rc),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__field(	int,		rc		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__entry->rc = rc;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+		__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
+);
+
+DECLARE_EVENT_CLASS(net_dev_template,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u",
+		__get_str(name), __entry->skbaddr, __entry->len)
+)
+
+DEFINE_EVENT(net_dev_template, net_dev_queue,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_template, netif_receive_skb,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_template, netif_rx,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+#endif /* _TRACE_NET_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 35a2a6e7bf1..286784d69b8 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -10,12 +10,17 @@
 #ifndef _TRACE_POWER_ENUM_
 #define _TRACE_POWER_ENUM_
 enum {
-	POWER_NONE = 0,
-	POWER_CSTATE = 1,
-	POWER_PSTATE = 2,
+	POWER_NONE	= 0,
+	POWER_CSTATE	= 1,	/* C-State */
+	POWER_PSTATE	= 2,	/* Fequency change or DVFS */
+	POWER_SSTATE	= 3,	/* Suspend */
 };
 #endif
 
+/*
+ * The power events are used for cpuidle & suspend (power_start, power_end)
+ *  and for cpufreq (power_frequency)
+ */
 DECLARE_EVENT_CLASS(power,
 
 	TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
@@ -70,6 +75,85 @@ TRACE_EVENT(power_end,
 
 );
 
+/*
+ * The clock events are used for clock enable/disable and for
+ *  clock rate change
+ */
+DECLARE_EVENT_CLASS(clock,
+
+	TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+	TP_ARGS(name, state, cpu_id),
+
+	TP_STRUCT__entry(
+		__string(       name,           name            )
+		__field(        u64,            state           )
+		__field(        u64,            cpu_id          )
+	),
+
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->state = state;
+		__entry->cpu_id = cpu_id;
+	),
+
+	TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
+		(unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_enable,
+
+	TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+	TP_ARGS(name, state, cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_disable,
+
+	TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+	TP_ARGS(name, state, cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_set_rate,
+
+	TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+	TP_ARGS(name, state, cpu_id)
+);
+
+/*
+ * The power domain events are used for power domains transitions
+ */
+DECLARE_EVENT_CLASS(power_domain,
+
+	TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+	TP_ARGS(name, state, cpu_id),
+
+	TP_STRUCT__entry(
+		__string(       name,           name            )
+		__field(        u64,            state           )
+		__field(        u64,            cpu_id          )
+	),
+
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->state = state;
+		__entry->cpu_id = cpu_id;
+),
+
+	TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
+		(unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(power_domain, power_domain_target,
+
+	TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+	TP_ARGS(name, state, cpu_id)
+);
+
 #endif /* _TRACE_POWER_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9208c92aeab..f6334782a59 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
 			(unsigned long long)__entry->vruntime)
 );
 
+/*
+ * Tracepoint for showing priority inheritance modifying a tasks
+ * priority.
+ */
+TRACE_EVENT(sched_pi_setprio,
+
+	TP_PROTO(struct task_struct *tsk, int newprio),
+
+	TP_ARGS(tsk, newprio),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( int,	oldprio			)
+		__field( int,	newprio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid		= tsk->pid;
+		__entry->oldprio	= tsk->prio;
+		__entry->newprio	= newprio;
+	),
+
+	TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
+			__entry->comm, __entry->pid,
+			__entry->oldprio, __entry->newprio)
+);
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 4b2be6dc76f..75ce9d500d8 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -35,6 +35,23 @@ TRACE_EVENT(kfree_skb,
 		__entry->skbaddr, __entry->protocol, __entry->location)
 );
 
+TRACE_EVENT(consume_skb,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,	skbaddr	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+	),
+
+	TP_printk("skbaddr=%p", __entry->skbaddr)
+);
+
 TRACE_EVENT(skb_copy_datagram_iovec,
 
 	TP_PROTO(const struct sk_buff *skb, int len),
diff --git a/init/Kconfig b/init/Kconfig
index 2de5b1cbadd..7b920aafa98 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -21,6 +21,13 @@ config CONSTRUCTORS
 	depends on !UML
 	default y
 
+config HAVE_IRQ_WORK
+	bool
+
+config IRQ_WORK
+	bool
+	depends on HAVE_IRQ_WORK
+
 menu "General setup"
 
 config EXPERIMENTAL
@@ -340,6 +347,7 @@ choice
 
 config TREE_RCU
 	bool "Tree-based hierarchical RCU"
+	depends on !PREEMPT && SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP system with hundreds or
@@ -347,7 +355,7 @@ config TREE_RCU
 	  smaller systems.
 
 config TREE_PREEMPT_RCU
-	bool "Preemptable tree-based hierarchical RCU"
+	bool "Preemptible tree-based hierarchical RCU"
 	depends on PREEMPT
 	help
 	  This option selects the RCU implementation that is
@@ -365,8 +373,22 @@ config TINY_RCU
 	  is not required.  This option greatly reduces the
 	  memory footprint of RCU.
 
+config TINY_PREEMPT_RCU
+	bool "Preemptible UP-only small-memory-footprint RCU"
+	depends on !SMP && PREEMPT
+	help
+	  This option selects the RCU implementation that is designed
+	  for real-time UP systems.  This option greatly reduces the
+	  memory footprint of RCU.
+
 endchoice
 
+config PREEMPT_RCU
+	def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
+	help
+	  This option enables preemptible-RCU code that is common between
+	  the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU"
 	depends on TREE_RCU || TREE_PREEMPT_RCU
@@ -387,9 +409,12 @@ config RCU_FANOUT
 	help
 	  This option controls the fanout of hierarchical implementations
 	  of RCU, allowing RCU to work efficiently on machines with
-	  large numbers of CPUs.  This value must be at least the cube
-	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
-	  systems and up to 262,144 for 64-bit systems.
+	  large numbers of CPUs.  This value must be at least the fourth
+	  root of NR_CPUS, which allows NR_CPUS to be insanely large.
+	  The default value of RCU_FANOUT should be used for production
+	  systems, but if you are stress-testing the RCU implementation
+	  itself, small RCU_FANOUT values allow you to test large-system
+	  code paths on small(er) systems.
 
 	  Select a specific number if testing RCU itself.
 	  Take the default if unsure.
@@ -987,6 +1012,7 @@ config PERF_EVENTS
 	default y if (PROFILING || PERF_COUNTERS)
 	depends on HAVE_PERF_EVENTS
 	select ANON_INODES
+	select IRQ_WORK
 	help
 	  Enable kernel support for various performance events provided
 	  by software and hardware.
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be..e2c9d52cfe9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o range.o
+	    async.o range.o jump_label.o
 obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -86,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
 obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -100,6 +102,7 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f614..291ba3d04be 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
 	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
 	 * css_tryget() should be used for avoiding race.
 	 */
-	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state __rcu *css;
 	/*
 	 * ID of this css.
 	 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe..51b143e2a07 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	if (tsk->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
-	ret = security_task_setscheduler(tsk, 0, NULL);
+	ret = security_task_setscheduler(tsk);
 	if (ret)
 		return ret;
 	if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 
 		rcu_read_lock();
 		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			ret = security_task_setscheduler(c, 0, NULL);
+			ret = security_task_setscheduler(c);
 			if (ret) {
 				rcu_read_unlock();
 				return ret;
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db2..e2bdf37f9fd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-#ifdef CONFIG_PERF_EVENTS
-	WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+	perf_event_delayed_put(tsk);
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac..53ead174da2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
 			" disables this message.\n");
 	sched_show_task(t);
-	__debug_show_held_locks(t);
+	debug_show_held_locks(t);
 
 	touch_nmi_watchdog();
 
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
  * periodically exit the critical section and enter a new one.
  *
  * For preemptible RCU it is sufficient to call rcu_read_unlock in order
- * exit the grace period. For classic RCU, a reschedule is required.
+ * to exit the grace period. For classic RCU, a reschedule is required.
  */
 static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
 {
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c7c2aed9e2d..2c9120f0afc 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
 {
-	struct perf_event_context *ctx = bp->ctx;
+	struct task_struct *tsk = bp->hw.bp_target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->ctx == ctx && find_slot_idx(iter) == type)
+		if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
 			count += hw_breakpoint_weight(iter);
 	}
 
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		    enum bp_type_idx type)
 {
 	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->ctx->task;
+	struct task_struct *tsk = bp->hw.bp_target;
 
 	if (cpu >= 0) {
 		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	       int weight)
 {
 	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->ctx->task;
+	struct task_struct *tsk = bp->hw.bp_target;
 
 	/* Pinned counter cpu profiling */
 	if (!tsk) {
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
 			    struct task_struct *tsk)
 {
-	return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
-						triggered);
+	return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+		bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
 
 		*pevent = bp;
 
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
 	.priority = 0x7fffffff
 };
 
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+	release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+	int err;
+
+	if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+		return -ENOENT;
+
+	err = register_perf_hw_breakpoint(bp);
+	if (err)
+		return err;
+
+	bp->destroy = bp_perf_event_destroy;
+
+	return 0;
+}
+
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+	if (!(flags & PERF_EF_START))
+		bp->hw.state = PERF_HES_STOPPED;
+
+	return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+	arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+	bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+	bp->hw.state = PERF_HES_STOPPED;
+}
+
+static struct pmu perf_breakpoint = {
+	.task_ctx_nr	= perf_sw_context, /* could eventually get its own */
+
+	.event_init	= hw_breakpoint_event_init,
+	.add		= hw_breakpoint_add,
+	.del		= hw_breakpoint_del,
+	.start		= hw_breakpoint_start,
+	.stop		= hw_breakpoint_stop,
+	.read		= hw_breakpoint_pmu_read,
+};
+
 static int __init init_hw_breakpoint(void)
 {
 	unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
 
 	constraints_initialized = 1;
 
+	perf_pmu_register(&perf_breakpoint);
+
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 
  err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
 core_initcall(init_hw_breakpoint);
 
 
-struct pmu perf_ops_bp = {
-	.enable		= arch_install_hw_breakpoint,
-	.disable	= arch_uninstall_hw_breakpoint,
-	.read		= hw_breakpoint_pmu_read,
-};
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 00000000000..f16763ff848
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+
+/*
+ * An entry can be in one of four states:
+ *
+ * free	     NULL, 0 -> {claimed}       : free to be used
+ * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
+ * pending   next, 3 -> {busy}          : queued, pending callback
+ * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ *
+ * We use the lower two bits of the next pointer to keep PENDING and BUSY
+ * flags.
+ */
+
+#define IRQ_WORK_PENDING	1UL
+#define IRQ_WORK_BUSY		2UL
+#define IRQ_WORK_FLAGS		3UL
+
+static inline bool irq_work_is_set(struct irq_work *entry, int flags)
+{
+	return (unsigned long)entry->next & flags;
+}
+
+static inline struct irq_work *irq_work_next(struct irq_work *entry)
+{
+	unsigned long next = (unsigned long)entry->next;
+	next &= ~IRQ_WORK_FLAGS;
+	return (struct irq_work *)next;
+}
+
+static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
+{
+	unsigned long next = (unsigned long)entry;
+	next |= flags;
+	return (struct irq_work *)next;
+}
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+
+/*
+ * Claim the entry so that no one else will poke at it.
+ */
+static bool irq_work_claim(struct irq_work *entry)
+{
+	struct irq_work *next, *nflags;
+
+	do {
+		next = entry->next;
+		if ((unsigned long)next & IRQ_WORK_PENDING)
+			return false;
+		nflags = next_flags(next, IRQ_WORK_FLAGS);
+	} while (cmpxchg(&entry->next, next, nflags) != next);
+
+	return true;
+}
+
+
+void __weak arch_irq_work_raise(void)
+{
+	/*
+	 * Lame architectures will get the timer tick callback
+	 */
+}
+
+/*
+ * Queue the entry and raise the IPI if needed.
+ */
+static void __irq_work_queue(struct irq_work *entry)
+{
+	struct irq_work **head, *next;
+
+	head = &get_cpu_var(irq_work_list);
+
+	do {
+		next = *head;
+		/* Can assign non-atomic because we keep the flags set. */
+		entry->next = next_flags(next, IRQ_WORK_FLAGS);
+	} while (cmpxchg(head, next, entry) != next);
+
+	/* The list was empty, raise self-interrupt to start processing. */
+	if (!irq_work_next(entry))
+		arch_irq_work_raise();
+
+	put_cpu_var(irq_work_list);
+}
+
+/*
+ * Enqueue the irq_work @entry, returns true on success, failure when the
+ * @entry was already enqueued by someone else.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue(struct irq_work *entry)
+{
+	if (!irq_work_claim(entry)) {
+		/*
+		 * Already enqueued, can't do!
+		 */
+		return false;
+	}
+
+	__irq_work_queue(entry);
+	return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue);
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+	struct irq_work *list, **head;
+
+	head = &__get_cpu_var(irq_work_list);
+	if (*head == NULL)
+		return;
+
+	BUG_ON(!in_irq());
+	BUG_ON(!irqs_disabled());
+
+	list = xchg(head, NULL);
+	while (list != NULL) {
+		struct irq_work *entry = list;
+
+		list = irq_work_next(list);
+
+		/*
+		 * Clear the PENDING bit, after this point the @entry
+		 * can be re-used.
+		 */
+		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
+		entry->func(entry);
+		/*
+		 * Clear the BUSY bit and return to the free state if
+		 * no-one else claimed it meanwhile.
+		 */
+		cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
+	}
+}
+EXPORT_SYMBOL_GPL(irq_work_run);
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *entry)
+{
+	WARN_ON_ONCE(irqs_disabled());
+
+	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+		cpu_relax();
+}
+EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 00000000000..7be868bf25c
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
+/*
+ * jump label support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/err.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+#define JUMP_LABEL_HASH_BITS 6
+#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
+static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
+
+/* mutex to protect coming/going of the the jump_label table */
+static DEFINE_MUTEX(jump_label_mutex);
+
+struct jump_label_entry {
+	struct hlist_node hlist;
+	struct jump_entry *table;
+	int nr_entries;
+	/* hang modules off here */
+	struct hlist_head modules;
+	unsigned long key;
+};
+
+struct jump_label_module_entry {
+	struct hlist_node hlist;
+	struct jump_entry *table;
+	int nr_entries;
+	struct module *mod;
+};
+
+static int jump_label_cmp(const void *a, const void *b)
+{
+	const struct jump_entry *jea = a;
+	const struct jump_entry *jeb = b;
+
+	if (jea->key < jeb->key)
+		return -1;
+
+	if (jea->key > jeb->key)
+		return 1;
+
+	return 0;
+}
+
+static void
+sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
+{
+	unsigned long size;
+
+	size = (((unsigned long)stop - (unsigned long)start)
+					/ sizeof(struct jump_entry));
+	sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
+}
+
+static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct jump_label_entry *e;
+	u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+
+	head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (key == e->key)
+			return e;
+	}
+	return NULL;
+}
+
+static struct jump_label_entry *
+add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
+{
+	struct hlist_head *head;
+	struct jump_label_entry *e;
+	u32 hash;
+
+	e = get_jump_label_entry(key);
+	if (e)
+		return ERR_PTR(-EEXIST);
+
+	e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+
+	hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+	head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+	e->key = key;
+	e->table = table;
+	e->nr_entries = nr_entries;
+	INIT_HLIST_HEAD(&(e->modules));
+	hlist_add_head(&e->hlist, head);
+	return e;
+}
+
+static int
+build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
+{
+	struct jump_entry *iter, *iter_begin;
+	struct jump_label_entry *entry;
+	int count;
+
+	sort_jump_label_entries(start, stop);
+	iter = start;
+	while (iter < stop) {
+		entry = get_jump_label_entry(iter->key);
+		if (!entry) {
+			iter_begin = iter;
+			count = 0;
+			while ((iter < stop) &&
+				(iter->key == iter_begin->key)) {
+				iter++;
+				count++;
+			}
+			entry = add_jump_label_entry(iter_begin->key,
+							count, iter_begin);
+			if (IS_ERR(entry))
+				return PTR_ERR(entry);
+		 } else {
+			WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/***
+ * jump_label_update - update jump label text
+ * @key -  key value associated with a a jump label
+ * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
+ *
+ * Will enable/disable the jump for jump label @key, depending on the
+ * value of @type.
+ *
+ */
+
+void jump_label_update(unsigned long key, enum jump_label_type type)
+{
+	struct jump_entry *iter;
+	struct jump_label_entry *entry;
+	struct hlist_node *module_node;
+	struct jump_label_module_entry *e_module;
+	int count;
+
+	mutex_lock(&jump_label_mutex);
+	entry = get_jump_label_entry((jump_label_t)key);
+	if (entry) {
+		count = entry->nr_entries;
+		iter = entry->table;
+		while (count--) {
+			if (kernel_text_address(iter->code))
+				arch_jump_label_transform(iter, type);
+			iter++;
+		}
+		/* eanble/disable jump labels in modules */
+		hlist_for_each_entry(e_module, module_node, &(entry->modules),
+							hlist) {
+			count = e_module->nr_entries;
+			iter = e_module->table;
+			while (count--) {
+				if (kernel_text_address(iter->code))
+					arch_jump_label_transform(iter, type);
+				iter++;
+			}
+		}
+	}
+	mutex_unlock(&jump_label_mutex);
+}
+
+static int addr_conflict(struct jump_entry *entry, void *start, void *end)
+{
+	if (entry->code <= (unsigned long)end &&
+		entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+		return 1;
+
+	return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int module_conflict(void *start, void *end)
+{
+	struct hlist_head *head;
+	struct hlist_node *node, *node_next, *module_node, *module_node_next;
+	struct jump_label_entry *e;
+	struct jump_label_module_entry *e_module;
+	struct jump_entry *iter;
+	int i, count;
+	int conflict = 0;
+
+	for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+		head = &jump_label_table[i];
+		hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+			hlist_for_each_entry_safe(e_module, module_node,
+							module_node_next,
+							&(e->modules), hlist) {
+				count = e_module->nr_entries;
+				iter = e_module->table;
+				while (count--) {
+					if (addr_conflict(iter, start, end)) {
+						conflict = 1;
+						goto out;
+					}
+					iter++;
+				}
+			}
+		}
+	}
+out:
+	return conflict;
+}
+
+#endif
+
+/***
+ * jump_label_text_reserved - check if addr range is reserved
+ * @start: start text addr
+ * @end: end text addr
+ *
+ * checks if the text addr located between @start and @end
+ * overlaps with any of the jump label patch addresses. Code
+ * that wants to modify kernel text should first verify that
+ * it does not overlap with any of the jump label addresses.
+ *
+ * returns 1 if there is an overlap, 0 otherwise
+ */
+int jump_label_text_reserved(void *start, void *end)
+{
+	struct jump_entry *iter;
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __start___jump_table;
+	int conflict = 0;
+
+	mutex_lock(&jump_label_mutex);
+	iter = iter_start;
+	while (iter < iter_stop) {
+		if (addr_conflict(iter, start, end)) {
+			conflict = 1;
+			goto out;
+		}
+		iter++;
+	}
+
+	/* now check modules */
+#ifdef CONFIG_MODULES
+	conflict = module_conflict(start, end);
+#endif
+out:
+	mutex_unlock(&jump_label_mutex);
+	return conflict;
+}
+
+static __init int init_jump_label(void)
+{
+	int ret;
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __stop___jump_table;
+	struct jump_entry *iter;
+
+	mutex_lock(&jump_label_mutex);
+	ret = build_jump_label_hashtable(__start___jump_table,
+					 __stop___jump_table);
+	iter = iter_start;
+	while (iter < iter_stop) {
+		arch_jump_label_text_poke_early(iter->code);
+		iter++;
+	}
+	mutex_unlock(&jump_label_mutex);
+	return ret;
+}
+early_initcall(init_jump_label);
+
+#ifdef CONFIG_MODULES
+
+static struct jump_label_module_entry *
+add_jump_label_module_entry(struct jump_label_entry *entry,
+			    struct jump_entry *iter_begin,
+			    int count, struct module *mod)
+{
+	struct jump_label_module_entry *e;
+
+	e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+	e->mod = mod;
+	e->nr_entries = count;
+	e->table = iter_begin;
+	hlist_add_head(&e->hlist, &entry->modules);
+	return e;
+}
+
+static int add_jump_label_module(struct module *mod)
+{
+	struct jump_entry *iter, *iter_begin;
+	struct jump_label_entry *entry;
+	struct jump_label_module_entry *module_entry;
+	int count;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return 0;
+
+	sort_jump_label_entries(mod->jump_entries,
+				mod->jump_entries + mod->num_jump_entries);
+	iter = mod->jump_entries;
+	while (iter < mod->jump_entries + mod->num_jump_entries) {
+		entry = get_jump_label_entry(iter->key);
+		iter_begin = iter;
+		count = 0;
+		while ((iter < mod->jump_entries + mod->num_jump_entries) &&
+			(iter->key == iter_begin->key)) {
+				iter++;
+				count++;
+		}
+		if (!entry) {
+			entry = add_jump_label_entry(iter_begin->key, 0, NULL);
+			if (IS_ERR(entry))
+				return PTR_ERR(entry);
+		}
+		module_entry = add_jump_label_module_entry(entry, iter_begin,
+							   count, mod);
+		if (IS_ERR(module_entry))
+			return PTR_ERR(module_entry);
+	}
+	return 0;
+}
+
+static void remove_jump_label_module(struct module *mod)
+{
+	struct hlist_head *head;
+	struct hlist_node *node, *node_next, *module_node, *module_node_next;
+	struct jump_label_entry *e;
+	struct jump_label_module_entry *e_module;
+	int i;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return;
+
+	for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+		head = &jump_label_table[i];
+		hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+			hlist_for_each_entry_safe(e_module, module_node,
+						  module_node_next,
+						  &(e->modules), hlist) {
+				if (e_module->mod == mod) {
+					hlist_del(&e_module->hlist);
+					kfree(e_module);
+				}
+			}
+			if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
+				hlist_del(&e->hlist);
+				kfree(e);
+			}
+		}
+	}
+}
+
+static int
+jump_label_module_notify(struct notifier_block *self, unsigned long val,
+			 void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		mutex_lock(&jump_label_mutex);
+		ret = add_jump_label_module(mod);
+		if (ret)
+			remove_jump_label_module(mod);
+		mutex_unlock(&jump_label_mutex);
+		break;
+	case MODULE_STATE_GOING:
+		mutex_lock(&jump_label_mutex);
+		remove_jump_label_module(mod);
+		mutex_unlock(&jump_label_mutex);
+		break;
+	}
+	return ret;
+}
+
+/***
+ * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
+ * @mod: module to patch
+ *
+ * Allow for run-time selection of the optimal nops. Before the module
+ * loads patch these with arch_get_jump_label_nop(), which is specified by
+ * the arch specific jump label code.
+ */
+void jump_label_apply_nops(struct module *mod)
+{
+	struct jump_entry *iter;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return;
+
+	iter = mod->jump_entries;
+	while (iter < mod->jump_entries + mod->num_jump_entries) {
+		arch_jump_label_text_poke_early(iter->code);
+		iter++;
+	}
+}
+
+struct notifier_block jump_label_module_nb = {
+	.notifier_call = jump_label_module_notify,
+	.priority = 0,
+};
+
+static __init int init_jump_label_module(void)
+{
+	return register_module_notifier(&jump_label_module_nb);
+}
+early_initcall(init_jump_label_module);
+
+#endif /* CONFIG_MODULES */
+
+#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae9..ec4210c6501 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
 #include <linux/memory.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/jump_label.h>
 
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p)
  * Return an optimized kprobe whose optimizing code replaces
  * instructions including addr (exclude breakpoint).
  */
-struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
 {
 	int i;
 	struct kprobe *p = NULL;
@@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 
 void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 			 struct hlist_head **head, unsigned long *flags)
+__acquires(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
 	spinlock_t *hlist_lock;
@@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 
 static void __kprobes kretprobe_table_lock(unsigned long hash,
 	unsigned long *flags)
+__acquires(hlist_lock)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
 
 void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
 	unsigned long *flags)
+__releases(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
 	spinlock_t *hlist_lock;
@@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
 	spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
-void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+static void __kprobes kretprobe_table_unlock(unsigned long hash,
+       unsigned long *flags)
+__releases(hlist_lock)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p)
 	preempt_disable();
 	if (!kernel_text_address((unsigned long) p->addr) ||
 	    in_kprobes_functions((unsigned long) p->addr) ||
-	    ftrace_text_reserved(p->addr, p->addr)) {
+	    ftrace_text_reserved(p->addr, p->addr) ||
+	    jump_label_text_reserved(p->addr, p->addr)) {
 		preempt_enable();
 		return -EINVAL;
 	}
@@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
 	if (num <= 0)
 		return -EINVAL;
 	for (i = 0; i < num; i++) {
-		unsigned long addr;
+		unsigned long addr, offset;
 		jp = jps[i];
 		addr = arch_deref_entry_point(jp->entry);
 
-		if (!kernel_text_address(addr))
-			ret = -EINVAL;
-		else {
-			/* Todo: Verify probepoint is a function entry point */
+		/* Verify probepoint is a function entry point */
+		if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
+		    offset == 0) {
 			jp->kp.pre_handler = setjmp_pre_handler;
 			jp->kp.break_handler = longjmp_break_handler;
 			ret = register_kprobe(&jp->kp);
-		}
+		} else
+			ret = -EINVAL;
+
 		if (ret < 0) {
 			if (i > 0)
 				unregister_jprobes(jps, i);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a51023..42ba65dff7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	}
 #endif
 
+	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+		debug_locks_off();
+		printk(KERN_ERR
+			"BUG: looking up invalid subclass: %u\n", subclass);
+		printk(KERN_ERR
+			"turning off the locking correctness validator.\n");
+		dump_stack();
+		return NULL;
+	}
+
 	/*
 	 * Static locks do not have their class-keys yet - for them the key
 	 * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
 	raw_local_irq_restore(flags);
 
 	if (!subclass || force)
-		lock->class_cache = class;
+		lock->class_cache[0] = class;
+	else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+		lock->class_cache[subclass] = class;
 
 	if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
 		return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		      struct lock_class_key *key, int subclass)
 {
-	lock->class_cache = NULL;
+	int i;
+
+	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+		lock->class_cache[i] = NULL;
+
 #ifdef CONFIG_LOCK_STAT
 	lock->cpu = raw_smp_processor_id();
 #endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return 0;
 
-	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
-		debug_locks_off();
-		printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
-		printk("turning off the locking correctness validator.\n");
-		dump_stack();
-		return 0;
-	}
-
 	if (lock->key == &__lockdep_no_validate__)
 		check = 1;
 
-	if (!subclass)
-		class = lock->class_cache;
+	if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+		class = lock->class_cache[subclass];
 	/*
-	 * Not cached yet or subclass?
+	 * Not cached?
 	 */
 	if (unlikely(!class)) {
 		class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 		return 1;
 
 	if (hlock->references) {
-		struct lock_class *class = lock->class_cache;
+		struct lock_class *class = lock->class_cache[0];
 
 		if (!class)
 			class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 		if (list_empty(head))
 			continue;
 		list_for_each_entry_safe(class, next, head, hash_entry) {
-			if (unlikely(class == lock->class_cache)) {
+			int match = 0;
+
+			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+				match |= class == lock->class_cache[j];
+
+			if (unlikely(match)) {
 				if (debug_locks_off_graph_unlock())
 					WARN_ON(1);
 				goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
  * Careful: only use this function if you are sure that
  * the task cannot run in parallel!
  */
-void __debug_show_held_locks(struct task_struct *task)
+void debug_show_held_locks(struct task_struct *task)
 {
 	if (unlikely(!debug_locks)) {
 		printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
 	}
 	lockdep_print_held_locks(task);
 }
-EXPORT_SYMBOL_GPL(__debug_show_held_locks);
-
-void debug_show_held_locks(struct task_struct *task)
-{
-		__debug_show_held_locks(task);
-}
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 
 void lockdep_sys_exit(void)
diff --git a/kernel/module.c b/kernel/module.c
index ccd64199184..2df46301a7a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
 #include <linux/async.h>
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
+#include <linux/jump_label.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
@@ -2309,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 					sizeof(*mod->tracepoints),
 					&mod->num_tracepoints);
 #endif
+#ifdef HAVE_JUMP_LABEL
+	mod->jump_entries = section_objs(info, "__jump_table",
+					sizeof(*mod->jump_entries),
+					&mod->num_jump_entries);
+#endif
 #ifdef CONFIG_EVENT_TRACING
 	mod->trace_events = section_objs(info, "_ftrace_events",
 					 sizeof(*mod->trace_events),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b98bed3d818..f309e8014c7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
-/*
- * Each CPU has a list of per CPU events:
- */
-static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
-int perf_max_events __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
-static atomic_t nr_events __read_mostly;
+atomic_t perf_task_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
 /*
  * perf event paranoia level:
  *  -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_event_id;
 
-/*
- * Lock for (sysadmin-configurable) event reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
+void __weak perf_event_print_debug(void)	{ }
 
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+extern __weak const char *perf_pmu_name(void)
 {
-	return NULL;
+	return "pmu";
 }
 
-void __weak hw_perf_disable(void)		{ barrier(); }
-void __weak hw_perf_enable(void)		{ barrier(); }
-
-void __weak perf_event_print_debug(void)	{ }
-
-static DEFINE_PER_CPU(int, perf_disable_count);
+void perf_pmu_disable(struct pmu *pmu)
+{
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!(*count)++)
+		pmu->pmu_disable(pmu);
+}
 
-void perf_disable(void)
+void perf_pmu_enable(struct pmu *pmu)
 {
-	if (!__get_cpu_var(perf_disable_count)++)
-		hw_perf_disable();
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!--(*count))
+		pmu->pmu_enable(pmu);
 }
 
-void perf_enable(void)
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_pmu_rotate_start(struct pmu *pmu)
 {
-	if (!--__get_cpu_var(perf_disable_count))
-		hw_perf_enable();
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+	struct list_head *head = &__get_cpu_var(rotation_list);
+
+	WARN_ON(!irqs_disabled());
+
+	if (list_empty(&cpuctx->rotation_list))
+		list_add(&cpuctx->rotation_list, head);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
  * the context could get moved to another task.
  */
 static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
 	struct perf_event_context *ctx;
 
 	rcu_read_lock();
- retry:
-	ctx = rcu_dereference(task->perf_event_ctxp);
+retry:
+	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 	if (ctx) {
 		/*
 		 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 		 * can't get swapped on us any more.
 		 */
 		raw_spin_lock_irqsave(&ctx->lock, *flags);
-		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 			goto retry;
 		}
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
  * can't get swapped to another task.  This also increments its
  * reference count so that the context can't get freed.
  */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
 {
 	struct perf_event_context *ctx;
 	unsigned long flags;
 
-	ctx = perf_lock_task_context(task, &flags);
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	}
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
+	if (!ctx->nr_events)
+		perf_pmu_rotate_start(ctx->pmu);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
 {
 	struct perf_event *group_leader = event->group_leader;
 
-	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+	/*
+	 * We can have double attach due to group movement in perf_event_open.
+	 */
+	if (event->attach_state & PERF_ATTACH_GROUP)
+		return;
+
 	event->attach_state |= PERF_ATTACH_GROUP;
 
 	if (group_leader == event)
@@ -408,8 +417,8 @@ event_filter_match(struct perf_event *event)
 	return event->cpu == -1 || event->cpu == smp_processor_id();
 }
 
-static void
-event_sched_out(struct perf_event *event,
+static int
+__event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
@@ -428,15 +437,14 @@ event_sched_out(struct perf_event *event,
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
-		return;
+		return 0;
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
-	event->pmu->disable(event);
+	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
 	if (!is_software_event(event))
@@ -444,6 +452,19 @@ event_sched_out(struct perf_event *event,
 	ctx->nr_active--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
+	return 1;
+}
+
+static void
+event_sched_out(struct perf_event *event,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_event_context *ctx)
+{
+	int ret;
+
+	ret = __event_sched_out(event, cpuctx, ctx);
+	if (ret)
+		event->tstamp_stopped = ctx->time;
 }
 
 static void
@@ -466,6 +487,12 @@ group_sched_out(struct perf_event *group_event,
 		cpuctx->exclusive = 0;
 }
 
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -474,9 +501,9 @@ group_sched_out(struct perf_event *group_event,
  */
 static void __perf_event_remove_from_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -487,27 +514,11 @@ static void __perf_event_remove_from_context(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level.
-	 */
-	perf_disable();
 
 	event_sched_out(event, cpuctx, ctx);
 
 	list_del_event(event, ctx);
 
-	if (!ctx->task) {
-		/*
-		 * Allow more per task events with respect to the
-		 * reservation:
-		 */
-		cpuctx->max_pertask =
-			min(perf_max_events - ctx->nr_events,
-			    perf_max_events - perf_reserved_percpu);
-	}
-
-	perf_enable();
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -572,8 +583,8 @@ retry:
 static void __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a per-task event, need to check whether this
@@ -628,7 +639,7 @@ void perf_event_disable(struct perf_event *event)
 		return;
 	}
 
- retry:
+retry:
 	task_oncpu_function_call(task, __perf_event_disable, event);
 
 	raw_spin_lock_irq(&ctx->lock);
@@ -653,7 +664,7 @@ void perf_event_disable(struct perf_event *event)
 }
 
 static int
-event_sched_in(struct perf_event *event,
+__event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
@@ -667,14 +678,12 @@ event_sched_in(struct perf_event *event,
 	 */
 	smp_wmb();
 
-	if (event->pmu->enable(event)) {
+	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -685,28 +694,56 @@ event_sched_in(struct perf_event *event,
 	return 0;
 }
 
+static inline int
+event_sched_in(struct perf_event *event,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_event_context *ctx)
+{
+	int ret = __event_sched_in(event, cpuctx, ctx);
+	if (ret)
+		return ret;
+	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	return 0;
+}
+
+static void
+group_commit_event_sched_in(struct perf_event *group_event,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+	u64 now = ctx->time;
+
+	group_event->tstamp_running += now - group_event->tstamp_stopped;
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+		event->tstamp_running += now - event->tstamp_stopped;
+	}
+}
+
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	const struct pmu *pmu = group_event->pmu;
-	bool txn = false;
+	struct pmu *pmu = group_event->pmu;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	/* Check if group transaction availabe */
-	if (pmu->start_txn)
-		txn = true;
+	pmu->start_txn(pmu);
 
-	if (txn)
-		pmu->start_txn(pmu);
-
-	if (event_sched_in(group_event, cpuctx, ctx)) {
-		if (txn)
-			pmu->cancel_txn(pmu);
+	/*
+	 * use __event_sched_in() to delay updating tstamp_running
+	 * until the transaction is committed. In case of failure
+	 * we will keep an unmodified tstamp_running which is a
+	 * requirement to get correct timing information
+	 */
+	if (__event_sched_in(group_event, cpuctx, ctx)) {
+		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
 
@@ -714,29 +751,33 @@ group_sched_in(struct perf_event *group_event,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx)) {
+		if (__event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
 	}
 
-	if (!txn || !pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		/* commit tstamp_running */
+		group_commit_event_sched_in(group_event, cpuctx, ctx);
 		return 0;
-
+	}
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
+	 *
+	 * use __event_sched_out() to avoid updating tstamp_stopped
+	 * because the event never actually ran
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
 			break;
-		event_sched_out(event, cpuctx, ctx);
+		__event_sched_out(event, cpuctx, ctx);
 	}
-	event_sched_out(group_event, cpuctx, ctx);
+	__event_sched_out(group_event, cpuctx, ctx);
 
-	if (txn)
-		pmu->cancel_txn(pmu);
+	pmu->cancel_txn(pmu);
 
 	return -EAGAIN;
 }
@@ -789,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event,
  */
 static void __perf_install_in_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
 	/*
@@ -812,12 +853,6 @@ static void __perf_install_in_context(void *info)
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level. NOP for non NMI based events.
-	 */
-	perf_disable();
-
 	add_event_to_ctx(event, ctx);
 
 	if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -855,12 +890,7 @@ static void __perf_install_in_context(void *info)
 		}
 	}
 
-	if (!err && !ctx->task && cpuctx->max_pertask)
-		cpuctx->max_pertask--;
-
- unlock:
-	perf_enable();
-
+unlock:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -883,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
+	event->ctx = ctx;
+
 	if (!task) {
 		/*
 		 * Per cpu events are installed via an smp call and
@@ -931,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	event->tstamp_enabled = ctx->time - event->total_time_enabled;
-	list_for_each_entry(sub, &event->sibling_list, group_entry)
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 			sub->tstamp_enabled =
 				ctx->time - sub->total_time_enabled;
+		}
+	}
 }
 
 /*
@@ -943,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 static void __perf_event_enable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
 	/*
@@ -979,12 +1013,10 @@ static void __perf_event_enable(void *info)
 	if (!group_can_go_on(event, cpuctx, 1)) {
 		err = -EEXIST;
 	} else {
-		perf_disable();
 		if (event == leader)
 			err = group_sched_in(event, cpuctx, ctx);
 		else
 			err = event_sched_in(event, cpuctx, ctx);
-		perf_enable();
 	}
 
 	if (err) {
@@ -1000,7 +1032,7 @@ static void __perf_event_enable(void *info)
 		}
 	}
 
- unlock:
+unlock:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1041,7 +1073,7 @@ void perf_event_enable(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_ERROR)
 		event->state = PERF_EVENT_STATE_OFF;
 
- retry:
+retry:
 	raw_spin_unlock_irq(&ctx->lock);
 	task_oncpu_function_call(task, __perf_event_enable, event);
 
@@ -1061,7 +1093,7 @@ void perf_event_enable(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_OFF)
 		__perf_event_mark_enabled(event, ctx);
 
- out:
+out:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -1092,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	struct perf_event *event;
 
 	raw_spin_lock(&ctx->lock);
+	perf_pmu_disable(ctx->pmu);
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
 
-	perf_disable();
 	if (!ctx->nr_active)
-		goto out_enable;
+		goto out;
 
-	if (event_type & EVENT_PINNED)
+	if (event_type & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
+	}
 
-	if (event_type & EVENT_FLEXIBLE)
+	if (event_type & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-
- out_enable:
-	perf_enable();
- out:
+	}
+out:
+	perf_pmu_enable(ctx->pmu);
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1209,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next)
+void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+				  struct task_struct *next)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
+	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+	if (likely(!ctx))
+		return;
 
-	if (likely(!ctx || !cpuctx->task_ctx))
+	cpuctx = __get_cpu_context(ctx);
+	if (!cpuctx->task_ctx)
 		return;
 
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
-	next_ctx = next->perf_event_ctxp;
+	next_ctx = next->perf_event_ctxp[ctxn];
 	if (parent && next_ctx &&
 	    rcu_dereference(next_ctx->parent_ctx) == parent) {
 		/*
@@ -1255,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task,
 			 * XXX do we need a memory barrier of sorts
 			 * wrt to rcu_dereference() of perf_event_ctxp
 			 */
-			task->perf_event_ctxp = next_ctx;
-			next->perf_event_ctxp = ctx;
+			task->perf_event_ctxp[ctxn] = next_ctx;
+			next->perf_event_ctxp[ctxn] = ctx;
 			ctx->task = next;
 			next_ctx->task = task;
 			do_switch = 0;
@@ -1274,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task,
 	}
 }
 
+#define for_each_task_context_nr(ctxn)					\
+	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void __perf_event_task_sched_out(struct task_struct *task,
+				 struct task_struct *next)
+{
+	int ctxn;
+
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+	for_each_task_context_nr(ctxn)
+		perf_event_context_sched_out(task, ctxn, next);
+}
+
 static void task_ctx_sched_out(struct perf_event_context *ctx,
 			       enum event_type_t event_type)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	if (!cpuctx->task_ctx)
 		return;
@@ -1292,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 /*
  * Called with IRQs disabled
  */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
-{
-	task_ctx_sched_out(ctx, EVENT_ALL);
-}
-
-/*
- * Called with IRQs disabled
- */
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type)
 {
@@ -1350,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
-		if (group_can_go_on(event, cpuctx, can_add_hw))
+		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
+		}
 	}
 }
 
@@ -1368,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx,
 
 	ctx->timestamp = perf_clock();
 
-	perf_disable();
-
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -1381,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (event_type & EVENT_FLEXIBLE)
 		ctx_flexible_sched_in(ctx, cpuctx);
 
-	perf_enable();
- out:
+out:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1394,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
-static void task_ctx_sched_in(struct task_struct *task,
+static void task_ctx_sched_in(struct perf_event_context *ctx,
 			      enum event_type_t event_type)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_cpu_context *cpuctx;
 
-	if (likely(!ctx))
-		return;
+       	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
+
 	ctx_sched_in(ctx, cpuctx, event_type);
 	cpuctx->task_ctx = ctx;
 }
-/*
- * Called from scheduler to add the events of the current task
- * with interrupts disabled.
- *
- * We restore the event value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * keep the event running.
- */
-void perf_event_task_sched_in(struct task_struct *task)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
 
-	if (likely(!ctx))
-		return;
+void perf_event_context_sched_in(struct perf_event_context *ctx)
+{
+	struct perf_cpu_context *cpuctx;
 
+	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	perf_disable();
-
+	perf_pmu_disable(ctx->pmu);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -1444,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task)
 
 	cpuctx->task_ctx = ctx;
 
-	perf_enable();
+	/*
+	 * Since these rotations are per-cpu, we need to ensure the
+	 * cpu-context we got scheduled on is actually rotating.
+	 */
+	perf_pmu_rotate_start(ctx->pmu);
+	perf_pmu_enable(ctx->pmu);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void __perf_event_task_sched_in(struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (likely(!ctx))
+			continue;
+
+		perf_event_context_sched_in(ctx);
+	}
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1524,22 +1577,6 @@ do {					\
 	return div64_u64(dividend, divisor);
 }
 
-static void perf_event_stop(struct perf_event *event)
-{
-	if (!event->pmu->stop)
-		return event->pmu->disable(event);
-
-	return event->pmu->stop(event);
-}
-
-static int perf_event_start(struct perf_event *event)
-{
-	if (!event->pmu->start)
-		return event->pmu->enable(event);
-
-	return event->pmu->start(event);
-}
-
 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 	hwc->sample_period = sample_period;
 
 	if (local64_read(&hwc->period_left) > 8*sample_period) {
-		perf_disable();
-		perf_event_stop(event);
+		event->pmu->stop(event, PERF_EF_UPDATE);
 		local64_set(&hwc->period_left, 0);
-		perf_event_start(event);
-		perf_enable();
+		event->pmu->start(event, PERF_EF_RELOAD);
 	}
 }
 
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
@@ -1592,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
-			perf_disable();
-			event->pmu->unthrottle(event);
-			perf_enable();
+			event->pmu->start(event, 0);
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
-		perf_disable();
 		event->pmu->read(event);
 		now = local64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
 		hwc->freq_count_stamp = now;
 
 		if (delta > 0)
-			perf_adjust_period(event, TICK_NSEC, delta);
-		perf_enable();
+			perf_adjust_period(event, period, delta);
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1626,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	raw_spin_unlock(&ctx->lock);
 }
 
-void perf_event_task_tick(struct task_struct *curr)
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-	int rotate = 0;
-
-	if (!atomic_read(&nr_events))
-		return;
+	u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
+	struct perf_event_context *ctx = NULL;
+	int rotate = 0, remove = 1;
 
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	if (cpuctx->ctx.nr_events &&
-	    cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-		rotate = 1;
+	if (cpuctx->ctx.nr_events) {
+		remove = 0;
+		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+			rotate = 1;
+	}
 
-	ctx = curr->perf_event_ctxp;
-	if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
-		rotate = 1;
+	ctx = cpuctx->task_ctx;
+	if (ctx && ctx->nr_events) {
+		remove = 0;
+		if (ctx->nr_events != ctx->nr_active)
+			rotate = 1;
+	}
 
-	perf_ctx_adjust_freq(&cpuctx->ctx);
+	perf_pmu_disable(cpuctx->ctx.pmu);
+	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
 	if (ctx)
-		perf_ctx_adjust_freq(ctx);
+		perf_ctx_adjust_freq(ctx, interval);
 
 	if (!rotate)
-		return;
+		goto done;
 
-	perf_disable();
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1662,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr)
 
 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
-	perf_enable();
+		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
+
+done:
+	if (remove)
+		list_del_init(&cpuctx->rotation_list);
+
+	perf_pmu_enable(cpuctx->ctx.pmu);
+}
+
+void perf_event_task_tick(void)
+{
+	struct list_head *head = &__get_cpu_var(rotation_list);
+	struct perf_cpu_context *cpuctx, *tmp;
+
+	WARN_ON(!irqs_disabled());
+
+	list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+		if (cpuctx->jiffies_interval == 1 ||
+				!(jiffies % cpuctx->jiffies_interval))
+			perf_rotate_context(cpuctx);
+	}
 }
 
 static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event,
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
  */
-static void perf_event_enable_on_exec(struct task_struct *task)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 {
-	struct perf_event_context *ctx;
 	struct perf_event *event;
 	unsigned long flags;
 	int enabled = 0;
 	int ret;
 
 	local_irq_save(flags);
-	ctx = task->perf_event_ctxp;
 	if (!ctx || !ctx->nr_events)
 		goto out;
 
-	__perf_event_task_sched_out(ctx);
+	task_ctx_sched_out(ctx, EVENT_ALL);
 
 	raw_spin_lock(&ctx->lock);
 
@@ -1722,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_task_sched_in(task);
- out:
+	perf_event_context_sched_in(ctx);
+out:
 	local_irq_restore(flags);
 }
 
@@ -1732,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
  */
 static void __perf_event_read(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -1773,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event)
 		unsigned long flags;
 
 		raw_spin_lock_irqsave(&ctx->lock, flags);
-		update_context_time(ctx);
+		/*
+		 * may read while context is not active
+		 * (e.g., thread is blocked), in that case
+		 * we cannot update context time
+		 */
+		if (ctx->is_active)
+			update_context_time(ctx);
 		update_event_times(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
@@ -1782,11 +1842,219 @@ static u64 perf_event_read(struct perf_event *event)
 }
 
 /*
- * Initialize the perf_event context in a task_struct:
+ * Callchain support
  */
+
+struct callchain_cpus_entries {
+	struct rcu_head			rcu_head;
+	struct perf_callchain_entry	*cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+				  struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+				struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+	struct callchain_cpus_entries *entries;
+	int cpu;
+
+	entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+
+	kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+	struct callchain_cpus_entries *entries;
+
+	entries = callchain_cpus_entries;
+	rcu_assign_pointer(callchain_cpus_entries, NULL);
+	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+	int cpu;
+	int size;
+	struct callchain_cpus_entries *entries;
+
+	/*
+	 * We can't use the percpu allocation API for data that can be
+	 * accessed from NMI. Use a temporary manual per cpu allocation
+	 * until that gets sorted out.
+	 */
+	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
+		num_possible_cpus();
+
+	entries = kzalloc(size, GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+	for_each_possible_cpu(cpu) {
+		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+							 cpu_to_node(cpu));
+		if (!entries->cpu_entries[cpu])
+			goto fail;
+	}
+
+	rcu_assign_pointer(callchain_cpus_entries, entries);
+
+	return 0;
+
+fail:
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+	kfree(entries);
+
+	return -ENOMEM;
+}
+
+static int get_callchain_buffers(void)
+{
+	int err = 0;
+	int count;
+
+	mutex_lock(&callchain_mutex);
+
+	count = atomic_inc_return(&nr_callchain_events);
+	if (WARN_ON_ONCE(count < 1)) {
+		err = -EINVAL;
+		goto exit;
+	}
+
+	if (count > 1) {
+		/* If the allocation failed, give up */
+		if (!callchain_cpus_entries)
+			err = -ENOMEM;
+		goto exit;
+	}
+
+	err = alloc_callchain_buffers();
+	if (err)
+		release_callchain_buffers();
+exit:
+	mutex_unlock(&callchain_mutex);
+
+	return err;
+}
+
+static void put_callchain_buffers(void)
+{
+	if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+		release_callchain_buffers();
+		mutex_unlock(&callchain_mutex);
+	}
+}
+
+static int get_recursion_context(int *recursion)
+{
+	int rctx;
+
+	if (in_nmi())
+		rctx = 3;
+	else if (in_irq())
+		rctx = 2;
+	else if (in_softirq())
+		rctx = 1;
+	else
+		rctx = 0;
+
+	if (recursion[rctx])
+		return -1;
+
+	recursion[rctx]++;
+	barrier();
+
+	return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+	barrier();
+	recursion[rctx]--;
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+	int cpu;
+	struct callchain_cpus_entries *entries;
+
+	*rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+	if (*rctx == -1)
+		return NULL;
+
+	entries = rcu_dereference(callchain_cpus_entries);
+	if (!entries)
+		return NULL;
+
+	cpu = smp_processor_id();
+
+	return &entries->cpu_entries[cpu][*rctx];
+}
+
 static void
-__perf_event_init_context(struct perf_event_context *ctx,
-			    struct task_struct *task)
+put_callchain_entry(int rctx)
+{
+	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	int rctx;
+	struct perf_callchain_entry *entry;
+
+
+	entry = get_callchain_entry(&rctx);
+	if (rctx == -1)
+		return NULL;
+
+	if (!entry)
+		goto exit_put;
+
+	entry->nr = 0;
+
+	if (!user_mode(regs)) {
+		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+		perf_callchain_kernel(entry, regs);
+		if (current->mm)
+			regs = task_pt_regs(current);
+		else
+			regs = NULL;
+	}
+
+	if (regs) {
+		perf_callchain_store(entry, PERF_CONTEXT_USER);
+		perf_callchain_user(entry, regs);
+	}
+
+exit_put:
+	put_callchain_entry(rctx);
+
+	return entry;
+}
+
+/*
+ * Initialize the perf_event context in a task_struct:
+ */
+static void __perf_event_init_context(struct perf_event_context *ctx)
 {
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
@@ -1794,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
 	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
-	ctx->task = task;
 }
 
-static struct perf_event_context *find_get_context(pid_t pid, int cpu)
+static struct perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
 {
 	struct perf_event_context *ctx;
-	struct perf_cpu_context *cpuctx;
-	struct task_struct *task;
-	unsigned long flags;
-	int err;
-
-	if (pid == -1 && cpu != -1) {
-		/* Must be root to operate on a CPU event: */
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-			return ERR_PTR(-EACCES);
 
-		if (cpu < 0 || cpu >= nr_cpumask_bits)
-			return ERR_PTR(-EINVAL);
+	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
 
-		/*
-		 * We could be clever and allow to attach a event to an
-		 * offline CPU and activate it when the CPU comes up, but
-		 * that's for later.
-		 */
-		if (!cpu_online(cpu))
-			return ERR_PTR(-ENODEV);
+	__perf_event_init_context(ctx);
+	if (task) {
+		ctx->task = task;
+		get_task_struct(task);
+	}
+	ctx->pmu = pmu;
 
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		ctx = &cpuctx->ctx;
-		get_ctx(ctx);
+	return ctx;
+}
 
-		return ctx;
-	}
+static struct task_struct *
+find_lively_task_by_vpid(pid_t vpid)
+{
+	struct task_struct *task;
+	int err;
 
 	rcu_read_lock();
-	if (!pid)
+	if (!vpid)
 		task = current;
 	else
-		task = find_task_by_vpid(pid);
+		task = find_task_by_vpid(vpid);
 	if (task)
 		get_task_struct(task);
 	rcu_read_unlock();
@@ -1852,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto errout;
 
- retry:
-	ctx = perf_lock_task_context(task, &flags);
+	return task;
+errout:
+	put_task_struct(task);
+	return ERR_PTR(err);
+
+}
+
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+{
+	struct perf_event_context *ctx;
+	struct perf_cpu_context *cpuctx;
+	unsigned long flags;
+	int ctxn, err;
+
+	if (!task && cpu != -1) {
+		/* Must be root to operate on a CPU event: */
+		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EACCES);
+
+		if (cpu < 0 || cpu >= nr_cpumask_bits)
+			return ERR_PTR(-EINVAL);
+
+		/*
+		 * We could be clever and allow to attach a event to an
+		 * offline CPU and activate it when the CPU comes up, but
+		 * that's for later.
+		 */
+		if (!cpu_online(cpu))
+			return ERR_PTR(-ENODEV);
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
+		get_ctx(ctx);
+
+		return ctx;
+	}
+
+	err = -EINVAL;
+	ctxn = pmu->task_ctx_nr;
+	if (ctxn < 0)
+		goto errout;
+
+retry:
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		unclone_ctx(ctx);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
 	if (!ctx) {
-		ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+		ctx = alloc_perf_context(pmu, task);
 		err = -ENOMEM;
 		if (!ctx)
 			goto errout;
-		__perf_event_init_context(ctx, task);
+
 		get_ctx(ctx);
-		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
+
+		if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
 			/*
 			 * We raced with some other task; use
 			 * the context they set.
 			 */
+			put_task_struct(task);
 			kfree(ctx);
 			goto retry;
 		}
-		get_task_struct(task);
 	}
 
-	put_task_struct(task);
 	return ctx;
 
- errout:
-	put_task_struct(task);
+errout:
 	return ERR_PTR(err);
 }
 
@@ -1898,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static void perf_pending_sync(struct perf_event *event);
 static void perf_buffer_put(struct perf_buffer *buffer);
 
 static void free_event(struct perf_event *event)
 {
-	perf_pending_sync(event);
+	irq_work_sync(&event->pending);
 
 	if (!event->parent) {
-		atomic_dec(&nr_events);
+		if (event->attach_state & PERF_ATTACH_TASK)
+			jump_label_dec(&perf_task_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
 			atomic_dec(&nr_comm_events);
 		if (event->attr.task)
 			atomic_dec(&nr_task_events);
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			put_callchain_buffers();
 	}
 
 	if (event->buffer) {
@@ -1923,7 +2228,9 @@ static void free_event(struct perf_event *event)
 	if (event->destroy)
 		event->destroy(event);
 
-	put_ctx(event->ctx);
+	if (event->ctx)
+		put_ctx(event->ctx);
+
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
 
@@ -2342,6 +2649,9 @@ int perf_event_task_disable(void)
 
 static int perf_event_index(struct perf_event *event)
 {
+	if (event->hw.state & PERF_HES_STOPPED)
+		return 0;
+
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return 0;
 
@@ -2845,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event)
 	}
 }
 
-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
-static void perf_pending_event(struct perf_pending_entry *entry)
+static void perf_pending_event(struct irq_work *entry)
 {
 	struct perf_event *event = container_of(entry,
 			struct perf_event, pending);
@@ -2870,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
 	}
 }
 
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-	PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
-			       void (*func)(struct perf_pending_entry *))
-{
-	struct perf_pending_entry **head;
-
-	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-		return;
-
-	entry->func = func;
-
-	head = &get_cpu_var(perf_pending_head);
-
-	do {
-		entry->next = *head;
-	} while (cmpxchg(head, entry->next, entry) != entry->next);
-
-	set_perf_event_pending();
-
-	put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
-	struct perf_pending_entry *list;
-	int nr = 0;
-
-	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-	while (list != PENDING_TAIL) {
-		void (*func)(struct perf_pending_entry *);
-		struct perf_pending_entry *entry = list;
-
-		list = list->next;
-
-		func = entry->func;
-		entry->next = NULL;
-		/*
-		 * Ensure we observe the unqueue before we issue the wakeup,
-		 * so that we won't be waiting forever.
-		 * -- see perf_not_pending().
-		 */
-		smp_wmb();
-
-		func(entry);
-		nr++;
-	}
-
-	return nr;
-}
-
-static inline int perf_not_pending(struct perf_event *event)
-{
-	/*
-	 * If we flush on whatever cpu we run, there is a chance we don't
-	 * need to wait.
-	 */
-	get_cpu();
-	__perf_pending_run();
-	put_cpu();
-
-	/*
-	 * Ensure we see the proper queue state before going to sleep
-	 * so that we do not miss the wakeup. -- see perf_pending_handle()
-	 */
-	smp_rmb();
-	return event->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_event *event)
-{
-	wait_event(event->waitq, perf_not_pending(event));
-}
-
-void perf_event_do_pending(void)
-{
-	__perf_pending_run();
-}
-
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-	return NULL;
-}
-
-
 /*
  * We assume there is only KVM supporting the callbacks.
  * Later on, we might change it to a list if there is
@@ -3012,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 
 	if (handle->nmi) {
 		handle->event->pending_wakeup = 1;
-		perf_pending_queue(&handle->event->pending,
-				   perf_pending_event);
+		irq_work_queue(&handle->event->pending);
 	} else
 		perf_event_wakeup(handle->event);
 }
@@ -3069,7 +3276,7 @@ again:
 	if (handle->wakeup != local_read(&buffer->wakeup))
 		perf_output_wakeup(handle);
 
- out:
+out:
 	preempt_enable();
 }
 
@@ -3457,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 
+	/* protect the callchain buffers */
+	rcu_read_lock();
+
 	perf_prepare_sample(&header, data, event, regs);
 
 	if (perf_output_begin(&handle, event, header.size, nmi, 1))
-		return;
+		goto exit;
 
 	perf_output_sample(&handle, &header, data, event);
 
 	perf_output_end(&handle);
+
+exit:
+	rcu_read_unlock();
 }
 
 /*
@@ -3578,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 static void perf_event_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx = task_event->task_ctx;
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int ctxn;
 
 	rcu_read_lock();
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_task_ctx(&cpuctx->ctx, task_event);
-	if (!ctx)
-		ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_task_ctx(ctx, task_event);
-	put_cpu_var(perf_cpu_context);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		perf_event_task_ctx(&cpuctx->ctx, task_event);
+
+		ctx = task_event->task_ctx;
+		if (!ctx) {
+			ctxn = pmu->task_ctx_nr;
+			if (ctxn < 0)
+				goto next;
+			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		}
+		if (ctx)
+			perf_event_task_ctx(ctx, task_event);
+next:
+		put_cpu_ptr(pmu->pmu_cpu_context);
+	}
 	rcu_read_unlock();
 }
 
@@ -3692,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	unsigned int size;
 	char comm[TASK_COMM_LEN];
+	unsigned int size;
+	struct pmu *pmu;
+	int ctxn;
 
 	memset(comm, 0, sizeof(comm));
 	strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3705,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
 	rcu_read_lock();
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_comm_ctx(ctx, comm_event);
-	put_cpu_var(perf_cpu_context);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			goto next;
+
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx)
+			perf_event_comm_ctx(ctx, comm_event);
+next:
+		put_cpu_ptr(pmu->pmu_cpu_context);
+	}
 	rcu_read_unlock();
 }
 
 void perf_event_comm(struct task_struct *task)
 {
 	struct perf_comm_event comm_event;
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (!ctx)
+			continue;
 
-	if (task->perf_event_ctxp)
-		perf_event_enable_on_exec(task);
+		perf_event_enable_on_exec(ctx);
+	}
 
 	if (!atomic_read(&nr_comm_events))
 		return;
@@ -3821,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	char tmp[16];
 	char *buf = NULL;
 	const char *name;
+	struct pmu *pmu;
+	int ctxn;
 
 	memset(tmp, 0, sizeof(tmp));
 
@@ -3873,12 +4116,23 @@ got_name:
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
 	rcu_read_lock();
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
-	put_cpu_var(perf_cpu_context);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
+					vma->vm_flags & VM_EXEC);
+
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			goto next;
+
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx) {
+			perf_event_mmap_ctx(ctx, mmap_event,
+					vma->vm_flags & VM_EXEC);
+		}
+next:
+		put_cpu_ptr(pmu->pmu_cpu_context);
+	}
 	rcu_read_unlock();
 
 	kfree(buf);
@@ -3960,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	struct hw_perf_event *hwc = &event->hw;
 	int ret = 0;
 
-	throttle = (throttle && event->pmu->unthrottle != NULL);
-
 	if (!throttle) {
 		hwc->interrupts++;
 	} else {
@@ -4004,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 		event->pending_kill = POLL_HUP;
 		if (nmi) {
 			event->pending_disable = 1;
-			perf_pending_queue(&event->pending,
-					   perf_pending_event);
+			irq_work_queue(&event->pending);
 		} else
 			perf_event_disable(event);
 	}
@@ -4029,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
  * Generic software event infrastructure
  */
 
+struct swevent_htable {
+	struct swevent_hlist		*swevent_hlist;
+	struct mutex			hlist_mutex;
+	int				hlist_refcount;
+
+	/* Recursion avoidance in each contexts */
+	int				recursion[PERF_NR_CONTEXTS];
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
 /*
  * We directly increment event->count and keep a second value in
  * event->hw.period_left to count intervals. This period event
@@ -4086,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 	}
 }
 
-static void perf_swevent_add(struct perf_event *event, u64 nr,
+static void perf_swevent_event(struct perf_event *event, u64 nr,
 			       int nmi, struct perf_sample_data *data,
 			       struct pt_regs *regs)
 {
@@ -4112,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 static int perf_exclude_event(struct perf_event *event,
 			      struct pt_regs *regs)
 {
+	if (event->hw.state & PERF_HES_STOPPED)
+		return 0;
+
 	if (regs) {
 		if (event->attr.exclude_user && user_mode(regs))
 			return 1;
@@ -4158,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 
 /* For the read side: events when they trigger */
 static inline struct hlist_head *
-find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
 {
 	struct swevent_hlist *hlist;
 
-	hlist = rcu_dereference(ctx->swevent_hlist);
+	hlist = rcu_dereference(swhash->swevent_hlist);
 	if (!hlist)
 		return NULL;
 
@@ -4171,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
 
 /* For the event head insertion and removal in the hlist */
 static inline struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
 {
 	struct swevent_hlist *hlist;
 	u32 event_id = event->attr.config;
@@ -4182,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
 	 * and release. Which makes the protected version suitable here.
 	 * The context lock guarantees that.
 	 */
-	hlist = rcu_dereference_protected(ctx->swevent_hlist,
+	hlist = rcu_dereference_protected(swhash->swevent_hlist,
 					  lockdep_is_held(&event->ctx->lock));
 	if (!hlist)
 		return NULL;
@@ -4195,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 				    struct perf_sample_data *data,
 				    struct pt_regs *regs)
 {
-	struct perf_cpu_context *cpuctx;
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 	struct perf_event *event;
 	struct hlist_node *node;
 	struct hlist_head *head;
 
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-
 	rcu_read_lock();
-
-	head = find_swevent_head_rcu(cpuctx, type, event_id);
-
+	head = find_swevent_head_rcu(swhash, type, event_id);
 	if (!head)
 		goto end;
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
-			perf_swevent_add(event, nr, nmi, data, regs);
+			perf_swevent_event(event, nr, nmi, data, regs);
 	}
 end:
 	rcu_read_unlock();
@@ -4219,33 +4480,17 @@ end:
 
 int perf_swevent_get_recursion_context(void)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	int rctx;
-
-	if (in_nmi())
-		rctx = 3;
-	else if (in_irq())
-		rctx = 2;
-	else if (in_softirq())
-		rctx = 1;
-	else
-		rctx = 0;
-
-	if (cpuctx->recursion[rctx])
-		return -1;
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 
-	cpuctx->recursion[rctx]++;
-	barrier();
-
-	return rctx;
+	return get_recursion_context(swhash->recursion);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
 void inline perf_swevent_put_recursion_context(int rctx)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	barrier();
-	cpuctx->recursion[rctx]--;
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+
+	put_recursion_context(swhash->recursion, rctx);
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4271,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event)
 {
 }
 
-static int perf_swevent_enable(struct perf_event *event)
+static int perf_swevent_add(struct perf_event *event, int flags)
 {
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 	struct hw_perf_event *hwc = &event->hw;
-	struct perf_cpu_context *cpuctx;
 	struct hlist_head *head;
 
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
 
-	head = find_swevent_head(cpuctx, event);
+	hwc->state = !(flags & PERF_EF_START);
+
+	head = find_swevent_head(swhash, event);
 	if (WARN_ON_ONCE(!head))
 		return -EINVAL;
 
@@ -4293,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event)
 	return 0;
 }
 
-static void perf_swevent_disable(struct perf_event *event)
+static void perf_swevent_del(struct perf_event *event, int flags)
 {
 	hlist_del_rcu(&event->hlist_entry);
 }
 
-static void perf_swevent_void(struct perf_event *event)
-{
-}
-
-static int perf_swevent_int(struct perf_event *event)
-{
-	return 0;
-}
-
-static const struct pmu perf_ops_generic = {
-	.enable		= perf_swevent_enable,
-	.disable	= perf_swevent_disable,
-	.start		= perf_swevent_int,
-	.stop		= perf_swevent_void,
-	.read		= perf_swevent_read,
-	.unthrottle	= perf_swevent_void, /* hwc->interrupts already reset */
-};
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+static void perf_swevent_start(struct perf_event *event, int flags)
 {
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct pt_regs *regs;
-	struct perf_event *event;
-	u64 period;
-
-	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-	event->pmu->read(event);
-
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
-	regs = get_irq_regs();
-
-	if (regs && !perf_exclude_event(event, regs)) {
-		if (!(event->attr.exclude_idle && current->pid == 0))
-			if (perf_event_overflow(event, 0, &data, regs))
-				ret = HRTIMER_NORESTART;
-	}
-
-	period = max_t(u64, 10000, event->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-	return ret;
+	event->hw.state = 0;
 }
 
-static void perf_swevent_start_hrtimer(struct perf_event *event)
+static void perf_swevent_stop(struct perf_event *event, int flags)
 {
-	struct hw_perf_event *hwc = &event->hw;
-
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		u64 period;
-
-		if (hwc->remaining) {
-			if (hwc->remaining < 0)
-				period = 10000;
-			else
-				period = hwc->remaining;
-			hwc->remaining = 0;
-		} else {
-			period = max_t(u64, 10000, hwc->sample_period);
-		}
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (hwc->sample_period) {
-		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-		hwc->remaining = ktime_to_ns(remaining);
-
-		hrtimer_cancel(&hwc->hrtimer);
-	}
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_perf_event_update(struct perf_event *event)
-{
-	int cpu = raw_smp_processor_id();
-	s64 prev;
-	u64 now;
-
-	now = cpu_clock(cpu);
-	prev = local64_xchg(&event->hw.prev_count, now);
-	local64_add(now - prev, &event->count);
-}
-
-static int cpu_clock_perf_event_enable(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	int cpu = raw_smp_processor_id();
-
-	local64_set(&hwc->prev_count, cpu_clock(cpu));
-	perf_swevent_start_hrtimer(event);
-
-	return 0;
-}
-
-static void cpu_clock_perf_event_disable(struct perf_event *event)
-{
-	perf_swevent_cancel_hrtimer(event);
-	cpu_clock_perf_event_update(event);
-}
-
-static void cpu_clock_perf_event_read(struct perf_event *event)
-{
-	cpu_clock_perf_event_update(event);
-}
-
-static const struct pmu perf_ops_cpu_clock = {
-	.enable		= cpu_clock_perf_event_enable,
-	.disable	= cpu_clock_perf_event_disable,
-	.read		= cpu_clock_perf_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_perf_event_update(struct perf_event *event, u64 now)
-{
-	u64 prev;
-	s64 delta;
-
-	prev = local64_xchg(&event->hw.prev_count, now);
-	delta = now - prev;
-	local64_add(delta, &event->count);
-}
-
-static int task_clock_perf_event_enable(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	u64 now;
-
-	now = event->ctx->time;
-
-	local64_set(&hwc->prev_count, now);
-
-	perf_swevent_start_hrtimer(event);
-
-	return 0;
-}
-
-static void task_clock_perf_event_disable(struct perf_event *event)
-{
-	perf_swevent_cancel_hrtimer(event);
-	task_clock_perf_event_update(event, event->ctx->time);
-
-}
-
-static void task_clock_perf_event_read(struct perf_event *event)
-{
-	u64 time;
-
-	if (!in_nmi()) {
-		update_context_time(event->ctx);
-		time = event->ctx->time;
-	} else {
-		u64 now = perf_clock();
-		u64 delta = now - event->ctx->timestamp;
-		time = event->ctx->time + delta;
-	}
-
-	task_clock_perf_event_update(event, time);
+	event->hw.state = PERF_HES_STOPPED;
 }
 
-static const struct pmu perf_ops_task_clock = {
-	.enable		= task_clock_perf_event_enable,
-	.disable	= task_clock_perf_event_disable,
-	.read		= task_clock_perf_event_read,
-};
-
 /* Deref the hlist from the update side */
 static inline struct swevent_hlist *
-swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+swevent_hlist_deref(struct swevent_htable *swhash)
 {
-	return rcu_dereference_protected(cpuctx->swevent_hlist,
-					 lockdep_is_held(&cpuctx->hlist_mutex));
+	return rcu_dereference_protected(swhash->swevent_hlist,
+					 lockdep_is_held(&swhash->hlist_mutex));
 }
 
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4499,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 	kfree(hlist);
 }
 
-static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+static void swevent_hlist_release(struct swevent_htable *swhash)
 {
-	struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
+	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
 
 	if (!hlist)
 		return;
 
-	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+	rcu_assign_pointer(swhash->swevent_hlist, NULL);
 	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
 
 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
-	mutex_lock(&cpuctx->hlist_mutex);
+	mutex_lock(&swhash->hlist_mutex);
 
-	if (!--cpuctx->hlist_refcount)
-		swevent_hlist_release(cpuctx);
+	if (!--swhash->hlist_refcount)
+		swevent_hlist_release(swhash);
 
-	mutex_unlock(&cpuctx->hlist_mutex);
+	mutex_unlock(&swhash->hlist_mutex);
 }
 
 static void swevent_hlist_put(struct perf_event *event)
@@ -4537,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event)
 
 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 	int err = 0;
 
-	mutex_lock(&cpuctx->hlist_mutex);
+	mutex_lock(&swhash->hlist_mutex);
 
-	if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
+	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4550,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 			err = -ENOMEM;
 			goto exit;
 		}
-		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 	}
-	cpuctx->hlist_refcount++;
- exit:
-	mutex_unlock(&cpuctx->hlist_mutex);
+	swhash->hlist_refcount++;
+exit:
+	mutex_unlock(&swhash->hlist_mutex);
 
 	return err;
 }
@@ -4578,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event)
 	put_online_cpus();
 
 	return 0;
- fail:
+fail:
 	for_each_possible_cpu(cpu) {
 		if (cpu == failed_cpu)
 			break;
@@ -4589,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event)
 	return err;
 }
 
-#ifdef CONFIG_EVENT_TRACING
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+	u64 event_id = event->attr.config;
+
+	WARN_ON(event->parent);
+
+	jump_label_dec(&perf_swevent_enabled[event_id]);
+	swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+	int event_id = event->attr.config;
+
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	switch (event_id) {
+	case PERF_COUNT_SW_CPU_CLOCK:
+	case PERF_COUNT_SW_TASK_CLOCK:
+		return -ENOENT;
 
-static const struct pmu perf_ops_tracepoint = {
-	.enable		= perf_trace_enable,
-	.disable	= perf_trace_disable,
-	.start		= perf_swevent_int,
-	.stop		= perf_swevent_void,
+	default:
+		break;
+	}
+
+	if (event_id > PERF_COUNT_SW_MAX)
+		return -ENOENT;
+
+	if (!event->parent) {
+		int err;
+
+		err = swevent_hlist_get(event);
+		if (err)
+			return err;
+
+		jump_label_inc(&perf_swevent_enabled[event_id]);
+		event->destroy = sw_perf_event_destroy;
+	}
+
+	return 0;
+}
+
+static struct pmu perf_swevent = {
+	.task_ctx_nr	= perf_sw_context,
+
+	.event_init	= perf_swevent_init,
+	.add		= perf_swevent_add,
+	.del		= perf_swevent_del,
+	.start		= perf_swevent_start,
+	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
-	.unthrottle	= perf_swevent_void,
 };
 
+#ifdef CONFIG_EVENT_TRACING
+
 static int perf_tp_filter_match(struct perf_event *event,
 				struct perf_sample_data *data)
 {
@@ -4643,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
-			perf_swevent_add(event, count, 1, &data, regs);
+			perf_swevent_event(event, count, 1, &data, regs);
 	}
 
 	perf_swevent_put_recursion_context(rctx);
@@ -4655,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
 	perf_trace_destroy(event);
 }
 
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static int perf_tp_event_init(struct perf_event *event)
 {
 	int err;
 
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -ENOENT;
+
 	/*
 	 * Raw tracepoint data is a severe data leak, only allow root to
 	 * have these.
@@ -4666,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
 			perf_paranoid_tracepoint_raw() &&
 			!capable(CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
+		return -EPERM;
 
 	err = perf_trace_init(event);
 	if (err)
-		return NULL;
+		return err;
 
 	event->destroy = tp_perf_event_destroy;
 
-	return &perf_ops_tracepoint;
+	return 0;
+}
+
+static struct pmu perf_tracepoint = {
+	.task_ctx_nr	= perf_sw_context,
+
+	.event_init	= perf_tp_event_init,
+	.add		= perf_trace_add,
+	.del		= perf_trace_del,
+	.start		= perf_swevent_start,
+	.stop		= perf_swevent_stop,
+	.read		= perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+	perf_pmu_register(&perf_tracepoint);
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4702,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static inline void perf_tp_register(void)
 {
-	return NULL;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4719,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event)
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-static void bp_perf_event_destroy(struct perf_event *event)
+void perf_bp_event(struct perf_event *bp, void *data)
 {
-	release_bp_slot(event);
+	struct perf_sample_data sample;
+	struct pt_regs *regs = data;
+
+	perf_sample_data_init(&sample, bp->attr.bp_addr);
+
+	if (!bp->hw.state && !perf_exclude_event(bp, regs))
+		perf_swevent_event(bp, 1, 1, &sample, regs);
 }
+#endif
 
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 {
-	int err;
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct pt_regs *regs;
+	struct perf_event *event;
+	u64 period;
 
-	err = register_perf_hw_breakpoint(bp);
-	if (err)
-		return ERR_PTR(err);
+	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+	event->pmu->read(event);
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+	regs = get_irq_regs();
+
+	if (regs && !perf_exclude_event(event, regs)) {
+		if (!(event->attr.exclude_idle && current->pid == 0))
+			if (perf_event_overflow(event, 0, &data, regs))
+				ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, event->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
-	bp->destroy = bp_perf_event_destroy;
+	return ret;
+}
 
-	return &perf_ops_bp;
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+	if (hwc->sample_period) {
+		s64 period = local64_read(&hwc->period_left);
+
+		if (period) {
+			if (period < 0)
+				period = 10000;
+
+			local64_set(&hwc->period_left, 0);
+		} else {
+			period = max_t(u64, 10000, hwc->sample_period);
+		}
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL_PINNED, 0);
+	}
 }
 
-void perf_bp_event(struct perf_event *bp, void *data)
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 {
-	struct perf_sample_data sample;
-	struct pt_regs *regs = data;
+	struct hw_perf_event *hwc = &event->hw;
 
-	perf_sample_data_init(&sample, bp->attr.bp_addr);
+	if (hwc->sample_period) {
+		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+		local64_set(&hwc->period_left, ktime_to_ns(remaining));
 
-	if (!perf_exclude_event(bp, regs))
-		perf_swevent_add(bp, 1, 1, &sample, regs);
+		hrtimer_cancel(&hwc->hrtimer);
+	}
 }
-#else
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
 {
-	return NULL;
+	s64 prev;
+	u64 now;
+
+	now = local_clock();
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
 }
 
-void perf_bp_event(struct perf_event *bp, void *regs)
+static void cpu_clock_event_start(struct perf_event *event, int flags)
 {
+	local64_set(&event->hw.prev_count, local_clock());
+	perf_swevent_start_hrtimer(event);
 }
-#endif
 
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+	perf_swevent_cancel_hrtimer(event);
+	cpu_clock_event_update(event);
+}
 
-static void sw_perf_event_destroy(struct perf_event *event)
+static int cpu_clock_event_add(struct perf_event *event, int flags)
 {
-	u64 event_id = event->attr.config;
+	if (flags & PERF_EF_START)
+		cpu_clock_event_start(event, flags);
 
-	WARN_ON(event->parent);
+	return 0;
+}
 
-	atomic_dec(&perf_swevent_enabled[event_id]);
-	swevent_hlist_put(event);
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+	cpu_clock_event_stop(event, flags);
 }
 
-static const struct pmu *sw_perf_event_init(struct perf_event *event)
+static void cpu_clock_event_read(struct perf_event *event)
 {
-	const struct pmu *pmu = NULL;
-	u64 event_id = event->attr.config;
+	cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_init(struct perf_event *event)
+{
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+		return -ENOENT;
+
+	return 0;
+}
 
+static struct pmu perf_cpu_clock = {
+	.task_ctx_nr	= perf_sw_context,
+
+	.event_init	= cpu_clock_event_init,
+	.add		= cpu_clock_event_add,
+	.del		= cpu_clock_event_del,
+	.start		= cpu_clock_event_start,
+	.stop		= cpu_clock_event_stop,
+	.read		= cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+	u64 prev;
+	s64 delta;
+
+	prev = local64_xchg(&event->hw.prev_count, now);
+	delta = now - prev;
+	local64_add(delta, &event->count);
+}
+
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+	local64_set(&event->hw.prev_count, event->ctx->time);
+	perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+	perf_swevent_cancel_hrtimer(event);
+	task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		task_clock_event_start(event, flags);
+
+	return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+	task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+	u64 time;
+
+	if (!in_nmi()) {
+		update_context_time(event->ctx);
+		time = event->ctx->time;
+	} else {
+		u64 now = perf_clock();
+		u64 delta = now - event->ctx->timestamp;
+		time = event->ctx->time + delta;
+	}
+
+	task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+		return -ENOENT;
+
+	return 0;
+}
+
+static struct pmu perf_task_clock = {
+	.task_ctx_nr	= perf_sw_context,
+
+	.event_init	= task_clock_event_init,
+	.add		= task_clock_event_add,
+	.del		= task_clock_event_del,
+	.start		= task_clock_event_start,
+	.stop		= task_clock_event_stop,
+	.read		= task_clock_event_read,
+};
+
+static void perf_pmu_nop_void(struct pmu *pmu)
+{
+}
+
+static int perf_pmu_nop_int(struct pmu *pmu)
+{
+	return 0;
+}
+
+static void perf_pmu_start_txn(struct pmu *pmu)
+{
+	perf_pmu_disable(pmu);
+}
+
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+	perf_pmu_enable(pmu);
+	return 0;
+}
+
+static void perf_pmu_cancel_txn(struct pmu *pmu)
+{
+	perf_pmu_enable(pmu);
+}
+
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static void *find_pmu_context(int ctxn)
+{
+	struct pmu *pmu;
+
+	if (ctxn < 0)
+		return NULL;
+
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (pmu->task_ctx_nr == ctxn)
+			return pmu->pmu_cpu_context;
+	}
+
+	return NULL;
+}
+
+static void free_pmu_context(void * __percpu cpu_context)
+{
+	struct pmu *pmu;
+
+	mutex_lock(&pmus_lock);
 	/*
-	 * Software events (currently) can't in general distinguish
-	 * between user, kernel and hypervisor events.
-	 * However, context switches and cpu migrations are considered
-	 * to be kernel events, and page faults are never hypervisor
-	 * events.
+	 * Like a real lame refcount.
 	 */
-	switch (event_id) {
-	case PERF_COUNT_SW_CPU_CLOCK:
-		pmu = &perf_ops_cpu_clock;
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (pmu->pmu_cpu_context == cpu_context)
+			goto out;
+	}
 
-		break;
-	case PERF_COUNT_SW_TASK_CLOCK:
-		/*
-		 * If the user instantiates this as a per-cpu event,
-		 * use the cpu_clock event instead.
-		 */
-		if (event->ctx->task)
-			pmu = &perf_ops_task_clock;
-		else
-			pmu = &perf_ops_cpu_clock;
+	free_percpu(cpu_context);
+out:
+	mutex_unlock(&pmus_lock);
+}
 
-		break;
-	case PERF_COUNT_SW_PAGE_FAULTS:
-	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
-	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-	case PERF_COUNT_SW_CONTEXT_SWITCHES:
-	case PERF_COUNT_SW_CPU_MIGRATIONS:
-	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
-	case PERF_COUNT_SW_EMULATION_FAULTS:
-		if (!event->parent) {
-			int err;
-
-			err = swevent_hlist_get(event);
-			if (err)
-				return ERR_PTR(err);
+int perf_pmu_register(struct pmu *pmu)
+{
+	int cpu, ret;
+
+	mutex_lock(&pmus_lock);
+	ret = -ENOMEM;
+	pmu->pmu_disable_count = alloc_percpu(int);
+	if (!pmu->pmu_disable_count)
+		goto unlock;
 
-			atomic_inc(&perf_swevent_enabled[event_id]);
-			event->destroy = sw_perf_event_destroy;
+	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
+	if (pmu->pmu_cpu_context)
+		goto got_cpu_context;
+
+	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
+	if (!pmu->pmu_cpu_context)
+		goto free_pdc;
+
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		__perf_event_init_context(&cpuctx->ctx);
+		cpuctx->ctx.type = cpu_context;
+		cpuctx->ctx.pmu = pmu;
+		cpuctx->jiffies_interval = 1;
+		INIT_LIST_HEAD(&cpuctx->rotation_list);
+	}
+
+got_cpu_context:
+	if (!pmu->start_txn) {
+		if (pmu->pmu_enable) {
+			/*
+			 * If we have pmu_enable/pmu_disable calls, install
+			 * transaction stubs that use that to try and batch
+			 * hardware accesses.
+			 */
+			pmu->start_txn  = perf_pmu_start_txn;
+			pmu->commit_txn = perf_pmu_commit_txn;
+			pmu->cancel_txn = perf_pmu_cancel_txn;
+		} else {
+			pmu->start_txn  = perf_pmu_nop_void;
+			pmu->commit_txn = perf_pmu_nop_int;
+			pmu->cancel_txn = perf_pmu_nop_void;
+		}
+	}
+
+	if (!pmu->pmu_enable) {
+		pmu->pmu_enable  = perf_pmu_nop_void;
+		pmu->pmu_disable = perf_pmu_nop_void;
+	}
+
+	list_add_rcu(&pmu->entry, &pmus);
+	ret = 0;
+unlock:
+	mutex_unlock(&pmus_lock);
+
+	return ret;
+
+free_pdc:
+	free_percpu(pmu->pmu_disable_count);
+	goto unlock;
+}
+
+void perf_pmu_unregister(struct pmu *pmu)
+{
+	mutex_lock(&pmus_lock);
+	list_del_rcu(&pmu->entry);
+	mutex_unlock(&pmus_lock);
+
+	/*
+	 * We dereference the pmu list under both SRCU and regular RCU, so
+	 * synchronize against both of those.
+	 */
+	synchronize_srcu(&pmus_srcu);
+	synchronize_rcu();
+
+	free_percpu(pmu->pmu_disable_count);
+	free_pmu_context(pmu->pmu_cpu_context);
+}
+
+struct pmu *perf_init_event(struct perf_event *event)
+{
+	struct pmu *pmu = NULL;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		int ret = pmu->event_init(event);
+		if (!ret)
+			goto unlock;
+
+		if (ret != -ENOENT) {
+			pmu = ERR_PTR(ret);
+			goto unlock;
 		}
-		pmu = &perf_ops_generic;
-		break;
 	}
+	pmu = ERR_PTR(-ENOENT);
+unlock:
+	srcu_read_unlock(&pmus_srcu, idx);
 
 	return pmu;
 }
@@ -4826,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
  * Allocate and initialize a event structure
  */
 static struct perf_event *
-perf_event_alloc(struct perf_event_attr *attr,
-		   int cpu,
-		   struct perf_event_context *ctx,
-		   struct perf_event *group_leader,
-		   struct perf_event *parent_event,
-		   perf_overflow_handler_t overflow_handler,
-		   gfp_t gfpflags)
-{
-	const struct pmu *pmu;
+perf_event_alloc(struct perf_event_attr *attr, int cpu,
+		 struct task_struct *task,
+		 struct perf_event *group_leader,
+		 struct perf_event *parent_event,
+		 perf_overflow_handler_t overflow_handler)
+{
+	struct pmu *pmu;
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	long err;
 
-	event = kzalloc(sizeof(*event), gfpflags);
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
 	if (!event)
 		return ERR_PTR(-ENOMEM);
 
@@ -4857,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr,
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
 	init_waitqueue_head(&event->waitq);
+	init_irq_work(&event->pending, perf_pending_event);
 
 	mutex_init(&event->mmap_mutex);
 
@@ -4864,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr,
 	event->attr		= *attr;
 	event->group_leader	= group_leader;
 	event->pmu		= NULL;
-	event->ctx		= ctx;
 	event->oncpu		= -1;
 
 	event->parent		= parent_event;
@@ -4874,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (task) {
+		event->attach_state = PERF_ATTACH_TASK;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+		/*
+		 * hw_breakpoint is a bit difficult here..
+		 */
+		if (attr->type == PERF_TYPE_BREAKPOINT)
+			event->hw.bp_target = task;
+#endif
+	}
+
 	if (!overflow_handler && parent_event)
 		overflow_handler = parent_event->overflow_handler;
 	
@@ -4898,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr,
 	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto done;
 
-	switch (attr->type) {
-	case PERF_TYPE_RAW:
-	case PERF_TYPE_HARDWARE:
-	case PERF_TYPE_HW_CACHE:
-		pmu = hw_perf_event_init(event);
-		break;
-
-	case PERF_TYPE_SOFTWARE:
-		pmu = sw_perf_event_init(event);
-		break;
-
-	case PERF_TYPE_TRACEPOINT:
-		pmu = tp_perf_event_init(event);
-		break;
+	pmu = perf_init_event(event);
 
-	case PERF_TYPE_BREAKPOINT:
-		pmu = bp_perf_event_init(event);
-		break;
-
-
-	default:
-		break;
-	}
 done:
 	err = 0;
 	if (!pmu)
@@ -4938,13 +5345,21 @@ done:
 	event->pmu = pmu;
 
 	if (!event->parent) {
-		atomic_inc(&nr_events);
+		if (event->attach_state & PERF_ATTACH_TASK)
+			jump_label_inc(&perf_task_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
 			atomic_inc(&nr_comm_events);
 		if (event->attr.task)
 			atomic_inc(&nr_task_events);
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+			err = get_callchain_buffers();
+			if (err) {
+				free_event(event);
+				return ERR_PTR(err);
+			}
+		}
 	}
 
 	return event;
@@ -5092,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open,
 		struct perf_event_attr __user *, attr_uptr,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
-	struct perf_event *event, *group_leader = NULL, *output_event = NULL;
+	struct perf_event *group_leader = NULL, *output_event = NULL;
+	struct perf_event *event, *sibling;
 	struct perf_event_attr attr;
 	struct perf_event_context *ctx;
 	struct file *event_file = NULL;
 	struct file *group_file = NULL;
+	struct task_struct *task = NULL;
+	struct pmu *pmu;
 	int event_fd;
+	int move_group = 0;
 	int fput_needed = 0;
 	int err;
 
@@ -5123,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (event_fd < 0)
 		return event_fd;
 
-	/*
-	 * Get the target context (task or percpu):
-	 */
-	ctx = find_get_context(pid, cpu);
-	if (IS_ERR(ctx)) {
-		err = PTR_ERR(ctx);
-		goto err_fd;
-	}
-
 	if (group_fd != -1) {
 		group_leader = perf_fget_light(group_fd, &fput_needed);
 		if (IS_ERR(group_leader)) {
 			err = PTR_ERR(group_leader);
-			goto err_put_context;
+			goto err_fd;
 		}
 		group_file = group_leader->filp;
 		if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5145,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open,
 			group_leader = NULL;
 	}
 
+	if (pid != -1) {
+		task = find_lively_task_by_vpid(pid);
+		if (IS_ERR(task)) {
+			err = PTR_ERR(task);
+			goto err_group_fd;
+		}
+	}
+
+	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
+	if (IS_ERR(event)) {
+		err = PTR_ERR(event);
+		goto err_task;
+	}
+
+	/*
+	 * Special case software events and allow them to be part of
+	 * any hardware group.
+	 */
+	pmu = event->pmu;
+
+	if (group_leader &&
+	    (is_software_event(event) != is_software_event(group_leader))) {
+		if (is_software_event(event)) {
+			/*
+			 * If event and group_leader are not both a software
+			 * event, and event is, then group leader is not.
+			 *
+			 * Allow the addition of software events to !software
+			 * groups, this is safe because software events never
+			 * fail to schedule.
+			 */
+			pmu = group_leader->pmu;
+		} else if (is_software_event(group_leader) &&
+			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+			/*
+			 * In case the group is a pure software group, and we
+			 * try to add a hardware event, move the whole group to
+			 * the hardware context.
+			 */
+			move_group = 1;
+		}
+	}
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+	ctx = find_get_context(pmu, task, cpu);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto err_alloc;
+	}
+
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
@@ -5156,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * becoming part of another group-sibling):
 		 */
 		if (group_leader->group_leader != group_leader)
-			goto err_put_context;
+			goto err_context;
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
 		 */
-		if (group_leader->ctx != ctx)
-			goto err_put_context;
+		if (move_group) {
+			if (group_leader->ctx->type != ctx->type)
+				goto err_context;
+		} else {
+			if (group_leader->ctx != ctx)
+				goto err_context;
+		}
+
 		/*
 		 * Only a group leader can be exclusive or pinned
 		 */
 		if (attr.exclusive || attr.pinned)
-			goto err_put_context;
-	}
-
-	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-				     NULL, NULL, GFP_KERNEL);
-	if (IS_ERR(event)) {
-		err = PTR_ERR(event);
-		goto err_put_context;
+			goto err_context;
 	}
 
 	if (output_event) {
 		err = perf_event_set_output(event, output_event);
 		if (err)
-			goto err_free_put_context;
+			goto err_context;
 	}
 
 	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
 	if (IS_ERR(event_file)) {
 		err = PTR_ERR(event_file);
-		goto err_free_put_context;
+		goto err_context;
+	}
+
+	if (move_group) {
+		struct perf_event_context *gctx = group_leader->ctx;
+
+		mutex_lock(&gctx->mutex);
+		perf_event_remove_from_context(group_leader);
+		list_for_each_entry(sibling, &group_leader->sibling_list,
+				    group_entry) {
+			perf_event_remove_from_context(sibling);
+			put_ctx(gctx);
+		}
+		mutex_unlock(&gctx->mutex);
+		put_ctx(gctx);
 	}
 
 	event->filp = event_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+
+	if (move_group) {
+		perf_install_in_context(ctx, group_leader, cpu);
+		get_ctx(ctx);
+		list_for_each_entry(sibling, &group_leader->sibling_list,
+				    group_entry) {
+			perf_install_in_context(ctx, sibling, cpu);
+			get_ctx(ctx);
+		}
+	}
+
 	perf_install_in_context(ctx, event, cpu);
 	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
@@ -5212,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open,
 	fd_install(event_fd, event_file);
 	return event_fd;
 
-err_free_put_context:
+err_context:
+	put_ctx(ctx);
+err_alloc:
 	free_event(event);
-err_put_context:
+err_task:
+	if (task)
+		put_task_struct(task);
+err_group_fd:
 	fput_light(group_file, fput_needed);
-	put_ctx(ctx);
 err_fd:
 	put_unused_fd(event_fd);
 	return err;
@@ -5227,32 +5717,31 @@ err_fd:
  *
  * @attr: attributes of the counter to create
  * @cpu: cpu in which the counter is bound
- * @pid: task to profile
+ * @task: task to profile (NULL for percpu)
  */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-				 pid_t pid,
+				 struct task_struct *task,
 				 perf_overflow_handler_t overflow_handler)
 {
-	struct perf_event *event;
 	struct perf_event_context *ctx;
+	struct perf_event *event;
 	int err;
 
 	/*
 	 * Get the target context (task or percpu):
 	 */
 
-	ctx = find_get_context(pid, cpu);
-	if (IS_ERR(ctx)) {
-		err = PTR_ERR(ctx);
-		goto err_exit;
-	}
-
-	event = perf_event_alloc(attr, cpu, ctx, NULL,
-				 NULL, overflow_handler, GFP_KERNEL);
+	event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
-		goto err_put_context;
+		goto err;
+	}
+
+	ctx = find_get_context(event->pmu, task, cpu);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto err_free;
 	}
 
 	event->filp = NULL;
@@ -5270,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
- err_put_context:
-	put_ctx(ctx);
- err_exit:
+err_free:
+	free_event(event);
+err:
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 
-/*
- * inherit a event from parent task to child task:
- */
-static struct perf_event *
-inherit_event(struct perf_event *parent_event,
-	      struct task_struct *parent,
-	      struct perf_event_context *parent_ctx,
-	      struct task_struct *child,
-	      struct perf_event *group_leader,
-	      struct perf_event_context *child_ctx)
-{
-	struct perf_event *child_event;
-
-	/*
-	 * Instead of creating recursive hierarchies of events,
-	 * we link inherited events back to the original parent,
-	 * which has a filp for sure, which we use as the reference
-	 * count:
-	 */
-	if (parent_event->parent)
-		parent_event = parent_event->parent;
-
-	child_event = perf_event_alloc(&parent_event->attr,
-					   parent_event->cpu, child_ctx,
-					   group_leader, parent_event,
-					   NULL, GFP_KERNEL);
-	if (IS_ERR(child_event))
-		return child_event;
-	get_ctx(child_ctx);
-
-	/*
-	 * Make the child state follow the state of the parent event,
-	 * not its attr.disabled bit.  We hold the parent's mutex,
-	 * so we won't race with perf_event_{en, dis}able_family.
-	 */
-	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
-		child_event->state = PERF_EVENT_STATE_INACTIVE;
-	else
-		child_event->state = PERF_EVENT_STATE_OFF;
-
-	if (parent_event->attr.freq) {
-		u64 sample_period = parent_event->hw.sample_period;
-		struct hw_perf_event *hwc = &child_event->hw;
-
-		hwc->sample_period = sample_period;
-		hwc->last_period   = sample_period;
-
-		local64_set(&hwc->period_left, sample_period);
-	}
-
-	child_event->overflow_handler = parent_event->overflow_handler;
-
-	/*
-	 * Link it up in the child's context:
-	 */
-	add_event_to_ctx(child_event, child_ctx);
-
-	/*
-	 * Get a reference to the parent filp - we will fput it
-	 * when the child event exits. This is safe to do because
-	 * we are in the parent and we know that the filp still
-	 * exists and has a nonzero count:
-	 */
-	atomic_long_inc(&parent_event->filp->f_count);
-
-	/*
-	 * Link this into the parent event's child list
-	 */
-	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-	mutex_lock(&parent_event->child_mutex);
-	list_add_tail(&child_event->child_list, &parent_event->child_list);
-	mutex_unlock(&parent_event->child_mutex);
-
-	return child_event;
-}
-
-static int inherit_group(struct perf_event *parent_event,
-	      struct task_struct *parent,
-	      struct perf_event_context *parent_ctx,
-	      struct task_struct *child,
-	      struct perf_event_context *child_ctx)
-{
-	struct perf_event *leader;
-	struct perf_event *sub;
-	struct perf_event *child_ctr;
-
-	leader = inherit_event(parent_event, parent, parent_ctx,
-				 child, NULL, child_ctx);
-	if (IS_ERR(leader))
-		return PTR_ERR(leader);
-	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
-		child_ctr = inherit_event(sub, parent, parent_ctx,
-					    child, leader, child_ctx);
-		if (IS_ERR(child_ctr))
-			return PTR_ERR(child_ctr);
-	}
-	return 0;
-}
-
 static void sync_child_event(struct perf_event *child_event,
 			       struct task_struct *child)
 {
@@ -5432,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event,
 	}
 }
 
-/*
- * When a child task exits, feed back event values to parent events.
- */
-void perf_event_exit_task(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event *child_event, *tmp;
 	struct perf_event_context *child_ctx;
 	unsigned long flags;
 
-	if (likely(!child->perf_event_ctxp)) {
+	if (likely(!child->perf_event_ctxp[ctxn])) {
 		perf_event_task(child, NULL, 0);
 		return;
 	}
@@ -5453,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child)
 	 * scheduled, so we are now safe from rescheduling changing
 	 * our context.
 	 */
-	child_ctx = child->perf_event_ctxp;
-	__perf_event_task_sched_out(child_ctx);
+	child_ctx = child->perf_event_ctxp[ctxn];
+	task_ctx_sched_out(child_ctx, EVENT_ALL);
 
 	/*
 	 * Take the context lock here so that if find_get_context is
@@ -5462,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	raw_spin_lock(&child_ctx->lock);
-	child->perf_event_ctxp = NULL;
+	child->perf_event_ctxp[ctxn] = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -5515,6 +5902,17 @@ again:
 	put_ctx(child_ctx);
 }
 
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+	int ctxn;
+
+	for_each_task_context_nr(ctxn)
+		perf_event_exit_task_context(child, ctxn);
+}
+
 static void perf_free_event(struct perf_event *event,
 			    struct perf_event_context *ctx)
 {
@@ -5536,48 +5934,166 @@ static void perf_free_event(struct perf_event *event,
 
 /*
  * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
+ * perf_event_init_task below, used by fork() in case of fail.
  */
 void perf_event_free_task(struct task_struct *task)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *ctx;
 	struct perf_event *event, *tmp;
+	int ctxn;
 
-	if (!ctx)
-		return;
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (!ctx)
+			continue;
 
-	mutex_lock(&ctx->mutex);
+		mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		perf_free_event(event, ctx);
+		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
+				group_entry)
+			perf_free_event(event, ctx);
 
-	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-				 group_entry)
-		perf_free_event(event, ctx);
+		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+				group_entry)
+			perf_free_event(event, ctx);
 
-	if (!list_empty(&ctx->pinned_groups) ||
-	    !list_empty(&ctx->flexible_groups))
-		goto again;
+		if (!list_empty(&ctx->pinned_groups) ||
+				!list_empty(&ctx->flexible_groups))
+			goto again;
 
-	mutex_unlock(&ctx->mutex);
+		mutex_unlock(&ctx->mutex);
 
-	put_ctx(ctx);
+		put_ctx(ctx);
+	}
+}
+
+void perf_event_delayed_put(struct task_struct *task)
+{
+	int ctxn;
+
+	for_each_task_context_nr(ctxn)
+		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+}
+
+/*
+ * inherit a event from parent task to child task:
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+	      struct task_struct *parent,
+	      struct perf_event_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_event *group_leader,
+	      struct perf_event_context *child_ctx)
+{
+	struct perf_event *child_event;
+	unsigned long flags;
+
+	/*
+	 * Instead of creating recursive hierarchies of events,
+	 * we link inherited events back to the original parent,
+	 * which has a filp for sure, which we use as the reference
+	 * count:
+	 */
+	if (parent_event->parent)
+		parent_event = parent_event->parent;
+
+	child_event = perf_event_alloc(&parent_event->attr,
+					   parent_event->cpu,
+					   child,
+					   group_leader, parent_event,
+					   NULL);
+	if (IS_ERR(child_event))
+		return child_event;
+	get_ctx(child_ctx);
+
+	/*
+	 * Make the child state follow the state of the parent event,
+	 * not its attr.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_event_{en, dis}able_family.
+	 */
+	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+		child_event->state = PERF_EVENT_STATE_INACTIVE;
+	else
+		child_event->state = PERF_EVENT_STATE_OFF;
+
+	if (parent_event->attr.freq) {
+		u64 sample_period = parent_event->hw.sample_period;
+		struct hw_perf_event *hwc = &child_event->hw;
+
+		hwc->sample_period = sample_period;
+		hwc->last_period   = sample_period;
+
+		local64_set(&hwc->period_left, sample_period);
+	}
+
+	child_event->ctx = child_ctx;
+	child_event->overflow_handler = parent_event->overflow_handler;
+
+	/*
+	 * Link it up in the child's context:
+	 */
+	raw_spin_lock_irqsave(&child_ctx->lock, flags);
+	add_event_to_ctx(child_event, child_ctx);
+	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+	/*
+	 * Get a reference to the parent filp - we will fput it
+	 * when the child event exits. This is safe to do because
+	 * we are in the parent and we know that the filp still
+	 * exists and has a nonzero count:
+	 */
+	atomic_long_inc(&parent_event->filp->f_count);
+
+	/*
+	 * Link this into the parent event's child list
+	 */
+	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+	mutex_lock(&parent_event->child_mutex);
+	list_add_tail(&child_event->child_list, &parent_event->child_list);
+	mutex_unlock(&parent_event->child_mutex);
+
+	return child_event;
+}
+
+static int inherit_group(struct perf_event *parent_event,
+	      struct task_struct *parent,
+	      struct perf_event_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_event_context *child_ctx)
+{
+	struct perf_event *leader;
+	struct perf_event *sub;
+	struct perf_event *child_ctr;
+
+	leader = inherit_event(parent_event, parent, parent_ctx,
+				 child, NULL, child_ctx);
+	if (IS_ERR(leader))
+		return PTR_ERR(leader);
+	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+		child_ctr = inherit_event(sub, parent, parent_ctx,
+					    child, leader, child_ctx);
+		if (IS_ERR(child_ctr))
+			return PTR_ERR(child_ctr);
+	}
+	return 0;
 }
 
 static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		   struct perf_event_context *parent_ctx,
-		   struct task_struct *child,
+		   struct task_struct *child, int ctxn,
 		   int *inherited_all)
 {
 	int ret;
-	struct perf_event_context *child_ctx = child->perf_event_ctxp;
+	struct perf_event_context *child_ctx;
 
 	if (!event->attr.inherit) {
 		*inherited_all = 0;
 		return 0;
 	}
 
+       	child_ctx = child->perf_event_ctxp[ctxn];
 	if (!child_ctx) {
 		/*
 		 * This is executed from the parent task context, so
@@ -5586,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		 * child.
 		 */
 
-		child_ctx = kzalloc(sizeof(struct perf_event_context),
-				    GFP_KERNEL);
+		child_ctx = alloc_perf_context(event->pmu, child);
 		if (!child_ctx)
 			return -ENOMEM;
 
-		__perf_event_init_context(child_ctx, child);
-		child->perf_event_ctxp = child_ctx;
-		get_task_struct(child);
+		child->perf_event_ctxp[ctxn] = child_ctx;
 	}
 
 	ret = inherit_group(event, parent, parent_ctx,
@@ -5605,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 	return ret;
 }
 
-
 /*
  * Initialize the perf_event context in task_struct
  */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
@@ -5618,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child)
 	int inherited_all = 1;
 	int ret = 0;
 
-	child->perf_event_ctxp = NULL;
+	child->perf_event_ctxp[ctxn] = NULL;
 
 	mutex_init(&child->perf_event_mutex);
 	INIT_LIST_HEAD(&child->perf_event_list);
 
-	if (likely(!parent->perf_event_ctxp))
+	if (likely(!parent->perf_event_ctxp[ctxn]))
 		return 0;
 
 	/*
 	 * If the parent's context is a clone, pin it so it won't get
 	 * swapped under us.
 	 */
-	parent_ctx = perf_pin_task_context(parent);
+	parent_ctx = perf_pin_task_context(parent, ctxn);
 
 	/*
 	 * No need to check if parent_ctx != NULL here; since we saw
@@ -5650,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child)
 	 * the list, not manipulating it:
 	 */
 	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx, child,
-					 &inherited_all);
+		ret = inherit_task_group(event, parent, parent_ctx,
+					 child, ctxn, &inherited_all);
 		if (ret)
 			break;
 	}
 
 	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx, child,
-					 &inherited_all);
+		ret = inherit_task_group(event, parent, parent_ctx,
+					 child, ctxn, &inherited_all);
 		if (ret)
 			break;
 	}
 
-	child_ctx = child->perf_event_ctxp;
+	child_ctx = child->perf_event_ctxp[ctxn];
 
 	if (child_ctx && inherited_all) {
 		/*
@@ -5692,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child)
 	return ret;
 }
 
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+	int ctxn, ret;
+
+	for_each_task_context_nr(ctxn) {
+		ret = perf_event_init_context(child, ctxn);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static void __init perf_event_init_all_cpus(void)
 {
+	struct swevent_htable *swhash;
 	int cpu;
-	struct perf_cpu_context *cpuctx;
 
 	for_each_possible_cpu(cpu) {
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		mutex_init(&cpuctx->hlist_mutex);
-		__perf_event_init_context(&cpuctx->ctx, NULL);
+		swhash = &per_cpu(swevent_htable, cpu);
+		mutex_init(&swhash->hlist_mutex);
+		INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
 	}
 }
 
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
-	struct perf_cpu_context *cpuctx;
-
-	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
-	spin_lock(&perf_resource_lock);
-	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
-	spin_unlock(&perf_resource_lock);
-
-	mutex_lock(&cpuctx->hlist_mutex);
-	if (cpuctx->hlist_refcount > 0) {
+	mutex_lock(&swhash->hlist_mutex);
+	if (swhash->hlist_refcount > 0) {
 		struct swevent_hlist *hlist;
 
-		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
-		WARN_ON_ONCE(!hlist);
-		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+		WARN_ON(!hlist);
+		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 	}
-	mutex_unlock(&cpuctx->hlist_mutex);
+	mutex_unlock(&swhash->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void __perf_event_exit_cpu(void *info)
+static void perf_pmu_rotate_stop(struct pmu *pmu)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = &cpuctx->ctx;
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+	WARN_ON(!irqs_disabled());
+
+	list_del_init(&cpuctx->rotation_list);
+}
+
+static void __perf_event_exit_context(void *__info)
+{
+	struct perf_event_context *ctx = __info;
 	struct perf_event *event, *tmp;
 
+	perf_pmu_rotate_stop(ctx->pmu);
+
 	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
 		__perf_event_remove_from_context(event);
 	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
 		__perf_event_remove_from_context(event);
 }
+
+static void perf_event_exit_cpu_context(int cpu)
+{
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+
+		mutex_lock(&ctx->mutex);
+		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+		mutex_unlock(&ctx->mutex);
+	}
+	srcu_read_unlock(&pmus_srcu, idx);
+}
+
 static void perf_event_exit_cpu(int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_event_context *ctx = &cpuctx->ctx;
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
-	mutex_lock(&cpuctx->hlist_mutex);
-	swevent_hlist_release(cpuctx);
-	mutex_unlock(&cpuctx->hlist_mutex);
+	mutex_lock(&swhash->hlist_mutex);
+	swevent_hlist_release(swhash);
+	mutex_unlock(&swhash->hlist_mutex);
 
-	mutex_lock(&ctx->mutex);
-	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
-	mutex_unlock(&ctx->mutex);
+	perf_event_exit_cpu_context(cpu);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
@@ -5778,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-	.notifier_call		= perf_cpu_notify,
-	.priority		= 20,
-};
-
 void __init perf_event_init(void)
 {
 	perf_event_init_all_cpus();
-	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
-	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-			(void *)(long)smp_processor_id());
-	register_cpu_notifier(&perf_cpu_nb);
-}
-
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
-					struct sysdev_class_attribute *attr,
-					char *buf)
-{
-	return sprintf(buf, "%d\n", perf_reserved_percpu);
-}
-
-static ssize_t
-perf_set_reserve_percpu(struct sysdev_class *class,
-			struct sysdev_class_attribute *attr,
-			const char *buf,
-			size_t count)
-{
-	struct perf_cpu_context *cpuctx;
-	unsigned long val;
-	int err, cpu, mpt;
-
-	err = strict_strtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val > perf_max_events)
-		return -EINVAL;
-
-	spin_lock(&perf_resource_lock);
-	perf_reserved_percpu = val;
-	for_each_online_cpu(cpu) {
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		raw_spin_lock_irq(&cpuctx->ctx.lock);
-		mpt = min(perf_max_events - cpuctx->ctx.nr_events,
-			  perf_max_events - perf_reserved_percpu);
-		cpuctx->max_pertask = mpt;
-		raw_spin_unlock_irq(&cpuctx->ctx.lock);
-	}
-	spin_unlock(&perf_resource_lock);
-
-	return count;
-}
-
-static ssize_t perf_show_overcommit(struct sysdev_class *class,
-				    struct sysdev_class_attribute *attr,
-				    char *buf)
-{
-	return sprintf(buf, "%d\n", perf_overcommit);
-}
-
-static ssize_t
-perf_set_overcommit(struct sysdev_class *class,
-		    struct sysdev_class_attribute *attr,
-		    const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	err = strict_strtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val > 1)
-		return -EINVAL;
-
-	spin_lock(&perf_resource_lock);
-	perf_overcommit = val;
-	spin_unlock(&perf_resource_lock);
-
-	return count;
-}
-
-static SYSDEV_CLASS_ATTR(
-				reserve_percpu,
-				0644,
-				perf_show_reserve_percpu,
-				perf_set_reserve_percpu
-			);
-
-static SYSDEV_CLASS_ATTR(
-				overcommit,
-				0644,
-				perf_show_overcommit,
-				perf_set_overcommit
-			);
-
-static struct attribute *perfclass_attrs[] = {
-	&attr_reserve_percpu.attr,
-	&attr_overcommit.attr,
-	NULL
-};
-
-static struct attribute_group perfclass_attr_group = {
-	.attrs			= perfclass_attrs,
-	.name			= "perf_events",
-};
-
-static int __init perf_event_sysfs_init(void)
-{
-	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
-				  &perfclass_attr_group);
+	init_srcu_struct(&pmus_srcu);
+	perf_pmu_register(&perf_swevent);
+	perf_pmu_register(&perf_cpu_clock);
+	perf_pmu_register(&perf_task_clock);
+	perf_tp_register();
+	perf_cpu_notifier(perf_cpu_notify);
 }
-device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d08..39b65b69584 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
 	struct task_struct *result = NULL;
 	if (pid) {
 		struct hlist_node *first;
-		first = rcu_dereference_check(pid->tasks[type].first,
+		first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
 					      rcu_read_lock_held() ||
 					      lockdep_tasklist_lock_is_held());
 		if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
  */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
+	rcu_lockdep_assert(rcu_read_lock_held());
 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fe465ac008..2531017795f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
  * provides serialisation for access to the entire console
  * driver system.
  */
-static DECLARE_MUTEX(console_sem);
+static DEFINE_SEMAPHORE(console_sem);
 struct console *console_drivers;
 EXPORT_SYMBOL_GPL(console_drivers);
 
@@ -556,7 +556,7 @@ static void zap_locks(void)
 	/* If a crash is occurring, make sure we can't deadlock */
 	spin_lock_init(&logbuf_lock);
 	/* And make sure that we print immediately */
-	init_MUTEX(&console_sem);
+	sema_init(&console_sem, 1);
 }
 
 #if defined(CONFIG_PRINTK_TIME)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb3..a23a57a976d 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
 EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
 
 /**
- * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
  *
  * Check for bottom half being disabled, which covers both the
  * CONFIG_PROVE_RCU and not cases.  Note that if someone uses
  * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
- * will show the situation.
+ * will show the situation.  This is useful for debug checks in functions
+ * that require that they be called within an RCU read-side critical
+ * section.
  *
  * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
  */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
 {
 	if (!debug_lockdep_rcu_enabled())
 		return 1;
-	return in_softirq();
+	return in_softirq() || irqs_disabled();
 }
 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
 
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be..d806735342a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+/* Forward declarations for rcutiny_plugin.h. */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void __call_rcu(struct rcu_head *head,
+		       void (*func)(struct rcu_head *rcu),
+		       struct rcu_ctrlblk *rcp);
+
+#include "rcutiny_plugin.h"
+
 #ifdef CONFIG_NO_HZ
 
 static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
 		rcu_sched_qs(cpu);
 	else if (!in_softirq())
 		rcu_bh_qs(cpu);
+	rcu_preempt_check_callbacks();
 }
 
 /*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	*rcp->donetail = NULL;
 	if (rcp->curtail == rcp->donetail)
 		rcp->curtail = &rcp->rcucblist;
+	rcu_preempt_remove_callbacks(rcp);
 	rcp->donetail = &rcp->rcucblist;
 	local_irq_restore(flags);
 
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	__rcu_process_callbacks(&rcu_sched_ctrlblk);
 	__rcu_process_callbacks(&rcu_bh_ctrlblk);
+	rcu_preempt_process_callbacks();
 }
 
 /*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
 }
 
 /*
- * Post an RCU callback to be invoked after the end of an RCU grace
+ * Post an RCU callback to be invoked after the end of an RCU-sched grace
  * period.  But since we have but one CPU, that would be after any
  * quiescent state.
  */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
 	__call_rcu(head, func, &rcu_sched_ctrlblk);
 }
-EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_sched);
 
 /*
  * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 
-void rcu_barrier(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
 void rcu_barrier_bh(void)
 {
 	struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
-
-#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc74..6ceca4f745f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
 /*
- * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
  * Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
- * Copyright IBM Corporation, 2009
+ * Copyright (c) 2010 Linaro
  *
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  */
 
+#ifdef CONFIG_TINY_PREEMPT_RCU
+
+#include <linux/delay.h>
+
+/* Global control variables for preemptible RCU. */
+struct rcu_preempt_ctrlblk {
+	struct rcu_ctrlblk rcb;	/* curtail: ->next ptr of last CB for GP. */
+	struct rcu_head **nexttail;
+				/* Tasks blocked in a preemptible RCU */
+				/*  read-side critical section while an */
+				/*  preemptible-RCU grace period is in */
+				/*  progress must wait for a later grace */
+				/*  period.  This pointer points to the */
+				/*  ->next pointer of the last task that */
+				/*  must wait for a later grace period, or */
+				/*  to &->rcb.rcucblist if there is no */
+				/*  such task. */
+	struct list_head blkd_tasks;
+				/* Tasks blocked in RCU read-side critical */
+				/*  section.  Tasks are placed at the head */
+				/*  of this list and age towards the tail. */
+	struct list_head *gp_tasks;
+				/* Pointer to the first task blocking the */
+				/*  current grace period, or NULL if there */
+				/*  is not such task. */
+	struct list_head *exp_tasks;
+				/* Pointer to first task blocking the */
+				/*  current expedited grace period, or NULL */
+				/*  if there is no such task.  If there */
+				/*  is no current expedited grace period, */
+				/*  then there cannot be any such task. */
+	u8 gpnum;		/* Current grace period. */
+	u8 gpcpu;		/* Last grace period blocked by the CPU. */
+	u8 completed;		/* Last grace period completed. */
+				/*  If all three are equal, RCU is idle. */
+};
+
+static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
+	.rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+	.rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+	.nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+	.blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+};
+
+static int rcu_preempted_readers_exp(void);
+static void rcu_report_exp_done(void);
+
+/*
+ * Return true if the CPU has not yet responded to the current grace period.
+ */
+static int rcu_cpu_blocking_cur_gp(void)
+{
+	return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Check for a running RCU reader.  Because there is only one CPU,
+ * there can be but one running RCU reader at a time.  ;-)
+ */
+static int rcu_preempt_running_reader(void)
+{
+	return current->rcu_read_lock_nesting;
+}
+
+/*
+ * Check for preempted RCU readers blocking any grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_any(void)
+{
+	return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
+}
+
+/*
+ * Check for preempted RCU readers blocking the current grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_cgp(void)
+{
+	return rcu_preempt_ctrlblk.gp_tasks != NULL;
+}
+
+/*
+ * Return true if another preemptible-RCU grace period is needed.
+ */
+static int rcu_preempt_needs_another_gp(void)
+{
+	return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
+}
+
+/*
+ * Return true if a preemptible-RCU grace period is in progress.
+ * The caller must disable hardirqs.
+ */
+static int rcu_preempt_gp_in_progress(void)
+{
+	return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU.  Note
+ * that this just means that the task currently running on the CPU is
+ * in a quiescent state.  There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
+ *
+ * Because this is a single-CPU implementation, the only way a grace
+ * period can end is if the CPU is in a quiescent state.  The reason is
+ * that a blocked preemptible-RCU reader can exit its critical section
+ * only if the CPU is running it at the time.  Therefore, when the
+ * last task blocking the current grace period exits its RCU read-side
+ * critical section, neither the CPU nor blocked tasks will be stopping
+ * the current grace period.  (In contrast, SMP implementations
+ * might have CPUs running in RCU read-side critical sections that
+ * block later grace periods -- but this is not possible given only
+ * one CPU.)
+ */
+static void rcu_preempt_cpu_qs(void)
+{
+	/* Record both CPU and task as having responded to current GP. */
+	rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
+	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+
+	/*
+	 * If there is no GP, or if blocked readers are still blocking GP,
+	 * then there is nothing more to do.
+	 */
+	if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+		return;
+
+	/* Advance callbacks. */
+	rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
+	rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
+	rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
+
+	/* If there are no blocked readers, next GP is done instantly. */
+	if (!rcu_preempt_blocked_readers_any())
+		rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
+
+	/* If there are done callbacks, make RCU_SOFTIRQ process them. */
+	if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Start a new RCU grace period if warranted.  Hard irqs must be disabled.
+ */
+static void rcu_preempt_start_gp(void)
+{
+	if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
+
+		/* Official start of GP. */
+		rcu_preempt_ctrlblk.gpnum++;
+
+		/* Any blocked RCU readers block new GP. */
+		if (rcu_preempt_blocked_readers_any())
+			rcu_preempt_ctrlblk.gp_tasks =
+				rcu_preempt_ctrlblk.blkd_tasks.next;
+
+		/* If there is no running reader, CPU is done with GP. */
+		if (!rcu_preempt_running_reader())
+			rcu_preempt_cpu_qs();
+	}
+}
+
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from.  If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * If the task started after the current grace period began, as recorded
+ * by ->gpcpu, we enqueue at the beginning of the list.  Otherwise
+ * before the element referenced by ->gp_tasks (or at the tail if
+ * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section.  Therefore, the current grace period
+ * cannot be permitted to complete until the ->gp_tasks pointer becomes
+ * NULL.
+ *
+ * Caller must disable preemption.
+ */
+void rcu_preempt_note_context_switch(void)
+{
+	struct task_struct *t = current;
+	unsigned long flags;
+
+	local_irq_save(flags); /* must exclude scheduler_tick(). */
+	if (rcu_preempt_running_reader() &&
+	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+		/* Possibly blocking in an RCU read-side critical section. */
+		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+
+		/*
+		 * If this CPU has already checked in, then this task
+		 * will hold up the next grace period rather than the
+		 * current grace period.  Queue the task accordingly.
+		 * If the task is queued for the current grace period
+		 * (i.e., this CPU has not yet passed through a quiescent
+		 * state for the current grace period), then as long
+		 * as that task remains queued, the current grace period
+		 * cannot end.
+		 */
+		list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
+		if (rcu_cpu_blocking_cur_gp())
+			rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
+	}
+
+	/*
+	 * Either we were not in an RCU read-side critical section to
+	 * begin with, or we have now recorded that critical section
+	 * globally.  Either way, we can now note a quiescent state
+	 * for this CPU.  Again, if we were in an RCU read-side critical
+	 * section, and if that critical section was blocking the current
+	 * grace period, then the fact that the task has been enqueued
+	 * means that current grace period continues to be blocked.
+	 */
+	rcu_preempt_cpu_qs();
+	local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+	current->rcu_read_lock_nesting++;
+	barrier();  /* needed if we ever invoke rcu_read_lock in rcutiny.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+	int empty;
+	int empty_exp;
+	unsigned long flags;
+	struct list_head *np;
+	int special;
+
+	/*
+	 * NMI handlers cannot block and cannot safely manipulate state.
+	 * They therefore cannot possibly be special, so just leave.
+	 */
+	if (in_nmi())
+		return;
+
+	local_irq_save(flags);
+
+	/*
+	 * If RCU core is waiting for this CPU to exit critical section,
+	 * let it know that we have done so.
+	 */
+	special = t->rcu_read_unlock_special;
+	if (special & RCU_READ_UNLOCK_NEED_QS)
+		rcu_preempt_cpu_qs();
+
+	/* Hardware IRQ handlers cannot block. */
+	if (in_irq()) {
+		local_irq_restore(flags);
+		return;
+	}
+
+	/* Clean up if blocked during RCU read-side critical section. */
+	if (special & RCU_READ_UNLOCK_BLOCKED) {
+		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+		/*
+		 * Remove this task from the ->blkd_tasks list and adjust
+		 * any pointers that might have been referencing it.
+		 */
+		empty = !rcu_preempt_blocked_readers_cgp();
+		empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
+		np = t->rcu_node_entry.next;
+		if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+			np = NULL;
+		list_del(&t->rcu_node_entry);
+		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
+			rcu_preempt_ctrlblk.gp_tasks = np;
+		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
+			rcu_preempt_ctrlblk.exp_tasks = np;
+		INIT_LIST_HEAD(&t->rcu_node_entry);
+
+		/*
+		 * If this was the last task on the current list, and if
+		 * we aren't waiting on the CPU, report the quiescent state
+		 * and start a new grace period if needed.
+		 */
+		if (!empty && !rcu_preempt_blocked_readers_cgp()) {
+			rcu_preempt_cpu_qs();
+			rcu_preempt_start_gp();
+		}
+
+		/*
+		 * If this was the last task on the expedited lists,
+		 * then we need wake up the waiting task.
+		 */
+		if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
+			rcu_report_exp_done();
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+	struct task_struct *t = current;
+
+	barrier();  /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
+	--t->rcu_read_lock_nesting;
+	barrier();  /* decrement before load of ->rcu_read_unlock_special */
+	if (t->rcu_read_lock_nesting == 0 &&
+	    unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+		rcu_read_unlock_special(t);
+#ifdef CONFIG_PROVE_LOCKING
+	WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
+ * Check for a quiescent state from the current CPU.  When a task blocks,
+ * the task is recorded in the rcu_preempt_ctrlblk structure, which is
+ * checked elsewhere.  This is called from the scheduling-clock interrupt.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+	struct task_struct *t = current;
+
+	if (rcu_preempt_gp_in_progress() &&
+	    (!rcu_preempt_running_reader() ||
+	     !rcu_cpu_blocking_cur_gp()))
+		rcu_preempt_cpu_qs();
+	if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
+	    rcu_preempt_ctrlblk.rcb.donetail)
+		raise_softirq(RCU_SOFTIRQ);
+	if (rcu_preempt_gp_in_progress() &&
+	    rcu_cpu_blocking_cur_gp() &&
+	    rcu_preempt_running_reader())
+		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+}
+
+/*
+ * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
+ * update, so this is invoked from __rcu_process_callbacks() to
+ * handle that case.  Of course, it is invoked for all flavors of
+ * RCU, but RCU callbacks can appear only on one of the lists, and
+ * neither ->nexttail nor ->donetail can possibly be NULL, so there
+ * is no need for an explicit check.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+	if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
+		rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
+}
+
+/*
+ * Process callbacks for preemptible RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+	__rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+}
+
+/*
+ * Queue a preemptible -RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	debug_rcu_head_queue(head);
+	head->func = func;
+	head->next = NULL;
+
+	local_irq_save(flags);
+	*rcu_preempt_ctrlblk.nexttail = head;
+	rcu_preempt_ctrlblk.nexttail = &head->next;
+	rcu_preempt_start_gp();  /* checks to see if GP needed. */
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+void rcu_barrier(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_rcu_head_on_stack(&rcu.head);
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (!rcu_scheduler_active)
+		return;
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+	WARN_ON_ONCE(rcu_preempt_running_reader());
+	if (!rcu_preempt_blocked_readers_any())
+		return;
+
+	/* Once we get past the fastpath checks, same code as rcu_barrier(). */
+	rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(void)
+{
+	return rcu_preempt_ctrlblk.exp_tasks != NULL;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.
+ */
+static void rcu_report_exp_done(void)
+{
+	wake_up(&sync_rcu_preempt_exp_wq);
+}
+
+/*
+ * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
+ * is to rely in the fact that there is but one CPU, and that it is
+ * illegal for a task to invoke synchronize_rcu_expedited() while in a
+ * preemptible-RCU read-side critical section.  Therefore, any such
+ * critical sections must correspond to blocked tasks, which must therefore
+ * be on the ->blkd_tasks list.  So just record the current head of the
+ * list in the ->exp_tasks pointer, and wait for all tasks including and
+ * after the task pointed to by ->exp_tasks to drain.
+ */
+void synchronize_rcu_expedited(void)
+{
+	unsigned long flags;
+	struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
+	unsigned long snap;
+
+	barrier(); /* ensure prior action seen before grace period. */
+
+	WARN_ON_ONCE(rcu_preempt_running_reader());
+
+	/*
+	 * Acquire lock so that there is only one preemptible RCU grace
+	 * period in flight.  Of course, if someone does the expedited
+	 * grace period for us while we are acquiring the lock, just leave.
+	 */
+	snap = sync_rcu_preempt_exp_count + 1;
+	mutex_lock(&sync_rcu_preempt_exp_mutex);
+	if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
+		goto unlock_mb_ret; /* Others did our work for us. */
+
+	local_irq_save(flags);
+
+	/*
+	 * All RCU readers have to already be on blkd_tasks because
+	 * we cannot legally be executing in an RCU read-side critical
+	 * section.
+	 */
+
+	/* Snapshot current head of ->blkd_tasks list. */
+	rpcp->exp_tasks = rpcp->blkd_tasks.next;
+	if (rpcp->exp_tasks == &rpcp->blkd_tasks)
+		rpcp->exp_tasks = NULL;
+	local_irq_restore(flags);
+
+	/* Wait for tail of ->blkd_tasks list to drain. */
+	if (rcu_preempted_readers_exp())
+		wait_event(sync_rcu_preempt_exp_wq,
+			   !rcu_preempted_readers_exp());
+
+	/* Clean up and exit. */
+	barrier(); /* ensure expedited GP seen before counter increment. */
+	sync_rcu_preempt_exp_count++;
+unlock_mb_ret:
+	mutex_unlock(&sync_rcu_preempt_exp_mutex);
+	barrier(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
+ */
+int rcu_preempt_needs_cpu(void)
+{
+	if (!rcu_preempt_running_reader())
+		rcu_preempt_cpu_qs();
+	return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
+}
+
+/*
+ * Check for a task exiting while in a preemptible -RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+	struct task_struct *t = current;
+
+	if (t->rcu_read_lock_nesting == 0)
+		return;
+	t->rcu_read_lock_nesting = 1;
+	rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to remove.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to process.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 #include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b..9d8e8fb2515 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
 };
 
 static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current;
+static struct rcu_torture __rcu *rcu_torture_current;
 static long rcu_torture_current_version;
 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
 static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 #define FULLSTOP_SHUTDOWN 1	/* System shutdown with rcutorture running. */
 #define FULLSTOP_RMMOD    2	/* Normal rmmod of rcutorture. */
 static int fullstop = FULLSTOP_RMMOD;
-DEFINE_MUTEX(fullstop_mutex);	/* Protect fullstop transitions and spawning */
-				/*  of kthreads. */
+/*
+ * Protect fullstop transitions and spawning of kthreads.
+ */
+static DEFINE_MUTEX(fullstop_mutex);
 
 /*
  * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
 		mdelay(longdelay_ms);
 	if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
 		udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+	if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+		preempt_schedule();  /* No QS if preempt_disable() in effect */
+#endif
 }
 
 static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
 	delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
 	if (!delay)
 		schedule_timeout_interruptible(longdelay);
+	else
+		rcu_read_delay(rrsp);
 }
 
 static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
 			continue;
 		rp->rtort_pipe_count = 0;
 		udelay(rcu_random(&rand) & 0x3ff);
-		old_rp = rcu_torture_current;
+		old_rp = rcu_dereference_check(rcu_torture_current,
+					       current == writer_task);
 		rp->rtort_mbtest = 1;
 		rcu_assign_pointer(rcu_torture_current, rp);
 		smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5..ccdc04c4798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
 module_param(qhimark, int, 0);
 module_param(qlowmark, int, 0);
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+module_param(rcu_cpu_stall_suppress, int, 0644);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
 static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
 static int rcu_pending(int cpu);
 
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 
-int rcu_cpu_stall_panicking __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly;
 
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	rcu_print_task_stall(rnp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
-	/* OK, time to rat on our buddy... */
-
+	/*
+	 * OK, time to rat on our buddy...
+	 * See Documentation/RCU/stallwarn.txt for info on how to debug
+	 * RCU CPU stall warnings.
+	 */
 	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
 	       rsp->name);
 	rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	unsigned long flags;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
+	/*
+	 * OK, time to rat on ourselves...
+	 * See Documentation/RCU/stallwarn.txt for info on how to debug
+	 * RCU CPU stall warnings.
+	 */
 	printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
 	       rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
 	trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 	long delta;
 	struct rcu_node *rnp;
 
-	if (rcu_cpu_stall_panicking)
+	if (rcu_cpu_stall_suppress)
 		return;
-	delta = jiffies - rsp->jiffies_stall;
+	delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
 	rnp = rdp->mynode;
-	if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+	if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
 
 		/* We haven't checked in, so go dump stack. */
 		print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 
 static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
 {
-	rcu_cpu_stall_panicking = 1;
+	rcu_cpu_stall_suppress = 1;
 	return NOTIFY_DONE;
 }
 
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+	rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+	rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+	rcu_preempt_stall_reset();
+}
+
 static struct notifier_block rcu_panic_block = {
 	.notifier_call = rcu_panic,
 };
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 }
 
+void rcu_cpu_stall_reset(void)
+{
+}
+
 static void __init check_cpu_stall_init(void)
 {
 }
@@ -712,7 +745,7 @@ static void
 rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
-	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
 {
 	int i;
-	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
 	if (rdp->nxtlist == NULL)
 		return;  /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
 	for (i = 0; i < RCU_NEXT_SIZE; i++)
 		rdp->nxttail[i] = &rdp->nxtlist;
 	rsp->orphan_qlen += rdp->qlen;
+	rdp->n_cbs_orphaned += rdp->qlen;
 	rdp->qlen = 0;
 	raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
 }
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 
 	raw_spin_lock_irqsave(&rsp->onofflock, flags);
-	rdp = rsp->rda[smp_processor_id()];
+	rdp = this_cpu_ptr(rsp->rda);
 	if (rsp->orphan_cbs_list == NULL) {
 		raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 		return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 	*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
 	rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
 	rdp->qlen += rsp->orphan_qlen;
+	rdp->n_cbs_adopted += rsp->orphan_qlen;
 	rsp->orphan_cbs_list = NULL;
 	rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
 	rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 	unsigned long flags;
 	unsigned long mask;
 	int need_report = 0;
-	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp;
 
 	/* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Update count, and requeue any remaining callbacks. */
 	rdp->qlen -= count;
+	rdp->n_cbs_invoked += count;
 	if (list != NULL) {
 		*tail = rdp->nxtlist;
 		rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
 		cpu = rnp->grplo;
 		bit = 1;
 		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
-			if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+			if ((rnp->qsmask & bit) != 0 &&
+			    f(per_cpu_ptr(rsp->rda, cpu)))
 				mask |= bit;
 		}
 		if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	 * a quiescent state betweentimes.
 	 */
 	local_irq_save(flags);
-	rdp = rsp->rda[smp_processor_id()];
+	rdp = this_cpu_ptr(rsp->rda);
 	rcu_process_gp_end(rsp, rdp);
 	check_for_new_grace_period(rsp, rdp);
 
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
 	int i;
-	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 {
 	unsigned long flags;
 	unsigned long mask;
-	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 /*
  * Helper function for rcu_init() that initializes one rcu_state structure.
  */
-static void __init rcu_init_one(struct rcu_state *rsp)
+static void __init rcu_init_one(struct rcu_state *rsp,
+		struct rcu_data __percpu *rda)
 {
 	static char *buf[] = { "rcu_node_level_0",
 			       "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 		}
 	}
 
+	rsp->rda = rda;
 	rnp = rsp->level[NUM_RCU_LVLS - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
 			rnp++;
-		rsp->rda[i]->mynode = rnp;
+		per_cpu_ptr(rsp->rda, i)->mynode = rnp;
 		rcu_boot_init_percpu_data(i, rsp);
 	}
 }
 
-/*
- * Helper macro for __rcu_init() and __rcu_init_preempt().  To be used
- * nowhere else!  Assigns leaf node pointers into each CPU's rcu_data
- * structure.
- */
-#define RCU_INIT_FLAVOR(rsp, rcu_data) \
-do { \
-	int i; \
-	\
-	for_each_possible_cpu(i) { \
-		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
-	} \
-	rcu_init_one(rsp); \
-} while (0)
-
 void __init rcu_init(void)
 {
 	int cpu;
 
 	rcu_bootup_announce();
-	RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
-	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 	__rcu_init_preempt();
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed..91d4170c5c1 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
 	long		qlen;		/* # of queued callbacks */
 	long		qlen_last_fqs_check;
 					/* qlen at last check for QS forcing */
+	unsigned long	n_cbs_invoked;	/* count of RCU cbs invoked. */
+	unsigned long	n_cbs_orphaned;	/* RCU cbs sent to orphanage. */
+	unsigned long	n_cbs_adopted;	/* RCU cbs adopted from orphanage. */
 	unsigned long	n_force_qs_snap;
 					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
 #define RCU_STALL_DELAY_DELTA	       0
 #endif
 
-#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_CHECK   (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
+					RCU_STALL_DELAY_DELTA)
 						/* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
 						/* for rsp->jiffies_stall */
 #define RCU_STALL_RAT_DELAY		2	/* Allow other CPUs time */
 						/*  to take at least one */
 						/*  scheduling clock irq */
 						/*  before ratting on them. */
 
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
+#define RCU_CPU_STALL_SUPPRESS_INIT 0
+#else
+#define RCU_CPU_STALL_SUPPRESS_INIT 1
+#endif
 
-#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
-#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 /*
  * RCU global state, including node hierarchy.  This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
 	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
 	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
 	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
-	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+	struct rcu_data __percpu *rda;		/* pointer of percu rcu_data. */
 
 	/* The following fields are guarded by the root rcu_node's lock. */
 
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static void rcu_print_task_stall(struct rcu_node *rnp);
+static void rcu_preempt_stall_reset(void);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d..71a4147473f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
 	printk(KERN_INFO
 	       "\tRCU-based detection of stalled CPUs is disabled.\n");
 #endif
-#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
 	printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
 #endif
 #if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
 	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
 
 		/* Possibly blocking in an RCU read-side critical section. */
-		rdp = rcu_preempt_state.rda[cpu];
+		rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
 		rnp = rdp->mynode;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
  */
 void __rcu_read_lock(void)
 {
-	ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+	current->rcu_read_lock_nesting++;
 	barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
 }
 EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
 	struct task_struct *t = current;
 
 	barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
-	if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+	--t->rcu_read_lock_nesting;
+	barrier();  /* decrement before load of ->rcu_read_unlock_special */
+	if (t->rcu_read_lock_nesting == 0 &&
 	    unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
 		rcu_read_unlock_special(t);
 #ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
 	}
 }
 
+/*
+ * Suppress preemptible RCU's CPU stall warnings by pushing the
+ * time of the next stall-warning message comfortably far into the
+ * future.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+	rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+}
+
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 /*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
  *
  * Control will return to the caller some time after a full grace
  * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+ * read-side critical sections have completed.  Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting.  RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
  */
 void synchronize_rcu(void)
 {
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
  */
 static void __init __rcu_init_preempt(void)
 {
-	RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 }
 
 /*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
 {
 }
 
+/*
+ * Because preemptible RCU does not exist, there is no need to suppress
+ * its CPU stall warnings.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+}
+
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 /*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
 }
 
 /*
- * In classic RCU, call_rcu() is just call_rcu_sched().
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-	call_rcu_sched(head, func);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/*
  * Wait for an rcu-preempt grace period, but make it happen quickly.
  * But because preemptable RCU does not exist, map to rcu-sched.
  */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738..d15430b9d12 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
-	seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+	seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+	seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
+		   rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
 }
 
 #define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
-	seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+	seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+	seq_printf(m, ",%lu,%lu,%lu\n",
+		   rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
 }
 
 static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
 #ifdef CONFIG_NO_HZ
 	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
-	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	seq_puts(m, "\"rcu_preempt:\"\n");
 	PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
 	struct rcu_data *rdp;
 
 	for_each_possible_cpu(cpu) {
-		rdp = rsp->rda[cpu];
+		rdp = per_cpu_ptr(rsp->rda, cpu);
 		if (rdp->beenonline)
 			print_one_rcu_pending(m, rdp);
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb9083..d42992bccdf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
 	 */
 	cpumask_var_t rto_mask;
 	atomic_t rto_count;
-#ifdef CONFIG_SMP
 	struct cpupri cpupri;
-#endif
 };
 
 /*
@@ -437,7 +435,7 @@ struct root_domain {
  */
 static struct root_domain def_root_domain;
 
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	struct task_struct *curr, *idle;
+	struct task_struct *curr, *idle, *stop;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
 	u64 clock;
+	u64 clock_task;
 
 	atomic_t nr_iowait;
 
@@ -520,6 +519,10 @@ struct rq {
 	u64 avg_idle;
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif
+
 	/* calc_load related fields */
 	unsigned long calc_load_update;
 	long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
 inline void update_rq_clock(struct rq *rq)
 {
-	if (!rq->skip_clock_update)
-		rq->clock = sched_clock_cpu(cpu_of(rq));
+	if (!rq->skip_clock_update) {
+		int cpu = cpu_of(rq);
+		u64 irq_time;
+
+		rq->clock = sched_clock_cpu(cpu);
+		irq_time = irq_time_cpu(cpu);
+		if (rq->clock - irq_time > rq->clock_task)
+			rq->clock_task = rq->clock - irq_time;
+
+		sched_irq_time_avg_update(rq, irq_time);
+	}
 }
 
 /*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
 	char buf[64];
-	char *cmp = buf;
+	char *cmp;
 	int neg = 0;
 	int i;
 
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 		return -EFAULT;
 
 	buf[cnt] = 0;
+	cmp = strstrip(buf);
 
 	if (strncmp(buf, "NO_", 3) == 0) {
 		neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	}
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		int len = strlen(sched_feat_names[i]);
-
-		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+		if (strcmp(cmp, sched_feat_names[i]) == 0) {
 			if (neg)
 				sysctl_sched_features &= ~(1UL << i);
 			else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 static const struct sched_class rt_sched_class;
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-	if (task_has_rt_policy(p)) {
-		p->se.load.weight = 0;
-		p->se.load.inv_weight = WMULT_CONST;
-		return;
-	}
-
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 	dec_nr_running(rq);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+	if (!sched_clock_irqtime)
+		return 0;
+
+	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+	unsigned long flags;
+	int cpu;
+	u64 now, delta;
+
+	if (!sched_clock_irqtime)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	now = sched_clock_cpu(cpu);
+	delta = now - per_cpu(irq_start_time, cpu);
+	per_cpu(irq_start_time, cpu) = now;
+	/*
+	 * We do not account for softirq time from ksoftirqd here.
+	 * We want to continue accounting softirq time to ksoftirqd thread
+	 * in that case, so as not to confuse scheduler with a special task
+	 * that do not consume any time, but still wants to run.
+	 */
+	if (hardirq_count())
+		per_cpu(cpu_hardirq_time, cpu) += delta;
+	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+		per_cpu(cpu_softirq_time, cpu) += delta;
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+	if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+		u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+		rq->prev_irq_time = curr_irq_time;
+		sched_rt_avg_update(rq, delta_irq);
+	}
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+	return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, its something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+		stop->sched_class = &stop_sched_class;
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal scheduling class so that
+		 * it can die in pieces.
+		 */
+		old_stop->sched_class = &rt_sched_class;
+	}
+}
+
 /*
  * __normal_prio - return the priority that is based on the static prio
  */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
+	if (unlikely(p->policy == SCHED_IDLE))
+		return 0;
+
 	/*
 	 * Buddy candidates are cache hot:
 	 */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
-	if (likely(!mm)) {
+	if (!mm) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (likely(!prev->mm)) {
+	if (!prev->mm) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 
 	if (task_current(rq, p)) {
 		update_rq_clock(rq);
-		ns = rq->clock - p->se.exec_start;
+		ns = rq->clock_task - p->se.exec_start;
 		if ((s64)ns < 0)
 			ns = 0;
 	}
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
-	else if (softirq_count())
+	else if (in_serving_softirq())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
 	else
 		cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3584,7 +3714,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
-	perf_event_task_tick(curr);
+	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
 			return p;
 	}
 
-	class = sched_class_highest;
-	for ( ; ; ) {
+	for_each_class(class) {
 		p = class->pick_next_task(rq);
 		if (p)
 			return p;
-		/*
-		 * Will never be NULL as the idle class always
-		 * returns a non-NULL p:
-		 */
-		class = class->next;
 	}
+
+	BUG(); /* the idle class will always have a runnable task */
 }
 
 /*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	rq = task_rq_lock(p, &flags);
 
+	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	on_rq = p->se.on_rq;
@@ -4645,7 +4772,7 @@ recheck:
 	}
 
 	if (user) {
-		retval = security_task_setscheduler(p, policy, param);
+		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
 	}
@@ -4661,6 +4788,15 @@ recheck:
 	 */
 	rq = __task_rq_lock(p);
 
+	/*
+	 * Changing the policy of the stop threads its a very bad idea
+	 */
+	if (p == rq->stop) {
+		__task_rq_unlock(rq);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return -EINVAL;
+	}
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
 		/*
@@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
-	retval = security_task_setscheduler(p, 0, NULL);
+	retval = security_task_setscheduler(p);
 	if (retval)
 		goto out_unlock;
 
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
 	retval = set_cpus_allowed_ptr(p, new_mask);
 
 	if (!retval) {
@@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->se.exec_start = sched_clock();
 
 	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+	/*
+	 * We're having a chicken and egg problem, even though we are
+	 * holding rq->lock, the cpu isn't yet set to this cpu so the
+	 * lockdep check in task_group() will fail.
+	 *
+	 * Similar case to sched_fork(). / Alternatively we could
+	 * use task_rq_lock() here and obtain the other rq->lock.
+	 *
+	 * Silence PROVE_RCU
+	 */
+	rcu_read_lock();
 	__set_task_cpu(idle, cpu);
+	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6514,6 +6662,7 @@ struct s_data {
 	cpumask_var_t		nodemask;
 	cpumask_var_t		this_sibling_map;
 	cpumask_var_t		this_core_map;
+	cpumask_var_t		this_book_map;
 	cpumask_var_t		send_covered;
 	cpumask_var_t		tmpmask;
 	struct sched_group	**sched_group_nodes;
@@ -6525,6 +6674,7 @@ enum s_alloc {
 	sa_rootdomain,
 	sa_tmpmask,
 	sa_send_covered,
+	sa_this_book_map,
 	sa_this_core_map,
 	sa_this_sibling_map,
 	sa_nodemask,
@@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
 
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
-
+#ifdef CONFIG_SCHED_SMT
 	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 	group = cpumask_first(mask);
+#else
+	group = cpu;
+#endif
 	if (sg)
 		*sg = &per_cpu(sched_group_core, group).sg;
 	return group;
 }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
 static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-		  struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
 {
+	int group = cpu;
+#ifdef CONFIG_SCHED_MC
+	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#endif
 	if (sg)
-		*sg = &per_cpu(sched_group_core, cpu).sg;
-	return cpu;
+		*sg = &per_cpu(sched_group_book, group).sg;
+	return group;
 }
-#endif
+#endif /* CONFIG_SCHED_BOOK */
 
 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+	cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
 	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_MC
  SD_INIT_FUNC(MC)
 #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
 
 static int default_relax_domain_level = -1;
 
@@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 		free_cpumask_var(d->tmpmask); /* fall through */
 	case sa_send_covered:
 		free_cpumask_var(d->send_covered); /* fall through */
+	case sa_this_book_map:
+		free_cpumask_var(d->this_book_map); /* fall through */
 	case sa_this_core_map:
 		free_cpumask_var(d->this_core_map); /* fall through */
 	case sa_this_sibling_map:
@@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 		return sa_nodemask;
 	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
 		return sa_this_sibling_map;
-	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
 		return sa_this_core_map;
+	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+		return sa_this_book_map;
 	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
 		return sa_send_covered;
 	d->rd = alloc_rootdomain();
@@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
 	return sd;
 }
 
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+	sd = &per_cpu(book_domains, i).sd;
+	SD_INIT(sd, BOOK);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+	return sd;
+}
+
 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
 	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 	struct sched_domain *parent, int i)
@@ -7066,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 						d->send_covered, d->tmpmask);
 		break;
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	case SD_LV_BOOK: /* set up book groups */
+		cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+		if (cpu == cpumask_first(d->this_book_map))
+			init_sched_build_groups(d->this_book_map, cpu_map,
+						&cpu_to_book_group,
+						d->send_covered, d->tmpmask);
+		break;
+#endif
 	case SD_LV_CPU: /* set up physical groups */
 		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
 		if (!cpumask_empty(d->nodemask))
@@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 	}
 
 	for_each_cpu(i, cpu_map) {
 		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+		build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
 		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 	}
 
@@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		init_sched_groups_power(i, sd);
 	}
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	for_each_cpu(i, cpu_map) {
+		sd = &per_cpu(book_domains, i).sd;
+		init_sched_groups_power(i, sd);
+	}
+#endif
 
 	for_each_cpu(i, cpu_map) {
 		sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
 		sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+		sd = &per_cpu(book_domains, i).sd;
 #else
 		sd = &per_cpu(phys_domains, i).sd;
 #endif
@@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
- err_free_rq:
+err_free_rq:
 	kfree(cfs_rq);
- err:
+err:
 	return 0;
 }
 
@@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
- err_free_rq:
+err_free_rq:
 	kfree(rt_rq);
- err:
+err:
 	return 0;
 }
 
@@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674ca49..933f3d1b62e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_of(cfs_rq)->clock;
+	u64 now = rq_of(cfs_rq)->clock_task;
 	unsigned long delta_exec;
 
 	if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	/*
 	 * We are starting a new run period:
 	 */
-	se->exec_start = rq_of(cfs_rq)->clock;
+	se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 
 /**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
 	check_preempt_curr(this_rq, p, 0);
+
+	/* re-arm NEWIDLE balancing when moving tasks */
+	src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+	this_rq->idle_stamp = 0;
 }
 
 /*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, rq->clock, sd);
+	tsk_cache_hot = task_hot(p, rq->clock_task, sd);
 	if (!tsk_cache_hot ||
 		sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
 	unsigned long this_load;
 	unsigned long this_load_per_task;
 	unsigned long this_nr_running;
+	unsigned long this_has_capacity;
 
 	/* Statistics of the busiest group */
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
 	unsigned long busiest_group_capacity;
+	unsigned long busiest_has_capacity;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long group_capacity;
 	int group_imb; /* Is there an imbalance in the group ? */
+	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
 /**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
 	u64 total, available;
 
 	total = sched_avg_period() + (rq->clock - rq->age_stamp);
-	available = total - rq->rt_avg;
+
+	if (unlikely(total < rq->rt_avg)) {
+		/* Ensures that power won't end up being negative */
+		available = 0;
+	} else {
+		available = total - rq->rt_avg;
+	}
 
 	if (unlikely((s64)total < SCHED_LOAD_SCALE))
 		total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
-	unsigned long load, max_cpu_load, min_cpu_load;
+	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
 	int i;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
 	unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	/* Tally up the load of all CPUs in the group */
 	max_cpu_load = 0;
 	min_cpu_load = ~0UL;
+	max_nr_running = 0;
 
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
+			if (load > max_cpu_load) {
 				max_cpu_load = load;
+				max_nr_running = rq->nr_running;
+			}
 			if (min_cpu_load > load)
 				min_cpu_load = load;
 		}
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity =
-		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
+
+	if (sgs->group_capacity > sgs->sum_nr_running)
+		sgs->group_has_capacity = 1;
 }
 
 /**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		/*
 		 * In case the child domain prefers tasks go to siblings
 		 * first, lower the sg capacity to one so that we'll try
-		 * and move all the excess tasks away.
+		 * and move all the excess tasks away. We lower the capacity
+		 * of a group only if the local group has the capacity to fit
+		 * these excess tasks, i.e. nr_running < group_capacity. The
+		 * extra check prevents the case where you always pull from the
+		 * heaviest group when it is already under-utilized (possible
+		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling)
+		if (prefer_sibling && !local_group && sds->this_has_capacity)
 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
 		if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
+			sds->this_has_capacity = sgs.group_has_capacity;
 		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
 			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->busiest_has_capacity = sgs.group_has_capacity;
 			sds->group_imb = sgs.group_imb;
 		}
 
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		return fix_small_imbalance(sds, this_cpu, imbalance);
 
 }
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * 4) This group is more busy than the avg busieness at this
 	 *    sched_domain.
 	 * 5) The imbalance is within the specified limit.
+	 *
+	 * Note: when doing newidle balance, if the local group has excess
+	 * capacity (i.e. nr_running < group_capacity) and the busiest group
+	 * does not have any capacity, we force a load balance to pull tasks
+	 * to the local group. In this case, we skip past checks 3, 4 and 5.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+			!sds.busiest_has_capacity)
+		goto force_balance;
+
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
+force_balance:
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
 
 	if (!ld_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
-		sd->nr_balance_failed++;
+		/*
+		 * Increment the failure counter only on periodic balance.
+		 * We do not want newidle balance, which can be very
+		 * frequent, pollute the failure counter causing
+		 * excessive cache_hot migrations and active balances.
+		 */
+		if (idle != CPU_NEWLY_IDLE)
+			sd->nr_balance_failed++;
 
 		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
 					this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task) {
-			this_rq->idle_stamp = 0;
+		if (pulled_task)
 			break;
-		}
 	}
 
 	raw_spin_lock(&this_rq->lock);
@@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
 
 	update_rq_clock(rq);
 
-	if (unlikely(task_cpu(p) != this_cpu))
+	if (unlikely(task_cpu(p) != this_cpu)) {
+		rcu_read_lock();
 		__set_task_cpu(p, this_cpu);
+		rcu_read_unlock();
+	}
 
 	update_curr(cfs_rq);
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3e..185f920ec1a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
  * release the lock. Decreases scheduling overhead.
  */
 SCHED_FEAT(OWNER_SPIN, 1)
+
+/*
+ * Decrement CPU power based on irq activity
+ */
+SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67..bea7d79f7e9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
 	if (!task_has_rt_policy(curr))
 		return;
 
-	delta_exec = rq->clock - curr->se.exec_start;
+	delta_exec = rq->clock_task - curr->se.exec_start;
 	if (unlikely((s64)delta_exec < 0))
 		delta_exec = 0;
 
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.sum_exec_runtime += delta_exec;
 	account_group_exec_runtime(curr, delta_exec);
 
-	curr->se.exec_start = rq->clock;
+	curr->se.exec_start = rq->clock_task;
 	cpuacct_charge(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 	 * runqueue. Otherwise simply start this RT task
 	 * on its current runqueue.
 	 *
-	 * We want to avoid overloading runqueues. Even if
-	 * the RT task is of higher priority than the current RT task.
-	 * RT tasks behave differently than other tasks. If
-	 * one gets preempted, we try to push it off to another queue.
-	 * So trying to keep a preempting RT task on the same
-	 * cache hot CPU will force the running RT task to
-	 * a cold CPU. So we waste all the cache for the lower
-	 * RT task in hopes of saving some of a RT task
-	 * that is just being woken and probably will have
-	 * cold cache anyway.
+	 * We want to avoid overloading runqueues. If the woken
+	 * task is a higher priority, then it will stay on this CPU
+	 * and the lower prio task should be moved to another CPU.
+	 * Even though this will probably make the lower prio task
+	 * lose its cache, we do not want to bounce a higher task
+	 * around just because it gave up its CPU, perhaps for a
+	 * lock?
+	 *
+	 * For equal prio tasks, we just let the scheduler sort it out.
 	 */
 	if (unlikely(rt_task(rq->curr)) &&
+	    (rq->curr->rt.nr_cpus_allowed < 2 ||
+	     rq->curr->prio < p->prio) &&
 	    (p->rt.nr_cpus_allowed > 1)) {
 		int cpu = find_lowest_rq(p);
 
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	} while (rt_rq);
 
 	p = rt_task_of(rt_se);
-	p->se.exec_start = rq->clock;
+	p->se.exec_start = rq->clock_task;
 
 	return p;
 }
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 	for_each_leaf_rt_rq(rt_rq, rq) {
 		array = &rt_rq->active;
 		idx = sched_find_first_bit(array->bitmap);
- next_idx:
+next_idx:
 		if (idx >= MAX_RT_PRIO)
 			continue;
 		if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
 	if (!next_task)
 		return 0;
 
- retry:
+retry:
 	if (unlikely(next_task == rq->curr)) {
 		WARN_ON(1);
 		return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 * but possible)
 			 */
 		}
- skip:
+skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
 	    has_pushable_tasks(rq) &&
-	    p->rt.nr_cpus_allowed > 1)
+	    p->rt.nr_cpus_allowed > 1 &&
+	    rt_task(rq->curr) &&
+	    (rq->curr->rt.nr_cpus_allowed < 2 ||
+	     rq->curr->prio < p->prio))
 		push_rt_tasks(rq);
 }
 
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
 
-	p->se.exec_start = rq->clock;
+	p->se.exec_start = rq->clock_task;
 
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 00000000000..45bddc0c104
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
+/*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system, it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_stop(struct rq *rq, struct task_struct *p,
+		    int sd_flag, int flags)
+{
+	return task_cpu(p); /* stop tasks as never migrate */
+}
+#endif /* CONFIG_SMP */
+
+static void
+check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+	resched_task(rq->curr); /* we preempt everything */
+}
+
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+	struct task_struct *stop = rq->stop;
+
+	if (stop && stop->state == TASK_RUNNING)
+		return stop;
+
+	return NULL;
+}
+
+static void
+enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+
+static void
+dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+
+static void yield_task_stop(struct rq *rq)
+{
+	BUG(); /* the stop task should never yield, its pointless. */
+}
+
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_stop(struct rq *rq)
+{
+}
+
+static void switched_to_stop(struct rq *rq, struct task_struct *p,
+			     int running)
+{
+	BUG(); /* its impossible to change to this class */
+}
+
+static void prio_changed_stop(struct rq *rq, struct task_struct *p,
+			      int oldprio, int running)
+{
+	BUG(); /* how!?, what priority? */
+}
+
+static unsigned int
+get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+{
+	return 0;
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+static const struct sched_class stop_sched_class = {
+	.next			= &rt_sched_class,
+
+	.enqueue_task		= enqueue_task_stop,
+	.dequeue_task		= dequeue_task_stop,
+	.yield_task		= yield_task_stop,
+
+	.check_preempt_curr	= check_preempt_curr_stop,
+
+	.pick_next_task		= pick_next_task_stop,
+	.put_prev_task		= put_prev_task_stop,
+
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_stop,
+#endif
+
+	.set_curr_task          = set_curr_task_stop,
+	.task_tick		= task_tick_stop,
+
+	.get_rr_interval	= get_rr_interval_stop,
+
+	.prio_changed		= prio_changed_stop,
+	.switched_to		= switched_to_stop,
+
+	/* no .task_new for stop tasks */
+};
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73..79ee8f1fc0e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
 }
 
 /*
+ * preempt_count and SOFTIRQ_OFFSET usage:
+ * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
+ *   softirq processing.
+ * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ *   on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+
+/*
  * This one is for softirq.c-internal use,
  * where hardirqs are disabled legitimately:
  */
 #ifdef CONFIG_TRACE_IRQFLAGS
-static void __local_bh_disable(unsigned long ip)
+static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 {
 	unsigned long flags;
 
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
 	 * We must manually increment preempt_count here and manually
 	 * call the trace_preempt_off later.
 	 */
-	preempt_count() += SOFTIRQ_OFFSET;
+	preempt_count() += cnt;
 	/*
 	 * Were softirqs turned off above:
 	 */
-	if (softirq_count() == SOFTIRQ_OFFSET)
+	if (softirq_count() == cnt)
 		trace_softirqs_off(ip);
 	raw_local_irq_restore(flags);
 
-	if (preempt_count() == SOFTIRQ_OFFSET)
+	if (preempt_count() == cnt)
 		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 #else /* !CONFIG_TRACE_IRQFLAGS */
-static inline void __local_bh_disable(unsigned long ip)
+static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
 {
-	add_preempt_count(SOFTIRQ_OFFSET);
+	add_preempt_count(cnt);
 	barrier();
 }
 #endif /* CONFIG_TRACE_IRQFLAGS */
 
 void local_bh_disable(void)
 {
-	__local_bh_disable((unsigned long)__builtin_return_address(0));
+	__local_bh_disable((unsigned long)__builtin_return_address(0),
+				SOFTIRQ_DISABLE_OFFSET);
 }
 
 EXPORT_SYMBOL(local_bh_disable);
 
+static void __local_bh_enable(unsigned int cnt)
+{
+	WARN_ON_ONCE(in_irq());
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (softirq_count() == cnt)
+		trace_softirqs_on((unsigned long)__builtin_return_address(0));
+	sub_preempt_count(cnt);
+}
+
 /*
  * Special-case - softirqs can safely be enabled in
  * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
  */
 void _local_bh_enable(void)
 {
-	WARN_ON_ONCE(in_irq());
-	WARN_ON_ONCE(!irqs_disabled());
-
-	if (softirq_count() == SOFTIRQ_OFFSET)
-		trace_softirqs_on((unsigned long)__builtin_return_address(0));
-	sub_preempt_count(SOFTIRQ_OFFSET);
+	__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
 }
 
 EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
 	/*
 	 * Are softirqs going to be turned on now:
 	 */
-	if (softirq_count() == SOFTIRQ_OFFSET)
+	if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
 		trace_softirqs_on(ip);
 	/*
 	 * Keep preemption disabled until we are done with
 	 * softirq processing:
  	 */
- 	sub_preempt_count(SOFTIRQ_OFFSET - 1);
+	sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
 
 	if (unlikely(!in_interrupt() && local_softirq_pending()))
 		do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
 	pending = local_softirq_pending();
 	account_system_vtime(current);
 
-	__local_bh_disable((unsigned long)__builtin_return_address(0));
+	__local_bh_disable((unsigned long)__builtin_return_address(0),
+				SOFTIRQ_OFFSET);
 	lockdep_softirq_enter();
 
 	cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
 	lockdep_softirq_exit();
 
 	account_system_vtime(current);
-	_local_bh_enable();
+	__local_bh_enable(SOFTIRQ_OFFSET);
 }
 
 #ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
 
 	rcu_irq_enter();
 	if (idle_cpu(cpu) && !in_interrupt()) {
-		__irq_enter();
+		/*
+		 * Prevent raise_softirq from needlessly waking up ksoftirqd
+		 * here, as softirq will be serviced on return from interrupt.
+		 */
+		local_bh_disable();
 		tick_check_idle(cpu);
-	} else
-		__irq_enter();
+		_local_bh_enable();
+	}
+
+	__irq_enter();
 }
 
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
 {
 	set_current_state(TASK_INTERRUPTIBLE);
 
+	current->flags |= PF_KSOFTIRQD;
 	while (!kthread_should_stop()) {
 		preempt_disable();
 		if (!local_softirq_pending()) {
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd50..c71e0750053 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
 int __init_srcu_struct(struct srcu_struct *sp, const char *name,
 		       struct lock_class_key *key)
 {
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/* Don't re-initialize a lock while it is held. */
 	debug_check_no_locks_freed((void *)sp, sizeof(*sp));
 	lockdep_init_map(&sp->dep_map, name, key, 0);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 	return init_srcu_struct_fields(sp);
 }
 EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb2512..090c28812ce 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
 	goto repeat;
 }
 
+extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+
 /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
 static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 					   unsigned long action, void *hcpu)
 {
-	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	unsigned int cpu = (unsigned long)hcpu;
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 	struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 				   cpu);
 		if (IS_ERR(p))
 			return NOTIFY_BAD;
-		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 		get_task_struct(p);
+		kthread_bind(p, cpu);
+		sched_set_stop_task(cpu, p);
 		stopper->thread = p;
 		break;
 
 	case CPU_ONLINE:
-		kthread_bind(stopper->thread, cpu);
 		/* strictly unnecessary, as first user will wake it */
 		wake_up_process(stopper->thread);
 		/* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 	{
 		struct cpu_stop_work *work;
 
+		sched_set_stop_task(cpu, NULL);
 		/* kill the stopper */
 		kthread_stop(stopper->thread);
 		/* drain remaining works */
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19..f8b11a28317 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
 	int ret;
 	struct kprobe *kps[2] = {&kp, &kp2};
 
-	kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+	/* addr and flags should be cleard for reusing kprobe. */
+	kp.addr = NULL;
+	kp.flags = 0;
 	ret = register_kprobes(kps, 2);
 	if (ret < 0) {
 		printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
 	int ret;
 	struct jprobe *jps[2] = {&jp, &jp2};
 
-	jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+	/* addr and flags should be cleard for reusing kprobe. */
+	jp.kp.addr = NULL;
+	jp.kp.flags = 0;
 	ret = register_jprobes(jps, 2);
 	if (ret < 0) {
 		printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
 	int ret;
 	struct kretprobe *rps[2] = {&rp, &rp2};
 
-	rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+	/* addr and flags should be cleard for reusing kprobe. */
+	rp.kp.addr = NULL;
+	rp.kp.flags = 0;
 	ret = register_kretprobes(rps, 2);
 	if (ret < 0) {
 		printk(KERN_ERR "Kprobe smoke test failed: "
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade..68a9ae7679b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
 	run_local_timers();
 	rcu_check_callbacks(cpu, user_tick);
 	printk_tick();
-	perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+	if (in_irq())
+		irq_work_run();
+#endif
 	scheduler_tick();
 	run_posix_cpu_timers(p);
 }
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea5..e550d2eda1d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
 	help
 	  See Documentation/trace/ftrace-design.txt
 
+config HAVE_C_RECORDMCOUNT
+	bool
+	help
+	  C version of recordmcount available?
+
 config TRACER_MAX_TRACE
 	bool
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe..ebd80d50c47 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
 	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
-	FTRACE_ENABLE_MCOUNT		= (1 << 3),
-	FTRACE_DISABLE_MCOUNT		= (1 << 4),
-	FTRACE_START_FUNC_RET		= (1 << 5),
-	FTRACE_STOP_FUNC_RET		= (1 << 6),
+	FTRACE_START_FUNC_RET		= (1 << 3),
+	FTRACE_STOP_FUNC_RET		= (1 << 4),
 };
 
 static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
 
 static void ftrace_startup_sysctl(void)
 {
-	int command = FTRACE_ENABLE_MCOUNT;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
 	saved_ftrace_func = NULL;
 	/* ftrace_start_up is true if we want ftrace running */
 	if (ftrace_start_up)
-		command |= FTRACE_ENABLE_CALLS;
-
-	ftrace_run_update_code(command);
+		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
 }
 
 static void ftrace_shutdown_sysctl(void)
 {
-	int command = FTRACE_DISABLE_MCOUNT;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
 	/* ftrace_start_up is true if ftrace is running */
 	if (ftrace_start_up)
-		command |= FTRACE_DISABLE_CALLS;
-
-	ftrace_run_update_code(command);
+		ftrace_run_update_code(FTRACE_DISABLE_CALLS);
 }
 
 static cycle_t		ftrace_update_time;
@@ -1368,24 +1358,29 @@ enum {
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
-	struct ftrace_page	*pg;
-	int			hidx;
-	int			idx;
-	unsigned		flags;
-	struct trace_parser	parser;
+	loff_t				pos;
+	loff_t				func_pos;
+	struct ftrace_page		*pg;
+	struct dyn_ftrace		*func;
+	struct ftrace_func_probe	*probe;
+	struct trace_parser		parser;
+	int				hidx;
+	int				idx;
+	unsigned			flags;
 };
 
 static void *
-t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+t_hash_next(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct hlist_node *hnd = v;
+	struct hlist_node *hnd = NULL;
 	struct hlist_head *hhd;
 
-	WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
-
 	(*pos)++;
+	iter->pos = *pos;
 
+	if (iter->probe)
+		hnd = &iter->probe->node;
  retry:
 	if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
 		return NULL;
@@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
 		}
 	}
 
-	return hnd;
+	if (WARN_ON_ONCE(!hnd))
+		return NULL;
+
+	iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+
+	return iter;
 }
 
 static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 	void *p = NULL;
 	loff_t l;
 
-	if (!(iter->flags & FTRACE_ITER_HASH))
-		*pos = 0;
-
-	iter->flags |= FTRACE_ITER_HASH;
+	if (iter->func_pos > *pos)
+		return NULL;
 
 	iter->hidx = 0;
-	for (l = 0; l <= *pos; ) {
-		p = t_hash_next(m, p, &l);
+	for (l = 0; l <= (*pos - iter->func_pos); ) {
+		p = t_hash_next(m, &l);
 		if (!p)
 			break;
 	}
-	return p;
+	if (!p)
+		return NULL;
+
+	/* Only set this if we have an item */
+	iter->flags |= FTRACE_ITER_HASH;
+
+	return iter;
 }
 
-static int t_hash_show(struct seq_file *m, void *v)
+static int
+t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
 {
 	struct ftrace_func_probe *rec;
-	struct hlist_node *hnd = v;
 
-	rec = hlist_entry(hnd, struct ftrace_func_probe, node);
+	rec = iter->probe;
+	if (WARN_ON_ONCE(!rec))
+		return -EIO;
 
 	if (rec->ops->print)
 		return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	struct dyn_ftrace *rec = NULL;
 
 	if (iter->flags & FTRACE_ITER_HASH)
-		return t_hash_next(m, v, pos);
+		return t_hash_next(m, pos);
 
 	(*pos)++;
+	iter->pos = *pos;
 
 	if (iter->flags & FTRACE_ITER_PRINTALL)
-		return NULL;
+		return t_hash_start(m, pos);
 
  retry:
 	if (iter->idx >= iter->pg->index) {
@@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		}
 	}
 
-	return rec;
+	if (!rec)
+		return t_hash_start(m, pos);
+
+	iter->func_pos = *pos;
+	iter->func = rec;
+
+	return iter;
+}
+
+static void reset_iter_read(struct ftrace_iterator *iter)
+{
+	iter->pos = 0;
+	iter->func_pos = 0;
+	iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
 }
 
 static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1502,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 
 	mutex_lock(&ftrace_lock);
 	/*
+	 * If an lseek was done, then reset and start from beginning.
+	 */
+	if (*pos < iter->pos)
+		reset_iter_read(iter);
+
+	/*
 	 * For set_ftrace_filter reading, if we have the filter
 	 * off, we can short cut and just print out that all
 	 * functions are enabled.
@@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	if (iter->flags & FTRACE_ITER_HASH)
 		return t_hash_start(m, pos);
 
+	/*
+	 * Unfortunately, we need to restart at ftrace_pages_start
+	 * every time we let go of the ftrace_mutex. This is because
+	 * those pointers can change without the lock.
+	 */
 	iter->pg = ftrace_pages_start;
 	iter->idx = 0;
 	for (l = 0; l <= *pos; ) {
@@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 			break;
 	}
 
-	if (!p && iter->flags & FTRACE_ITER_FILTER)
-		return t_hash_start(m, pos);
+	if (!p) {
+		if (iter->flags & FTRACE_ITER_FILTER)
+			return t_hash_start(m, pos);
 
-	return p;
+		return NULL;
+	}
+
+	return iter;
 }
 
 static void t_stop(struct seq_file *m, void *p)
@@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
 static int t_show(struct seq_file *m, void *v)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct dyn_ftrace *rec = v;
+	struct dyn_ftrace *rec;
 
 	if (iter->flags & FTRACE_ITER_HASH)
-		return t_hash_show(m, v);
+		return t_hash_show(m, iter);
 
 	if (iter->flags & FTRACE_ITER_PRINTALL) {
 		seq_printf(m, "#### all functions enabled ####\n");
 		return 0;
 	}
 
+	rec = iter->func;
+
 	if (!rec)
 		return 0;
 
@@ -1601,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
 
 	ret = ftrace_avail_open(inode, file);
 	if (!ret) {
-		m = (struct seq_file *)file->private_data;
-		iter = (struct ftrace_iterator *)m->private;
+		m = file->private_data;
+		iter = m->private;
 		iter->flags = FTRACE_ITER_FAILURES;
 	}
 
@@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = no_llseek,
+	.llseek = ftrace_regex_lseek,
 	.release = ftrace_filter_release,
 };
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bca96377fd4..c5a632a669e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return local_read(&cpu_buffer->entries) -
+		(local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
+
 /**
  * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
  * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
-	ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
-		- cpu_buffer->read;
 
-	return ret;
+	return rb_num_of_entries(cpu_buffer);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
-		entries += (local_read(&cpu_buffer->entries) -
-			    local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
+		entries += rb_num_of_entries(cpu_buffer);
 	}
 
 	return entries;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f54115..001bcd2ccf4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 
 static int tracing_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	struct trace_iterator *iter;
 	int cpu;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a..9021f8c0c0c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+		    unsigned long ip,
+		    unsigned long parent_ip,
+		    unsigned long flags, int pc);
 void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
 int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 31cc4cb0dbf..39c059ca670 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
 #include <linux/kprobes.h>
 #include "trace.h"
 
-static char *perf_trace_buf[4];
+static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
 /*
  * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int	total_ref_count;
 static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
-	struct hlist_head *list;
+	struct hlist_head __percpu *list;
 	int ret = -ENOMEM;
 	int cpu;
 
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 	tp_event->perf_events = list;
 
 	if (!total_ref_count) {
-		char *buf;
+		char __percpu *buf;
 		int i;
 
-		for (i = 0; i < 4; i++) {
-			buf = (char *)alloc_percpu(perf_trace_t);
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+			buf = (char __percpu *)alloc_percpu(perf_trace_t);
 			if (!buf)
 				goto fail;
 
@@ -65,7 +65,7 @@ fail:
 	if (!total_ref_count) {
 		int i;
 
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
 			perf_trace_buf[i] = NULL;
 		}
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
 	return ret;
 }
 
-int perf_trace_enable(struct perf_event *p_event)
+int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct hlist_head __percpu *pcpu_list;
 	struct hlist_head *list;
 
-	list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!list))
+	pcpu_list = tp_event->perf_events;
+	if (WARN_ON_ONCE(!pcpu_list))
 		return -EINVAL;
 
-	list = this_cpu_ptr(list);
+	if (!(flags & PERF_EF_START))
+		p_event->hw.state = PERF_HES_STOPPED;
+
+	list = this_cpu_ptr(pcpu_list);
 	hlist_add_head_rcu(&p_event->hlist_entry, list);
 
 	return 0;
 }
 
-void perf_trace_disable(struct perf_event *p_event)
+void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	hlist_del_rcu(&p_event->hlist_entry);
 }
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 	tp_event->perf_events = NULL;
 
 	if (!--total_ref_count) {
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
 			perf_trace_buf[i] = NULL;
 		}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f14632..398c0e8b332 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
 
 enum {
 	FORMAT_HEADER		= 1,
-	FORMAT_PRINTFMT		= 2,
+	FORMAT_FIELD_SEPERATOR	= 2,
+	FORMAT_PRINTFMT		= 3,
 };
 
 static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_event_call *call = m->private;
 	struct ftrace_event_field *field;
-	struct list_head *head;
+	struct list_head *common_head = &ftrace_common_fields;
+	struct list_head *head = trace_get_fields(call);
 
 	(*pos)++;
 
 	switch ((unsigned long)v) {
 	case FORMAT_HEADER:
-		head = &ftrace_common_fields;
+		if (unlikely(list_empty(common_head)))
+			return NULL;
+
+		field = list_entry(common_head->prev,
+				   struct ftrace_event_field, link);
+		return field;
 
+	case FORMAT_FIELD_SEPERATOR:
 		if (unlikely(list_empty(head)))
 			return NULL;
 
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 		return NULL;
 	}
 
-	head = trace_get_fields(call);
-
-	/*
-	 * To separate common fields from event fields, the
-	 * LSB is set on the first event field. Clear it in case.
-	 */
-	v = (void *)((unsigned long)v & ~1L);
-
 	field = v;
-	/*
-	 * If this is a common field, and at the end of the list, then
-	 * continue with main list.
-	 */
-	if (field->link.prev == &ftrace_common_fields) {
-		if (unlikely(list_empty(head)))
-			return NULL;
-		field = list_entry(head->prev, struct ftrace_event_field, link);
-		/* Set the LSB to notify f_show to print an extra newline */
-		field = (struct ftrace_event_field *)
-			((unsigned long)field | 1);
-		return field;
-	}
-
-	/* If we are done tell f_show to print the format */
-	if (field->link.prev == head)
+	if (field->link.prev == common_head)
+		return (void *)FORMAT_FIELD_SEPERATOR;
+	else if (field->link.prev == head)
 		return (void *)FORMAT_PRINTFMT;
 
 	field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
 		seq_printf(m, "format:\n");
 		return 0;
 
+	case FORMAT_FIELD_SEPERATOR:
+		seq_putc(m, '\n');
+		return 0;
+
 	case FORMAT_PRINTFMT:
 		seq_printf(m, "\nprint fmt: %s\n",
 			   call->print_fmt);
 		return 0;
 	}
 
-	/*
-	 * To separate common fields from event fields, the
-	 * LSB is set on the first event field. Clear it and
-	 * print a newline if it is set.
-	 */
-	if ((unsigned long)v & 1) {
-		seq_putc(m, '\n');
-		v = (void *)((unsigned long)v & ~1L);
-	}
-
 	field = v;
 
 	/*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518..76b05980225 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
 #include "trace.h"
 #include "trace_output.h"
 
+/* When set, irq functions will be ignored */
+static int ftrace_graph_skip_irqs;
+
 struct fgraph_cpu_data {
 	pid_t		last_pid;
 	int		depth;
+	int		depth_irq;
 	int		ignore;
 	unsigned long	enter_funcs[FTRACE_RETFUNC_DEPTH];
 };
 
 struct fgraph_data {
-	struct fgraph_cpu_data		*cpu_data;
+	struct fgraph_cpu_data __percpu *cpu_data;
 
 	/* Place to preserve last processed entry. */
 	struct ftrace_graph_ent_entry	ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
 #define TRACE_GRAPH_PRINT_PROC		0x8
 #define TRACE_GRAPH_PRINT_DURATION	0x10
 #define TRACE_GRAPH_PRINT_ABS_TIME	0x20
+#define TRACE_GRAPH_PRINT_IRQS		0x40
 
 static struct tracer_opt trace_opts[] = {
 	/* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
 	/* Display absolute time of an entry */
 	{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
+	/* Display interrupts */
+	{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
 	{ } /* Empty entry */
 };
 
 static struct tracer_flags tracer_flags = {
 	/* Don't display overruns and proc by default */
 	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
-	       TRACE_GRAPH_PRINT_DURATION,
+	       TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
 	.opts = trace_opts
 };
 
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
 	return 1;
 }
 
+static inline int ftrace_graph_ignore_irqs(void)
+{
+	if (!ftrace_graph_skip_irqs)
+		return 0;
+
+	return in_irq();
+}
+
 int trace_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 		return 0;
 
 	/* trace it when it is-nested-in or is a function enabled. */
-	if (!(trace->depth || ftrace_graph_addr(trace->func)))
+	if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
+	      ftrace_graph_ignore_irqs())
 		return 0;
 
 	local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
 		return trace_graph_entry(trace);
 }
 
+static void
+__trace_graph_function(struct trace_array *tr,
+		unsigned long ip, unsigned long flags, int pc)
+{
+	u64 time = trace_clock_local();
+	struct ftrace_graph_ent ent = {
+		.func  = ip,
+		.depth = 0,
+	};
+	struct ftrace_graph_ret ret = {
+		.func     = ip,
+		.depth    = 0,
+		.calltime = time,
+		.rettime  = time,
+	};
+
+	__trace_graph_entry(tr, &ent, flags, pc);
+	__trace_graph_return(tr, &ret, flags, pc);
+}
+
+void
+trace_graph_function(struct trace_array *tr,
+		unsigned long ip, unsigned long parent_ip,
+		unsigned long flags, int pc)
+{
+	__trace_graph_function(tr, ip, flags, pc);
+}
+
 void __trace_graph_return(struct trace_array *tr,
 				struct ftrace_graph_ret *trace,
 				unsigned long flags,
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
 	/* Print nsecs (we don't want to exceed 7 numbers) */
 	if (len < 7) {
-		snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
-			 nsecs_rem);
+		size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
+
+		snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
 		ret = trace_seq_printf(s, ".%s", nsecs_str);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 	return 0;
 }
 
+/*
+ * Entry check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just extered irq code
+ *
+ * retunns 0 if
+ *  - funcgraph-interrupts option is set
+ *  - we are not inside irq code
+ */
+static int
+check_irq_entry(struct trace_iterator *iter, u32 flags,
+		unsigned long addr, int depth)
+{
+	int cpu = iter->cpu;
+	int *depth_irq;
+	struct fgraph_data *data = iter->private;
+
+	/*
+	 * If we are either displaying irqs, or we got called as
+	 * a graph event and private data does not exist,
+	 * then we bypass the irq check.
+	 */
+	if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+	    (!data))
+		return 0;
+
+	depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+	/*
+	 * We are inside the irq code
+	 */
+	if (*depth_irq >= 0)
+		return 1;
+
+	if ((addr < (unsigned long)__irqentry_text_start) ||
+	    (addr >= (unsigned long)__irqentry_text_end))
+		return 0;
+
+	/*
+	 * We are entering irq code.
+	 */
+	*depth_irq = depth;
+	return 1;
+}
+
+/*
+ * Return check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just left irq code
+ *
+ * returns 0 if
+ *  - funcgraph-interrupts option is set
+ *  - we are not inside irq code
+ */
+static int
+check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
+{
+	int cpu = iter->cpu;
+	int *depth_irq;
+	struct fgraph_data *data = iter->private;
+
+	/*
+	 * If we are either displaying irqs, or we got called as
+	 * a graph event and private data does not exist,
+	 * then we bypass the irq check.
+	 */
+	if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+	    (!data))
+		return 0;
+
+	depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+	/*
+	 * We are not inside the irq code.
+	 */
+	if (*depth_irq == -1)
+		return 0;
+
+	/*
+	 * We are inside the irq code, and this is returning entry.
+	 * Let's not trace it and clear the entry depth, since
+	 * we are out of irq code.
+	 *
+	 * This condition ensures that we 'leave the irq code' once
+	 * we are out of the entry depth. Thus protecting us from
+	 * the RETURN entry loss.
+	 */
+	if (*depth_irq >= depth) {
+		*depth_irq = -1;
+		return 1;
+	}
+
+	/*
+	 * We are inside the irq code, and this is not the entry.
+	 */
+	return 1;
+}
+
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 			struct trace_iterator *iter, u32 flags)
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	static enum print_line_t ret;
 	int cpu = iter->cpu;
 
+	if (check_irq_entry(iter, flags, call->func, call->depth))
+		return TRACE_TYPE_HANDLED;
+
 	if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int ret;
 	int i;
 
+	if (check_irq_return(iter, flags, trace->depth))
+		return TRACE_TYPE_HANDLED;
+
 	if (data) {
 		struct fgraph_cpu_data *cpu_data;
 		int cpu = iter->cpu;
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
 
 
 enum print_line_t
-print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
 	struct ftrace_graph_ent_entry *field;
 	struct fgraph_data *data = iter->private;
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 static enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
-	return print_graph_function_flags(iter, tracer_flags.val);
+	return __print_graph_function_flags(iter, tracer_flags.val);
+}
+
+enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
+					     u32 flags)
+{
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		flags |= TRACE_GRAPH_PRINT_DURATION;
+	else
+		flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+	return __print_graph_function_flags(iter, flags);
 }
 
 static enum print_line_t
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
 	seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
 }
 
-void print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
 {
 	int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
 
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
 	print_graph_headers_flags(s, tracer_flags.val);
 }
 
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+	struct trace_iterator *iter = s->private;
+
+	if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+		/* print nothing if the buffers are empty */
+		if (trace_empty(iter))
+			return;
+
+		print_trace_header(s, iter);
+		flags |= TRACE_GRAPH_PRINT_DURATION;
+	} else
+		flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+	__print_graph_headers_flags(s, flags);
+}
+
 void graph_trace_open(struct trace_iterator *iter)
 {
 	/* pid and depth on the last trace processed */
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
 		pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
 		int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
 		int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+		int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
 		*pid = -1;
 		*depth = 0;
 		*ignore = 0;
+		*depth_irq = -1;
 	}
 
 	iter->private = data;
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
 	}
 }
 
+static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
+{
+	if (bit == TRACE_GRAPH_PRINT_IRQS)
+		ftrace_graph_skip_irqs = !set;
+
+	return 0;
+}
+
 static struct trace_event_functions graph_functions = {
 	.trace		= print_graph_function_event,
 };
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
 	.print_line	= print_graph_function,
 	.print_header	= print_graph_headers,
 	.flags		= &tracer_flags,
+	.set_flag	= func_graph_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_function_graph,
 #endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2..5cf8c602b88 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp	unsigned long max_sequence;
 
 #ifdef CONFIG_FUNCTION_TRACER
 /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ *            incremented.
+ *         0 if the trace is to be ignored, and data->disabled
+ *            is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
  */
-static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int func_prolog_dec(struct trace_array *tr,
+			   struct trace_array_cpu **data,
+			   unsigned long *flags)
 {
-	struct trace_array *tr = irqsoff_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
 	long disabled;
 	int cpu;
 
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	 */
 	cpu = raw_smp_processor_id();
 	if (likely(!per_cpu(tracing_cpu, cpu)))
-		return;
+		return 0;
 
-	local_save_flags(flags);
+	local_save_flags(*flags);
 	/* slight chance to get a false positive on tracing_cpu */
-	if (!irqs_disabled_flags(flags))
-		return;
+	if (!irqs_disabled_flags(*flags))
+		return 0;
 
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
+	*data = tr->data[cpu];
+	disabled = atomic_inc_return(&(*data)->disabled);
 
 	if (likely(disabled == 1))
-		trace_function(tr, ip, parent_ip, flags, preempt_count());
+		return 1;
+
+	atomic_dec(&(*data)->disabled);
+
+	return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (!func_prolog_dec(tr, &data, &flags))
+		return;
+
+	trace_function(tr, ip, parent_ip, flags, preempt_count());
 
 	atomic_dec(&data->disabled);
 }
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
 	int ret;
-	int cpu;
 	int pc;
 
-	cpu = raw_smp_processor_id();
-	if (likely(!per_cpu(tracing_cpu, cpu)))
+	if (!func_prolog_dec(tr, &data, &flags))
 		return 0;
 
-	local_save_flags(flags);
-	/* slight chance to get a false positive on tracing_cpu */
-	if (!irqs_disabled_flags(flags))
-		return 0;
-
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		ret = __trace_graph_entry(tr, trace, flags, pc);
-	} else
-		ret = 0;
-
+	pc = preempt_count();
+	ret = __trace_graph_entry(tr, trace, flags, pc);
 	atomic_dec(&data->disabled);
+
 	return ret;
 }
 
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
-	int cpu;
 	int pc;
 
-	cpu = raw_smp_processor_id();
-	if (likely(!per_cpu(tracing_cpu, cpu)))
+	if (!func_prolog_dec(tr, &data, &flags))
 		return;
 
-	local_save_flags(flags);
-	/* slight chance to get a false positive on tracing_cpu */
-	if (!irqs_disabled_flags(flags))
-		return;
-
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		__trace_graph_return(tr, trace, flags, pc);
-	}
-
+	pc = preempt_count();
+	__trace_graph_return(tr, trace, flags, pc);
 	atomic_dec(&data->disabled);
 }
 
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
 
 static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
 {
-	u32 flags = GRAPH_TRACER_FLAGS;
-
-	if (trace_flags & TRACE_ITER_LATENCY_FMT)
-		flags |= TRACE_GRAPH_PRINT_DURATION;
-	else
-		flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
 	/*
 	 * In graph mode call the graph tracer output function,
 	 * otherwise go with the TRACE_FN event handler
 	 */
 	if (is_graph())
-		return print_graph_function_flags(iter, flags);
+		return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
 
 	return TRACE_TYPE_UNHANDLED;
 }
 
 static void irqsoff_print_header(struct seq_file *s)
 {
-	if (is_graph()) {
-		struct trace_iterator *iter = s->private;
-		u32 flags = GRAPH_TRACER_FLAGS;
-
-		if (trace_flags & TRACE_ITER_LATENCY_FMT) {
-			/* print nothing if the buffers are empty */
-			if (trace_empty(iter))
-				return;
-
-			print_trace_header(s, iter);
-			flags |= TRACE_GRAPH_PRINT_DURATION;
-		} else
-			flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
-		print_graph_headers_flags(s, flags);
-	} else
+	if (is_graph())
+		print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+	else
 		trace_default_header(s);
 }
 
 static void
-trace_graph_function(struct trace_array *tr,
-		 unsigned long ip, unsigned long flags, int pc)
-{
-	u64 time = trace_clock_local();
-	struct ftrace_graph_ent ent = {
-		.func  = ip,
-		.depth = 0,
-	};
-	struct ftrace_graph_ret ret = {
-		.func     = ip,
-		.depth    = 0,
-		.calltime = time,
-		.rettime  = time,
-	};
-
-	__trace_graph_entry(tr, &ent, flags, pc);
-	__trace_graph_return(tr, &ret, flags, pc);
-}
-
-static void
 __trace_function(struct trace_array *tr,
 		 unsigned long ip, unsigned long parent_ip,
 		 unsigned long flags, int pc)
 {
-	if (!is_graph())
+	if (is_graph())
+		trace_graph_function(tr, ip, parent_ip, flags, pc);
+	else
 		trace_function(tr, ip, parent_ip, flags, pc);
-	else {
-		trace_graph_function(tr, parent_ip, flags, pc);
-		trace_graph_function(tr, ip, flags, pc);
-	}
 }
 
 #else
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81..7319559ed59 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int			wakeup_rt;
 static arch_spinlock_t wakeup_lock =
 	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 
+static void wakeup_reset(struct trace_array *tr);
 static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
 
 static int save_lat_flag;
 
+#define TRACE_DISPLAY_GRAPH     1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/* display latency trace as call graph */
+	{ TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val  = 0,
+	.opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
 #ifdef CONFIG_FUNCTION_TRACER
+
 /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ *            is disabled and data->disabled is incremented.
+ *         0 if the trace is to be ignored, and preemption
+ *            is not disabled and data->disabled is
+ *            kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
  */
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int
+func_prolog_preempt_disable(struct trace_array *tr,
+			    struct trace_array_cpu **data,
+			    int *pc)
 {
-	struct trace_array *tr = wakeup_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
 	long disabled;
 	int cpu;
-	int pc;
 
 	if (likely(!wakeup_task))
-		return;
+		return 0;
 
-	pc = preempt_count();
+	*pc = preempt_count();
 	preempt_disable_notrace();
 
 	cpu = raw_smp_processor_id();
 	if (cpu != wakeup_current_cpu)
 		goto out_enable;
 
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
+	*data = tr->data[cpu];
+	disabled = atomic_inc_return(&(*data)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 
-	local_irq_save(flags);
+	return 1;
 
-	trace_function(tr, ip, parent_ip, flags, pc);
+out:
+	atomic_dec(&(*data)->disabled);
+
+out_enable:
+	preempt_enable_notrace();
+	return 0;
+}
 
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	int pc;
+
+	if (!func_prolog_preempt_disable(tr, &data, &pc))
+		return;
+
+	local_irq_save(flags);
+	trace_function(tr, ip, parent_ip, flags, pc);
 	local_irq_restore(flags);
 
- out:
 	atomic_dec(&data->disabled);
- out_enable:
 	preempt_enable_notrace();
 }
 
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
 };
 #endif /* CONFIG_FUNCTION_TRACER */
 
+static int start_func_tracer(int graph)
+{
+	int ret;
+
+	if (!graph)
+		ret = register_ftrace_function(&trace_ops);
+	else
+		ret = register_ftrace_graph(&wakeup_graph_return,
+					    &wakeup_graph_entry);
+
+	if (!ret && tracing_is_enabled())
+		tracer_enabled = 1;
+	else
+		tracer_enabled = 0;
+
+	return ret;
+}
+
+static void stop_func_tracer(int graph)
+{
+	tracer_enabled = 0;
+
+	if (!graph)
+		unregister_ftrace_function(&trace_ops);
+	else
+		unregister_ftrace_graph();
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+
+	if (!(bit & TRACE_DISPLAY_GRAPH))
+		return -EINVAL;
+
+	if (!(is_graph() ^ set))
+		return 0;
+
+	stop_func_tracer(!set);
+
+	wakeup_reset(wakeup_trace);
+	tracing_max_latency = 0;
+
+	return start_func_tracer(set);
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	int pc, ret = 0;
+
+	if (!func_prolog_preempt_disable(tr, &data, &pc))
+		return 0;
+
+	local_save_flags(flags);
+	ret = __trace_graph_entry(tr, trace, flags, pc);
+	atomic_dec(&data->disabled);
+	preempt_enable_notrace();
+
+	return ret;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	int pc;
+
+	if (!func_prolog_preempt_disable(tr, &data, &pc))
+		return;
+
+	local_save_flags(flags);
+	__trace_graph_return(tr, trace, flags, pc);
+	atomic_dec(&data->disabled);
+
+	preempt_enable_notrace();
+	return;
+}
+
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+	if (is_graph())
+		graph_trace_open(iter);
+}
+
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+	if (iter->private)
+		graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+	/*
+	 * In graph mode call the graph tracer output function,
+	 * otherwise go with the TRACE_FN event handler
+	 */
+	if (is_graph())
+		return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_print_header(struct seq_file *s)
+{
+	if (is_graph())
+		print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+	else
+		trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+		 unsigned long ip, unsigned long parent_ip,
+		 unsigned long flags, int pc)
+{
+	if (is_graph())
+		trace_graph_function(tr, ip, parent_ip, flags, pc);
+	else
+		trace_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+	return -EINVAL;
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+	return -1;
+}
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_print_header(struct seq_file *s) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 /*
  * Should this new latency be reported/recorded?
  */
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
 	/* The task we are waiting for is waking up */
 	data = wakeup_trace->data[wakeup_cpu];
 
-	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
 	T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 	 * is not called by an assembly function  (where as schedule is)
 	 * it should be safe to use it here.
 	 */
-	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
 	arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	 */
 	smp_wmb();
 
-	register_ftrace_function(&trace_ops);
-
-	if (tracing_is_enabled())
-		tracer_enabled = 1;
-	else
-		tracer_enabled = 0;
+	if (start_func_tracer(is_graph()))
+		printk(KERN_ERR "failed to start wakeup tracer\n");
 
 	return;
 fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	unregister_ftrace_function(&trace_ops);
+	stop_func_tracer(is_graph());
 	unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
 	unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
 	unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
 	.start		= wakeup_tracer_start,
 	.stop		= wakeup_tracer_stop,
 	.print_max	= 1,
+	.print_header	= wakeup_print_header,
+	.print_line	= wakeup_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= wakeup_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
+	.open		= wakeup_trace_open,
+	.close		= wakeup_trace_close,
 	.use_max_tr	= 1,
 };
 
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 	.stop		= wakeup_tracer_stop,
 	.wait_pipe	= poll_wait_pipe,
 	.print_max	= 1,
+	.print_header	= wakeup_print_header,
+	.print_line	= wakeup_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= wakeup_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
+	.open		= wakeup_trace_open,
+	.close		= wakeup_trace_close,
 	.use_max_tr	= 1,
 };
 
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf..209b379a472 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
 {
 	int ret, cpu;
 
+	for_each_possible_cpu(cpu) {
+		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+	}
+
 	ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
 	if (ret)
 		goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
 	if (ret)
 		goto no_creation;
 
-	for_each_possible_cpu(cpu) {
-		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
-		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
-	}
-
 	return 0;
 
 no_creation:
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea2..e95ee7f31d4 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/jump_label.h>
 
 extern struct tracepoint __start___tracepoints[];
 extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 	 * is used.
 	 */
 	rcu_assign_pointer(elem->funcs, (*entry)->funcs);
-	elem->state = active;
+	if (!elem->state && active) {
+		jump_label_enable(&elem->state);
+		elem->state = active;
+	} else if (elem->state && !active) {
+		jump_label_disable(&elem->state);
+		elem->state = active;
+	}
 }
 
 /*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
 	if (elem->unregfunc && elem->state)
 		elem->unregfunc();
 
-	elem->state = 0;
+	if (elem->state) {
+		jump_label_disable(&elem->state);
+		elem->state = 0;
+	}
 	rcu_assign_pointer(elem->funcs, NULL);
 }
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9c3c52ecc..bafba687a6d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
 
-static int __read_mostly did_panic;
 static int __initdata no_watchdog;
 
 
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
 	return 0;
 }
 
-static int
-watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
-{
-	did_panic = 1;
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block panic_block = {
-	.notifier_call = watchdog_panic,
-};
-
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
@@ -209,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
 };
 
 /* Callback function for perf event subsystem */
-void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event, int nmi,
 		 struct perf_sample_data *data,
 		 struct pt_regs *regs)
 {
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
 	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period();
-	event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
 		goto out_save;
 	}
 
 	printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
-	return -1;
+	return PTR_ERR(event);
 
 	/* success path */
 out_save:
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
 static int watchdog_enable(int cpu)
 {
 	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+	int err;
 
 	/* enable the perf event */
-	if (watchdog_nmi_enable(cpu) != 0)
-		return -1;
+	err = watchdog_nmi_enable(cpu);
+	if (err)
+		return err;
 
 	/* create the watchdog thread */
 	if (!p) {
 		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
 			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-			return -1;
+			return PTR_ERR(p);
 		}
 		kthread_bind(p, cpu);
 		per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
 {
 	int cpu;
 
+	if (no_watchdog)
+		return;
+
 	for_each_online_cpu(cpu)
 		watchdog_disable(cpu);
 
@@ -526,17 +518,16 @@ static int __cpuinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int hotcpu = (unsigned long)hcpu;
+	int err = 0;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		if (watchdog_prepare_cpu(hotcpu))
-			return NOTIFY_BAD;
+		err = watchdog_prepare_cpu(hotcpu);
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		if (watchdog_enable(hotcpu))
-			return NOTIFY_BAD;
+		err = watchdog_enable(hotcpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 #endif /* CONFIG_HOTPLUG_CPU */
 	}
-	return NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void)
 		return 0;
 
 	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-	WARN_ON(err == NOTIFY_BAD);
+	WARN_ON(notifier_to_errno(err));
 
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
-	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
-
 	return 0;
 }
 early_initcall(spawn_watchdog_task);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1b4afd2e6ca..21ac83070a8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -482,6 +482,7 @@ config PROVE_LOCKING
 	select DEBUG_SPINLOCK
 	select DEBUG_MUTEXES
 	select DEBUG_LOCK_ALLOC
+	select TRACE_IRQFLAGS
 	default n
 	help
 	 This feature enables the kernel to prove that all locking
@@ -539,6 +540,23 @@ config PROVE_RCU_REPEATEDLY
 	 disabling, allowing multiple RCU-lockdep warnings to be printed
 	 on a single reboot.
 
+	 Say Y to allow multiple RCU-lockdep warnings per boot.
+
+	 Say N if you are unsure.
+
+config SPARSE_RCU_POINTER
+	bool "RCU debugging: sparse-based checks for pointer usage"
+	default n
+	help
+	 This feature enables the __rcu sparse annotation for
+	 RCU-protected pointers.  This annotation will cause sparse
+	 to flag any non-RCU used of annotated pointers.  This can be
+	 helpful when debugging RCU usage.  Please note that this feature
+	 is not intended to enforce code cleanliness; it is instead merely
+	 a debugging aid.
+
+	 Say Y to make sparse flag questionable use of RCU-protected pointers
+
 	 Say N if you are unsure.
 
 config LOCKDEP
@@ -579,11 +597,10 @@ config DEBUG_LOCKDEP
 	  of more runtime overhead.
 
 config TRACE_IRQFLAGS
-	depends on DEBUG_KERNEL
 	bool
-	default y
-	depends on TRACE_IRQFLAGS_SUPPORT
-	depends on PROVE_LOCKING
+	help
+	  Enables hooks to interrupt enabling and disabling for
+	  either tracing or lock debugging.
 
 config DEBUG_SPINLOCK_SLEEP
 	bool "Spinlock debugging: sleep-inside-spinlock checking"
@@ -832,6 +849,30 @@ config RCU_CPU_STALL_DETECTOR
 
 	  Say Y if you are unsure.
 
+config RCU_CPU_STALL_TIMEOUT
+	int "RCU CPU stall timeout in seconds"
+	depends on RCU_CPU_STALL_DETECTOR
+	range 3 300
+	default 60
+	help
+	  If a given RCU grace period extends more than the specified
+	  number of seconds, a CPU stall warning is printed.  If the
+	  RCU grace period persists, additional CPU stall warnings are
+	  printed at more widely spaced intervals.
+
+config RCU_CPU_STALL_DETECTOR_RUNNABLE
+	bool "RCU CPU stall checking starts automatically at boot"
+	depends on RCU_CPU_STALL_DETECTOR
+	default y
+	help
+	  If set, start checking for RCU CPU stalls immediately on
+	  boot.  Otherwise, RCU CPU stall checking must be manually
+	  enabled.
+
+	  Say Y if you are unsure.
+
+	  Say N if you wish to suppress RCU CPU stall checking during boot.
+
 config RCU_CPU_STALL_VERBOSE
 	bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR"
 	depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 02afc253372..7bd6df781ce 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -26,19 +26,11 @@
 #include <linux/dynamic_debug.h>
 #include <linux/debugfs.h>
 #include <linux/slab.h>
+#include <linux/jump_label.h>
 
 extern struct _ddebug __start___verbose[];
 extern struct _ddebug __stop___verbose[];
 
-/* dynamic_debug_enabled, and dynamic_debug_enabled2 are bitmasks in which
- * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
- * use independent hash functions, to reduce the chance of false positives.
- */
-long long dynamic_debug_enabled;
-EXPORT_SYMBOL_GPL(dynamic_debug_enabled);
-long long dynamic_debug_enabled2;
-EXPORT_SYMBOL_GPL(dynamic_debug_enabled2);
-
 struct ddebug_table {
 	struct list_head link;
 	char *mod_name;
@@ -88,26 +80,6 @@ static char *ddebug_describe_flags(struct _ddebug *dp, char *buf,
 }
 
 /*
- * must be called with ddebug_lock held
- */
-
-static int disabled_hash(char hash, bool first_table)
-{
-	struct ddebug_table *dt;
-	char table_hash_value;
-
-	list_for_each_entry(dt, &ddebug_tables, link) {
-		if (first_table)
-			table_hash_value = dt->ddebugs->primary_hash;
-		else
-			table_hash_value = dt->ddebugs->secondary_hash;
-		if (dt->num_enabled && (hash == table_hash_value))
-			return 0;
-	}
-	return 1;
-}
-
-/*
  * Search the tables for _ddebug's which match the given
  * `query' and apply the `flags' and `mask' to them.  Tells
  * the user which ddebug's were changed, or whether none
@@ -170,17 +142,9 @@ static void ddebug_change(const struct ddebug_query *query,
 				dt->num_enabled++;
 			dp->flags = newflags;
 			if (newflags) {
-				dynamic_debug_enabled |=
-						(1LL << dp->primary_hash);
-				dynamic_debug_enabled2 |=
-						(1LL << dp->secondary_hash);
+				jump_label_enable(&dp->enabled);
 			} else {
-				if (disabled_hash(dp->primary_hash, true))
-					dynamic_debug_enabled &=
-						~(1LL << dp->primary_hash);
-				if (disabled_hash(dp->secondary_hash, false))
-					dynamic_debug_enabled2 &=
-						~(1LL << dp->secondary_hash);
+				jump_label_disable(&dp->enabled);
 			}
 			if (verbose)
 				printk(KERN_INFO
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index efd16fa80b1..6f412ab4c24 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -49,7 +49,7 @@ struct radix_tree_node {
 	unsigned int	height;		/* Height from the bottom */
 	unsigned int	count;
 	struct rcu_head	rcu_head;
-	void		*slots[RADIX_TREE_MAP_SIZE];
+	void __rcu	*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 34e3082632d..7c06ee51a29 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -70,7 +70,7 @@ static unsigned long io_tlb_nslabs;
  */
 static unsigned long io_tlb_overflow = 32*1024;
 
-void *io_tlb_overflow_buffer;
+static void *io_tlb_overflow_buffer;
 
 /*
  * This is a free list describing the number of free entries available from
@@ -147,16 +147,16 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
 	 * between io_tlb_start and io_tlb_end.
 	 */
-	io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+	io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
 	for (i = 0; i < io_tlb_nslabs; i++)
  		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
 	io_tlb_index = 0;
-	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
+	io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
 
 	/*
 	 * Get the overflow emergency buffer
 	 */
-	io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+	io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow));
 	if (!io_tlb_overflow_buffer)
 		panic("Cannot allocate SWIOTLB overflow buffer!\n");
 	if (verbose)
@@ -182,7 +182,7 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	io_tlb_start = alloc_bootmem_low_pages(bytes);
+	io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));
 	if (!io_tlb_start)
 		panic("Cannot allocate SWIOTLB buffer");
 
@@ -308,13 +308,13 @@ void __init swiotlb_free(void)
 			   get_order(io_tlb_nslabs << IO_TLB_SHIFT));
 	} else {
 		free_bootmem_late(__pa(io_tlb_overflow_buffer),
-				  io_tlb_overflow);
+				  PAGE_ALIGN(io_tlb_overflow));
 		free_bootmem_late(__pa(io_tlb_orig_addr),
-				  io_tlb_nslabs * sizeof(phys_addr_t));
+				  PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
 		free_bootmem_late(__pa(io_tlb_list),
-				  io_tlb_nslabs * sizeof(int));
+				  PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
 		free_bootmem_late(__pa(io_tlb_start),
-				  io_tlb_nslabs << IO_TLB_SHIFT);
+				  PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
 	}
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649e..98b58fecede 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3185,7 +3185,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		 * with threads.
 		 */
 		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_page(vma, address);
+			flush_tlb_fix_spurious_fault(vma, address);
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a..d8087f0db50 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
 static void purge_fragmented_blocks_allcpus(void);
 
 /*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+	atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
  * Purges all lazily-freed vmap areas.
  *
  * If sync is 0 then don't purge if there is already a purge in progress.
diff --git a/net/Kconfig b/net/Kconfig
index e926884c167..55fd82e9ffd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
 source "net/rfkill/Kconfig"
 source "net/9p/Kconfig"
 source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
 
 
 endif   # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1..6b7bfd7f141 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)		+= wimax/
 obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
+obj-$(CONFIG_CEPH_LIB)		+= ceph/
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 00000000000..ad424049b0c
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
+config CEPH_LIB
+        tristate "Ceph core library (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	default n
+	help
+	  Choose Y or M here to include cephlib, which provides the
+	  common functionality to both the Ceph filesystem and
+	  to the rados block device (rbd).
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+	bool "Include file:line in ceph debug output"
+	depends on CEPH_LIB
+	default n
+	help
+	  If you say Y here, debug output will include a filename and
+	  line to aid debugging.  This increases kernel size and slows
+	  execution slightly when debug call sites are enabled (e.g.,
+	  via CONFIG_DYNAMIC_DEBUG).
+
+	  If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 00000000000..aab1cabb803
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+	mon_client.o \
+	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+	debugfs.o \
+	auth.o auth_none.o \
+	crypto.o armor.o \
+	auth_x.o \
+	ceph_fs.o ceph_strings.o ceph_hash.o \
+	pagevec.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
+
+modules_install:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
+
+clean:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/armor.c b/net/ceph/armor.c
index eb2a666b0be..eb2a666b0be 100644
--- a/fs/ceph/armor.c
+++ b/net/ceph/armor.c
diff --git a/fs/ceph/auth.c b/net/ceph/auth.c
index 6d2e3060062..549c1f43e1d 100644
--- a/fs/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -1,16 +1,16 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 
-#include "types.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
 #include "auth_none.h"
 #include "auth_x.h"
-#include "decode.h"
-#include "super.h"
 
-#include "messenger.h"
 
 /*
  * get protocol handler
diff --git a/fs/ceph/auth_none.c b/net/ceph/auth_none.c
index ad1dc21286c..214c2bb43d6 100644
--- a/fs/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -1,14 +1,15 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/slab.h>
 
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
 #include "auth_none.h"
-#include "auth.h"
-#include "decode.h"
 
 static void reset(struct ceph_auth_client *ac)
 {
diff --git a/fs/ceph/auth_none.h b/net/ceph/auth_none.h
index 8164df1a08b..ed7d088b1bc 100644
--- a/fs/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -2,8 +2,7 @@
 #define _FS_CEPH_AUTH_NONE_H
 
 #include <linux/slab.h>
-
-#include "auth.h"
+#include <linux/ceph/auth.h>
 
 /*
  * null security mode.
diff --git a/fs/ceph/auth_x.c b/net/ceph/auth_x.c
index a2d002cbdec..7fd5dfcf6e1 100644
--- a/fs/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -1,16 +1,17 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/slab.h>
 
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
 #include "auth_x.h"
 #include "auth_x_protocol.h"
-#include "crypto.h"
-#include "auth.h"
-#include "decode.h"
 
 #define TEMP_TICKET_BUF_LEN	256
 
diff --git a/fs/ceph/auth_x.h b/net/ceph/auth_x.h
index ff6f8180e68..e02da7a5c5a 100644
--- a/fs/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -3,8 +3,9 @@
 
 #include <linux/rbtree.h>
 
+#include <linux/ceph/auth.h>
+
 #include "crypto.h"
-#include "auth.h"
 #include "auth_x_protocol.h"
 
 /*
diff --git a/fs/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 671d30576c4..671d30576c4 100644
--- a/fs/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
diff --git a/fs/ceph/buffer.c b/net/ceph/buffer.c
index cd39f17021d..53d8abfa25d 100644
--- a/fs/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -1,10 +1,11 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/slab.h>
 
-#include "buffer.h"
-#include "decode.h"
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
 
 struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
 {
@@ -32,6 +33,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
 	dout("buffer_new %p\n", b);
 	return b;
 }
+EXPORT_SYMBOL(ceph_buffer_new);
 
 void ceph_buffer_release(struct kref *kref)
 {
@@ -46,6 +48,7 @@ void ceph_buffer_release(struct kref *kref)
 	}
 	kfree(b);
 }
+EXPORT_SYMBOL(ceph_buffer_release);
 
 int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
 {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 00000000000..f3e4a13fea0
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+	const char *e = s + len;
+
+	while (e != s && *(e-1) != '/')
+		e--;
+	return e;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+const char *ceph_msg_type_name(int type)
+{
+	switch (type) {
+	case CEPH_MSG_SHUTDOWN: return "shutdown";
+	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_AUTH: return "auth";
+	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+	case CEPH_MSG_MON_MAP: return "mon_map";
+	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+	case CEPH_MSG_STATFS: return "statfs";
+	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+	case CEPH_MSG_MDS_MAP: return "mds_map";
+	case CEPH_MSG_CLIENT_SESSION: return "client_session";
+	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+	case CEPH_MSG_OSD_MAP: return "osd_map";
+	case CEPH_MSG_OSD_OP: return "osd_op";
+	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+	default: return "unknown";
+	}
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+	if (client->have_fsid) {
+		if (ceph_fsid_compare(&client->fsid, fsid)) {
+			pr_err("bad fsid, had %pU got %pU",
+			       &client->fsid, fsid);
+			return -1;
+		}
+	} else {
+		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+		memcpy(&client->fsid, fsid, sizeof(*fsid));
+		ceph_debugfs_client_init(client);
+		client->have_fsid = true;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
+
+int ceph_compare_options(struct ceph_options *new_opt,
+			 struct ceph_client *client)
+{
+	struct ceph_options *opt1 = new_opt;
+	struct ceph_options *opt2 = client->options;
+	int ofs = offsetof(struct ceph_options, mon_addr);
+	int i;
+	int ret;
+
+	ret = memcmp(opt1, opt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->name, opt2->name);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->secret, opt2->secret);
+	if (ret)
+		return ret;
+
+	/* any matching mon ip implies a match */
+	for (i = 0; i < opt1->num_mon; i++) {
+		if (ceph_monmap_contains(client->monc.monmap,
+				 &opt1->mon_addr[i]))
+			return 0;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+	int i = 0;
+	char tmp[3];
+	int err = -EINVAL;
+	int d;
+
+	dout("parse_fsid '%s'\n", str);
+	tmp[2] = 0;
+	while (*str && i < 16) {
+		if (ispunct(*str)) {
+			str++;
+			continue;
+		}
+		if (!isxdigit(str[0]) || !isxdigit(str[1]))
+			break;
+		tmp[0] = str[0];
+		tmp[1] = str[1];
+		if (sscanf(tmp, "%x", &d) < 1)
+			break;
+		fsid->fsid[i] = d & 0xff;
+		i++;
+		str += 2;
+	}
+
+	if (i == 16)
+		err = 0;
+	dout("parse_fsid ret %d got fsid %pU", err, fsid);
+	return err;
+}
+
+/*
+ * ceph options
+ */
+enum {
+	Opt_osdtimeout,
+	Opt_osdkeepalivetimeout,
+	Opt_mount_timeout,
+	Opt_osd_idle_ttl,
+	Opt_last_int,
+	/* int args above */
+	Opt_fsid,
+	Opt_name,
+	Opt_secret,
+	Opt_ip,
+	Opt_last_string,
+	/* string args above */
+	Opt_noshare,
+	Opt_nocrc,
+};
+
+static match_table_t opt_tokens = {
+	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	/* int args above */
+	{Opt_fsid, "fsid=%s"},
+	{Opt_name, "name=%s"},
+	{Opt_secret, "secret=%s"},
+	{Opt_ip, "ip=%s"},
+	/* string args above */
+	{Opt_noshare, "noshare"},
+	{Opt_nocrc, "nocrc"},
+	{-1, NULL}
+};
+
+void ceph_destroy_options(struct ceph_options *opt)
+{
+	dout("destroy_options %p\n", opt);
+	kfree(opt->name);
+	kfree(opt->secret);
+	kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+int ceph_parse_options(struct ceph_options **popt, char *options,
+		       const char *dev_name, const char *dev_name_end,
+		       int (*parse_extra_token)(char *c, void *private),
+		       void *private)
+{
+	struct ceph_options *opt;
+	const char *c;
+	int err = -ENOMEM;
+	substring_t argstr[MAX_OPT_ARGS];
+
+	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+	if (!opt)
+		return err;
+	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+				GFP_KERNEL);
+	if (!opt->mon_addr)
+		goto out;
+
+	dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+	     dev_name);
+
+	/* start with defaults */
+	opt->flags = CEPH_OPT_DEFAULT;
+	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+
+	/* get mon ip(s) */
+	/* ip1[:port1][,ip2[:port2]...] */
+	err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+			     CEPH_MAX_MON, &opt->num_mon);
+	if (err < 0)
+		goto out;
+
+	/* parse mount options */
+	while ((c = strsep(&options, ",")) != NULL) {
+		int token, intval, ret;
+		if (!*c)
+			continue;
+		err = -EINVAL;
+		token = match_token((char *)c, opt_tokens, argstr);
+		if (token < 0 && parse_extra_token) {
+			/* extra? */
+			err = parse_extra_token((char *)c, private);
+			if (err < 0) {
+				pr_err("bad option at '%s'\n", c);
+				goto out;
+			}
+			continue;
+		}
+		if (token < Opt_last_int) {
+			ret = match_int(&argstr[0], &intval);
+			if (ret < 0) {
+				pr_err("bad mount option arg (not int) "
+				       "at '%s'\n", c);
+				continue;
+			}
+			dout("got int token %d val %d\n", token, intval);
+		} else if (token > Opt_last_int && token < Opt_last_string) {
+			dout("got string token %d val %s\n", token,
+			     argstr[0].from);
+		} else {
+			dout("got token %d\n", token);
+		}
+		switch (token) {
+		case Opt_ip:
+			err = ceph_parse_ips(argstr[0].from,
+					     argstr[0].to,
+					     &opt->my_addr,
+					     1, NULL);
+			if (err < 0)
+				goto out;
+			opt->flags |= CEPH_OPT_MYIP;
+			break;
+
+		case Opt_fsid:
+			err = parse_fsid(argstr[0].from, &opt->fsid);
+			if (err == 0)
+				opt->flags |= CEPH_OPT_FSID;
+			break;
+		case Opt_name:
+			opt->name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			break;
+		case Opt_secret:
+			opt->secret = kstrndup(argstr[0].from,
+						argstr[0].to-argstr[0].from,
+						GFP_KERNEL);
+			break;
+
+			/* misc */
+		case Opt_osdtimeout:
+			opt->osd_timeout = intval;
+			break;
+		case Opt_osdkeepalivetimeout:
+			opt->osd_keepalive_timeout = intval;
+			break;
+		case Opt_osd_idle_ttl:
+			opt->osd_idle_ttl = intval;
+			break;
+		case Opt_mount_timeout:
+			opt->mount_timeout = intval;
+			break;
+
+		case Opt_noshare:
+			opt->flags |= CEPH_OPT_NOSHARE;
+			break;
+
+		case Opt_nocrc:
+			opt->flags |= CEPH_OPT_NOCRC;
+			break;
+
+		default:
+			BUG_ON(token);
+		}
+	}
+
+	/* success */
+	*popt = opt;
+	return 0;
+
+out:
+	ceph_destroy_options(opt);
+	return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+u64 ceph_client_id(struct ceph_client *client)
+{
+	return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+	struct ceph_client *client;
+	int err = -ENOMEM;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	client->private = private;
+	client->options = opt;
+
+	mutex_init(&client->mount_mutex);
+	init_waitqueue_head(&client->auth_wq);
+	client->auth_err = 0;
+
+	client->extra_mon_dispatch = NULL;
+	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+
+	client->msgr = NULL;
+
+	/* subsystems */
+	err = ceph_monc_init(&client->monc, client);
+	if (err < 0)
+		goto fail;
+	err = ceph_osdc_init(&client->osdc, client);
+	if (err < 0)
+		goto fail_monc;
+
+	return client;
+
+fail_monc:
+	ceph_monc_stop(&client->monc);
+fail:
+	kfree(client);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+void ceph_destroy_client(struct ceph_client *client)
+{
+	dout("destroy_client %p\n", client);
+
+	/* unmount */
+	ceph_osdc_stop(&client->osdc);
+
+	/*
+	 * make sure mds and osd connections close out before destroying
+	 * the auth module, which is needed to free those connections'
+	 * ceph_authorizers.
+	 */
+	ceph_msgr_flush();
+
+	ceph_monc_stop(&client->monc);
+
+	ceph_debugfs_client_cleanup(client);
+
+	if (client->msgr)
+		ceph_messenger_destroy(client->msgr);
+
+	ceph_destroy_options(client->options);
+
+	kfree(client);
+	dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+	return client->monc.monmap && client->monc.monmap->epoch &&
+	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+	struct ceph_entity_addr *myaddr = NULL;
+	int err;
+	unsigned long timeout = client->options->mount_timeout * HZ;
+
+	/* initialize the messenger */
+	if (client->msgr == NULL) {
+		if (ceph_test_opt(client, MYIP))
+			myaddr = &client->options->my_addr;
+		client->msgr = ceph_messenger_create(myaddr,
+					client->supported_features,
+					client->required_features);
+		if (IS_ERR(client->msgr)) {
+			client->msgr = NULL;
+			return PTR_ERR(client->msgr);
+		}
+		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	}
+
+	/* open session, and wait for mon and osd maps */
+	err = ceph_monc_open_session(&client->monc);
+	if (err < 0)
+		return err;
+
+	while (!have_mon_and_osd_map(client)) {
+		err = -EIO;
+		if (timeout && time_after_eq(jiffies, started + timeout))
+			return err;
+
+		/* wait */
+		dout("mount waiting for mon_map\n");
+		err = wait_event_interruptible_timeout(client->auth_wq,
+			have_mon_and_osd_map(client) || (client->auth_err < 0),
+			timeout);
+		if (err == -EINTR || err == -ERESTARTSYS)
+			return err;
+		if (client->auth_err < 0)
+			return client->auth_err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+int ceph_open_session(struct ceph_client *client)
+{
+	int ret;
+	unsigned long started = jiffies;  /* note the start time */
+
+	dout("open_session start\n");
+	mutex_lock(&client->mount_mutex);
+
+	ret = __ceph_open_session(client, started);
+
+	mutex_unlock(&client->mount_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+static int __init init_ceph_lib(void)
+{
+	int ret = 0;
+
+	ret = ceph_debugfs_init();
+	if (ret < 0)
+		goto out;
+
+	ret = ceph_msgr_init();
+	if (ret < 0)
+		goto out_debugfs;
+
+	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+	return 0;
+
+out_debugfs:
+	ceph_debugfs_cleanup();
+out:
+	return ret;
+}
+
+static void __exit exit_ceph_lib(void)
+{
+	dout("exit_ceph_lib\n");
+	ceph_msgr_exit();
+	ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/fs/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 3ac6cc7c115..a3a3a31d3c3 100644
--- a/fs/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -1,7 +1,8 @@
 /*
  * Some non-inline ceph helpers
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
 /*
  * return true if @layout appears to be valid
@@ -52,6 +53,7 @@ int ceph_flags_to_mode(int flags)
 
 	return mode;
 }
+EXPORT_SYMBOL(ceph_flags_to_mode);
 
 int ceph_caps_for_mode(int mode)
 {
@@ -70,3 +72,4 @@ int ceph_caps_for_mode(int mode)
 
 	return caps;
 }
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/fs/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
index bd570015d14..815ef882679 100644
--- a/fs/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -1,5 +1,5 @@
 
-#include "types.h"
+#include <linux/ceph/types.h>
 
 /*
  * Robert Jenkin's hash function.
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 00000000000..3fbda04de29
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+const char *ceph_entity_type_name(int type)
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";
+	}
+}
+
+const char *ceph_osd_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_READ: return "read";
+	case CEPH_OSD_OP_STAT: return "stat";
+
+	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+	case CEPH_OSD_OP_WRITE: return "write";
+	case CEPH_OSD_OP_DELETE: return "delete";
+	case CEPH_OSD_OP_TRUNCATE: return "truncate";
+	case CEPH_OSD_OP_ZERO: return "zero";
+	case CEPH_OSD_OP_WRITEFULL: return "writefull";
+	case CEPH_OSD_OP_ROLLBACK: return "rollback";
+
+	case CEPH_OSD_OP_APPEND: return "append";
+	case CEPH_OSD_OP_STARTSYNC: return "startsync";
+	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+	case CEPH_OSD_OP_TMAPUP: return "tmapup";
+	case CEPH_OSD_OP_TMAPGET: return "tmapget";
+	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+	case CEPH_OSD_OP_GETXATTR: return "getxattr";
+	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+	case CEPH_OSD_OP_SETXATTR: return "setxattr";
+	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+	case CEPH_OSD_OP_PULL: return "pull";
+	case CEPH_OSD_OP_PUSH: return "push";
+	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+	case CEPH_OSD_OP_SCRUB: return "scrub";
+
+	case CEPH_OSD_OP_WRLOCK: return "wrlock";
+	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+	case CEPH_OSD_OP_RDLOCK: return "rdlock";
+	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+	case CEPH_OSD_OP_UPLOCK: return "uplock";
+	case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+	case CEPH_OSD_OP_CALL: return "call";
+
+	case CEPH_OSD_OP_PGLS: return "pgls";
+	}
+	return "???";
+}
+
+
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
diff --git a/fs/ceph/crush/crush.c b/net/ceph/crush/crush.c
index fabd302e577..d6ebb13a18a 100644
--- a/fs/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -8,7 +8,7 @@
 # define BUG_ON(x) assert(!(x))
 #endif
 
-#include "crush.h"
+#include <linux/crush/crush.h>
 
 const char *crush_bucket_alg_name(int alg)
 {
diff --git a/fs/ceph/crush/hash.c b/net/ceph/crush/hash.c
index 5873aed694b..5bb63e37a8a 100644
--- a/fs/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -1,6 +1,6 @@
 
 #include <linux/types.h>
-#include "hash.h"
+#include <linux/crush/hash.h>
 
 /*
  * Robert Jenkins' function for mixing 32-bit values
diff --git a/fs/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a4eec133258..42599e31dca 100644
--- a/fs/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -18,8 +18,8 @@
 # define kfree(x) free(x)
 #endif
 
-#include "crush.h"
-#include "hash.h"
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
 
 /*
  * Implement the core CRUSH mapping algorithm.
diff --git a/fs/ceph/crypto.c b/net/ceph/crypto.c
index a3e627f6329..7b505b0c983 100644
--- a/fs/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -1,13 +1,13 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/err.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <crypto/hash.h>
 
+#include <linux/ceph/decode.h>
 #include "crypto.h"
-#include "decode.h"
 
 int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
 {
diff --git a/fs/ceph/crypto.h b/net/ceph/crypto.h
index bdf38607323..f9eccace592 100644
--- a/fs/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -1,8 +1,8 @@
 #ifndef _FS_CEPH_CRYPTO_H
 #define _FS_CEPH_CRYPTO_H
 
-#include "types.h"
-#include "buffer.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
 
 /*
  * cryptographic secret
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 00000000000..27d4ea315d1
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->monc.monmap == NULL)
+		return 0;
+
+	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+	for (i = 0; i < client->monc.monmap->num_mon; i++) {
+		struct ceph_entity_inst *inst =
+			&client->monc.monmap->mon_inst[i];
+
+		seq_printf(s, "\t%s%lld\t%s\n",
+			   ENTITY_NAME(inst->name),
+			   ceph_pr_addr(&inst->addr.in_addr));
+	}
+	return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+	struct rb_node *n;
+
+	if (client->osdc.osdmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+	seq_printf(s, "flags%s%s\n",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+		   " NEARFULL" : "",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+		   " FULL" : "");
+	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pool =
+			rb_entry(n, struct ceph_pg_pool_info, node);
+		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+			   pool->id, pool->v.pg_num, pool->pg_num_mask,
+			   pool->v.lpg_num, pool->lpg_num_mask);
+	}
+	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+		struct ceph_entity_addr *addr =
+			&client->osdc.osdmap->osd_addr[i];
+		int state = client->osdc.osdmap->osd_state[i];
+		char sb[64];
+
+		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+			   i, ceph_pr_addr(&addr->in_addr),
+			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+			   ceph_osdmap_state_str(sb, sizeof(sb), state));
+	}
+	return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_client *monc = &client->monc;
+	struct rb_node *rp;
+
+	mutex_lock(&monc->mutex);
+
+	if (monc->have_mdsmap)
+		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+	if (monc->have_osdmap)
+		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+	if (monc->want_next_osdmap)
+		seq_printf(s, "want next osdmap\n");
+
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		op = le16_to_cpu(req->request->hdr.type);
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
+	}
+
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *p;
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		struct ceph_osd_request *req;
+		struct ceph_osd_request_head *head;
+		struct ceph_osd_op *op;
+		int num_ops;
+		int opcode, olen;
+		int i;
+
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1,
+			   le32_to_cpu(req->r_pgid.pool),
+			   le16_to_cpu(req->r_pgid.ps));
+
+		head = req->r_request->front.iov_base;
+		op = (void *)(head + 1);
+
+		num_ops = le16_to_cpu(head->num_ops);
+		olen = le32_to_cpu(head->object_len);
+		seq_printf(s, "%.*s", olen,
+			   (const char *)(head->ops + num_ops));
+
+		if (req->r_reassert_version.epoch)
+			seq_printf(s, "\t%u'%llu",
+			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+			   le64_to_cpu(req->r_reassert_version.version));
+		else
+			seq_printf(s, "\t");
+
+		for (i = 0; i < num_ops; i++) {
+			opcode = le16_to_cpu(op->op);
+			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+			op++;
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&osdc->request_mutex);
+	return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+int ceph_debugfs_init(void)
+{
+	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+	if (!ceph_debugfs_dir)
+		return -ENOMEM;
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+	debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	int ret = -ENOMEM;
+	char name[80];
+
+	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+		 client->monc.auth->global_id);
+
+	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+	if (!client->debugfs_dir)
+		goto out;
+
+	client->monc.debugfs_file = debugfs_create_file("monc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &monc_show_fops);
+	if (!client->monc.debugfs_file)
+		goto out;
+
+	client->osdc.debugfs_file = debugfs_create_file("osdc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &osdc_show_fops);
+	if (!client->osdc.debugfs_file)
+		goto out;
+
+	client->debugfs_monmap = debugfs_create_file("monmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&monmap_show_fops);
+	if (!client->debugfs_monmap)
+		goto out;
+
+	client->debugfs_osdmap = debugfs_create_file("osdmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&osdmap_show_fops);
+	if (!client->debugfs_osdmap)
+		goto out;
+
+	return 0;
+
+out:
+	ceph_debugfs_client_cleanup(client);
+	return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+	debugfs_remove(client->debugfs_osdmap);
+	debugfs_remove(client->debugfs_monmap);
+	debugfs_remove(client->osdc.debugfs_file);
+	debugfs_remove(client->monc.debugfs_file);
+	debugfs_remove(client->debugfs_dir);
+}
+
+#else  /* CONFIG_DEBUG_FS */
+
+int ceph_debugfs_init(void)
+{
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/fs/ceph/messenger.c b/net/ceph/messenger.c
index 2502d76fcec..0e8157ee5d4 100644
--- a/fs/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/crc32c.h>
 #include <linux/ctype.h>
@@ -9,12 +9,14 @@
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <net/tcp.h>
 
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
 
 /*
  * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -48,7 +50,7 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
 static DEFINE_SPINLOCK(addr_str_lock);
 static int last_addr_str;
 
-const char *pr_addr(const struct sockaddr_storage *ss)
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
 {
 	int i;
 	char *s;
@@ -79,6 +81,7 @@ const char *pr_addr(const struct sockaddr_storage *ss)
 
 	return s;
 }
+EXPORT_SYMBOL(ceph_pr_addr);
 
 static void encode_my_addr(struct ceph_messenger *msgr)
 {
@@ -91,7 +94,7 @@ static void encode_my_addr(struct ceph_messenger *msgr)
  */
 struct workqueue_struct *ceph_msgr_wq;
 
-int __init ceph_msgr_init(void)
+int ceph_msgr_init(void)
 {
 	ceph_msgr_wq = create_workqueue("ceph-msgr");
 	if (IS_ERR(ceph_msgr_wq)) {
@@ -102,16 +105,19 @@ int __init ceph_msgr_init(void)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(ceph_msgr_init);
 
 void ceph_msgr_exit(void)
 {
 	destroy_workqueue(ceph_msgr_wq);
 }
+EXPORT_SYMBOL(ceph_msgr_exit);
 
 void ceph_msgr_flush(void)
 {
 	flush_workqueue(ceph_msgr_wq);
 }
+EXPORT_SYMBOL(ceph_msgr_flush);
 
 
 /*
@@ -221,19 +227,19 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 
 	set_sock_callbacks(sock, con);
 
-	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
+	dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
 
 	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
 				 O_NONBLOCK);
 	if (ret == -EINPROGRESS) {
 		dout("connect %s EINPROGRESS sk_state = %u\n",
-		     pr_addr(&con->peer_addr.in_addr),
+		     ceph_pr_addr(&con->peer_addr.in_addr),
 		     sock->sk->sk_state);
 		ret = 0;
 	}
 	if (ret < 0) {
 		pr_err("connect %s error %d\n",
-		       pr_addr(&con->peer_addr.in_addr), ret);
+		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
 		sock_release(sock);
 		con->sock = NULL;
 		con->error_msg = "connect error";
@@ -334,7 +340,8 @@ static void reset_connection(struct ceph_connection *con)
  */
 void ceph_con_close(struct ceph_connection *con)
 {
-	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
+	dout("con_close %p peer %s\n", con,
+	     ceph_pr_addr(&con->peer_addr.in_addr));
 	set_bit(CLOSED, &con->state);  /* in case there's queued work */
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
 	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
@@ -347,19 +354,21 @@ void ceph_con_close(struct ceph_connection *con)
 	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
+EXPORT_SYMBOL(ceph_con_close);
 
 /*
  * Reopen a closed connection, with a new peer address.
  */
 void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
 {
-	dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
+	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
 	set_bit(OPENING, &con->state);
 	clear_bit(CLOSED, &con->state);
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
 	queue_con(con);
 }
+EXPORT_SYMBOL(ceph_con_open);
 
 /*
  * return true if this connection ever successfully opened
@@ -406,6 +415,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
 	INIT_LIST_HEAD(&con->out_sent);
 	INIT_DELAYED_WORK(&con->work, con_work);
 }
+EXPORT_SYMBOL(ceph_con_init);
 
 
 /*
@@ -529,8 +539,11 @@ static void prepare_write_message(struct ceph_connection *con)
 	if (le32_to_cpu(m->hdr.data_len) > 0) {
 		/* initialize page iterator */
 		con->out_msg_pos.page = 0;
-		con->out_msg_pos.page_pos =
-			le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		if (m->pages)
+			con->out_msg_pos.page_pos =
+				le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		else
+			con->out_msg_pos.page_pos = 0;
 		con->out_msg_pos.data_pos = 0;
 		con->out_msg_pos.did_page_crc = 0;
 		con->out_more = 1;  /* data + footer will follow */
@@ -647,7 +660,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
 	     con->connect_seq, global_seq, proto);
 
-	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
+	con->out_connect.features = cpu_to_le64(msgr->supported_features);
 	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
 	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -712,6 +725,31 @@ out:
 	return ret;  /* done! */
 }
 
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+	if (!bio) {
+		*iter = NULL;
+		*seg = 0;
+		return;
+	}
+	*iter = bio;
+	*seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+	if (*bio_iter == NULL)
+		return;
+
+	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+	(*seg)++;
+	if (*seg == (*bio_iter)->bi_vcnt)
+		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
 /*
  * Write as much message data payload as we can.  If we finish, queue
  * up the footer.
@@ -726,21 +764,46 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	size_t len;
 	int crc = con->msgr->nocrc;
 	int ret;
+	int total_max_write;
+	int in_trail = 0;
+	size_t trail_len = (msg->trail ? msg->trail->length : 0);
 
 	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
 	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
 	     con->out_msg_pos.page_pos);
 
-	while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+#ifdef CONFIG_BLOCK
+	if (msg->bio && !msg->bio_iter)
+		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+	while (data_len > con->out_msg_pos.data_pos) {
 		struct page *page = NULL;
 		void *kaddr = NULL;
+		int max_write = PAGE_SIZE;
+		int page_shift = 0;
+
+		total_max_write = data_len - trail_len -
+			con->out_msg_pos.data_pos;
 
 		/*
 		 * if we are calculating the data crc (the default), we need
 		 * to map the page.  if our pages[] has been revoked, use the
 		 * zero page.
 		 */
-		if (msg->pages) {
+
+		/* have we reached the trail part of the data? */
+		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+			in_trail = 1;
+
+			total_max_write = data_len - con->out_msg_pos.data_pos;
+
+			page = list_first_entry(&msg->trail->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
+			max_write = PAGE_SIZE;
+		} else if (msg->pages) {
 			page = msg->pages[con->out_msg_pos.page];
 			if (crc)
 				kaddr = kmap(page);
@@ -749,13 +812,25 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 						struct page, lru);
 			if (crc)
 				kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+		} else if (msg->bio) {
+			struct bio_vec *bv;
+
+			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+			page = bv->bv_page;
+			page_shift = bv->bv_offset;
+			if (crc)
+				kaddr = kmap(page) + page_shift;
+			max_write = bv->bv_len;
+#endif
 		} else {
 			page = con->msgr->zero_page;
 			if (crc)
 				kaddr = page_address(con->msgr->zero_page);
 		}
-		len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
-			  (int)(data_len - con->out_msg_pos.data_pos));
+		len = min_t(int, max_write - con->out_msg_pos.page_pos,
+			    total_max_write);
+
 		if (crc && !con->out_msg_pos.did_page_crc) {
 			void *base = kaddr + con->out_msg_pos.page_pos;
 			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
@@ -765,13 +840,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 				cpu_to_le32(crc32c(tmpcrc, base, len));
 			con->out_msg_pos.did_page_crc = 1;
 		}
-
 		ret = kernel_sendpage(con->sock, page,
-				      con->out_msg_pos.page_pos, len,
+				      con->out_msg_pos.page_pos + page_shift,
+				      len,
 				      MSG_DONTWAIT | MSG_NOSIGNAL |
 				      MSG_MORE);
 
-		if (crc && (msg->pages || msg->pagelist))
+		if (crc &&
+		    (msg->pages || msg->pagelist || msg->bio || in_trail))
 			kunmap(page);
 
 		if (ret <= 0)
@@ -783,9 +859,16 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 			con->out_msg_pos.page_pos = 0;
 			con->out_msg_pos.page++;
 			con->out_msg_pos.did_page_crc = 0;
-			if (msg->pagelist)
+			if (in_trail)
+				list_move_tail(&page->lru,
+					       &msg->trail->head);
+			else if (msg->pagelist)
 				list_move_tail(&page->lru,
 					       &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+			else if (msg->bio)
+				iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
 		}
 	}
 
@@ -938,7 +1021,7 @@ static int verify_hello(struct ceph_connection *con)
 {
 	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
 		pr_err("connect to %s got bad banner\n",
-		       pr_addr(&con->peer_addr.in_addr));
+		       ceph_pr_addr(&con->peer_addr.in_addr));
 		con->error_msg = "protocol error, bad banner";
 		return -1;
 	}
@@ -1041,7 +1124,7 @@ int ceph_parse_ips(const char *c, const char *end,
 
 		addr_set_port(ss, port);
 
-		dout("parse_ips got %s\n", pr_addr(ss));
+		dout("parse_ips got %s\n", ceph_pr_addr(ss));
 
 		if (p == end)
 			break;
@@ -1061,6 +1144,7 @@ bad:
 	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
 	return -EINVAL;
 }
+EXPORT_SYMBOL(ceph_parse_ips);
 
 static int process_banner(struct ceph_connection *con)
 {
@@ -1082,9 +1166,9 @@ static int process_banner(struct ceph_connection *con)
 	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
 	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
 		pr_warning("wrong peer, want %s/%d, got %s/%d\n",
-			   pr_addr(&con->peer_addr.in_addr),
+			   ceph_pr_addr(&con->peer_addr.in_addr),
 			   (int)le32_to_cpu(con->peer_addr.nonce),
-			   pr_addr(&con->actual_peer_addr.in_addr),
+			   ceph_pr_addr(&con->actual_peer_addr.in_addr),
 			   (int)le32_to_cpu(con->actual_peer_addr.nonce));
 		con->error_msg = "wrong peer at address";
 		return -1;
@@ -1102,7 +1186,7 @@ static int process_banner(struct ceph_connection *con)
 		addr_set_port(&con->msgr->inst.addr.in_addr, port);
 		encode_my_addr(con->msgr);
 		dout("process_banner learned my addr is %s\n",
-		     pr_addr(&con->msgr->inst.addr.in_addr));
+		     ceph_pr_addr(&con->msgr->inst.addr.in_addr));
 	}
 
 	set_bit(NEGOTIATING, &con->state);
@@ -1123,8 +1207,8 @@ static void fail_protocol(struct ceph_connection *con)
 
 static int process_connect(struct ceph_connection *con)
 {
-	u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-	u64 req_feat = CEPH_FEATURE_REQUIRED;
+	u64 sup_feat = con->msgr->supported_features;
+	u64 req_feat = con->msgr->required_features;
 	u64 server_feat = le64_to_cpu(con->in_reply.features);
 
 	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1134,7 +1218,7 @@ static int process_connect(struct ceph_connection *con)
 		pr_err("%s%lld %s feature set mismatch,"
 		       " my %llx < server's %llx, missing %llx\n",
 		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
 		       sup_feat, server_feat, server_feat & ~sup_feat);
 		con->error_msg = "missing required protocol features";
 		fail_protocol(con);
@@ -1144,7 +1228,7 @@ static int process_connect(struct ceph_connection *con)
 		pr_err("%s%lld %s protocol version mismatch,"
 		       " my %d != server's %d\n",
 		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
 		       le32_to_cpu(con->out_connect.protocol_version),
 		       le32_to_cpu(con->in_reply.protocol_version));
 		con->error_msg = "protocol version mismatch";
@@ -1178,7 +1262,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_connect.connect_seq));
 		pr_err("%s%lld %s connection reset\n",
 		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr));
+		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
 		prepare_write_connect(con->msgr, con, 0);
 		prepare_read_connect(con);
@@ -1223,7 +1307,7 @@ static int process_connect(struct ceph_connection *con)
 			pr_err("%s%lld %s protocol feature mismatch,"
 			       " my required %llx > server's %llx, need %llx\n",
 			       ENTITY_NAME(con->peer_name),
-			       pr_addr(&con->peer_addr.in_addr),
+			       ceph_pr_addr(&con->peer_addr.in_addr),
 			       req_feat, server_feat, req_feat & ~server_feat);
 			con->error_msg = "missing required protocol features";
 			fail_protocol(con);
@@ -1305,8 +1389,7 @@ static int read_partial_message_section(struct ceph_connection *con,
 					struct kvec *section,
 					unsigned int sec_len, u32 *crc)
 {
-	int left;
-	int ret;
+	int ret, left;
 
 	BUG_ON(!section);
 
@@ -1329,13 +1412,83 @@ static int read_partial_message_section(struct ceph_connection *con,
 static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 				struct ceph_msg_header *hdr,
 				int *skip);
+
+
+static int read_partial_message_pages(struct ceph_connection *con,
+				      struct page **pages,
+				      unsigned data_len, int datacrc)
+{
+	void *p;
+	int ret;
+	int left;
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+	/* (page) data */
+	BUG_ON(pages == NULL);
+	p = kmap(pages[con->in_msg_pos.page]);
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(pages[con->in_msg_pos.page]);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+		con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.page++;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_BLOCK
+static int read_partial_message_bio(struct ceph_connection *con,
+				    struct bio **bio_iter, int *bio_seg,
+				    unsigned data_len, int datacrc)
+{
+	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+	void *p;
+	int ret, left;
+
+	if (IS_ERR(bv))
+		return PTR_ERR(bv);
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+	p = kmap(bv->bv_page) + bv->bv_offset;
+
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(bv->bv_page);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == bv->bv_len) {
+		con->in_msg_pos.page_pos = 0;
+		iter_bio_next(bio_iter, bio_seg);
+	}
+
+	return ret;
+}
+#endif
+
 /*
  * read (part of) a message.
  */
 static int read_partial_message(struct ceph_connection *con)
 {
 	struct ceph_msg *m = con->in_msg;
-	void *p;
 	int ret;
 	int to, left;
 	unsigned front_len, middle_len, data_len, data_off;
@@ -1381,7 +1534,7 @@ static int read_partial_message(struct ceph_connection *con)
 	if ((s64)seq - (s64)con->in_seq < 1) {
 		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
 			ENTITY_NAME(con->peer_name),
-			pr_addr(&con->peer_addr.in_addr),
+			ceph_pr_addr(&con->peer_addr.in_addr),
 			seq, con->in_seq + 1);
 		con->in_base_pos = -front_len - middle_len - data_len -
 			sizeof(m->footer);
@@ -1422,7 +1575,10 @@ static int read_partial_message(struct ceph_connection *con)
 			m->middle->vec.iov_len = 0;
 
 		con->in_msg_pos.page = 0;
-		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		if (m->pages)
+			con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		else
+			con->in_msg_pos.page_pos = 0;
 		con->in_msg_pos.data_pos = 0;
 	}
 
@@ -1440,27 +1596,29 @@ static int read_partial_message(struct ceph_connection *con)
 		if (ret <= 0)
 			return ret;
 	}
+#ifdef CONFIG_BLOCK
+	if (m->bio && !m->bio_iter)
+		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
 
 	/* (page) data */
 	while (con->in_msg_pos.data_pos < data_len) {
-		left = min((int)(data_len - con->in_msg_pos.data_pos),
-			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-		BUG_ON(m->pages == NULL);
-		p = kmap(m->pages[con->in_msg_pos.page]);
-		ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-				       left);
-		if (ret > 0 && datacrc)
-			con->in_data_crc =
-				crc32c(con->in_data_crc,
-					  p + con->in_msg_pos.page_pos, ret);
-		kunmap(m->pages[con->in_msg_pos.page]);
-		if (ret <= 0)
-			return ret;
-		con->in_msg_pos.data_pos += ret;
-		con->in_msg_pos.page_pos += ret;
-		if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-			con->in_msg_pos.page_pos = 0;
-			con->in_msg_pos.page++;
+		if (m->pages) {
+			ret = read_partial_message_pages(con, m->pages,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#ifdef CONFIG_BLOCK
+		} else if (m->bio) {
+
+			ret = read_partial_message_bio(con,
+						 &m->bio_iter, &m->bio_seg,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#endif
+		} else {
+			BUG_ON(1);
 		}
 	}
 
@@ -1874,9 +2032,9 @@ out:
 static void ceph_fault(struct ceph_connection *con)
 {
 	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-	       pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
 	dout("fault %p state %lu to peer %s\n",
-	     con, con->state, pr_addr(&con->peer_addr.in_addr));
+	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
 
 	if (test_bit(LOSSYTX, &con->state)) {
 		dout("fault on LOSSYTX channel\n");
@@ -1936,7 +2094,9 @@ out:
 /*
  * create a new messenger instance
  */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
+					     u32 supported_features,
+					     u32 required_features)
 {
 	struct ceph_messenger *msgr;
 
@@ -1944,6 +2104,9 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 	if (msgr == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	msgr->supported_features = supported_features;
+	msgr->required_features = required_features;
+
 	spin_lock_init(&msgr->global_seq_lock);
 
 	/* the zero page is needed if a request is "canceled" while the message
@@ -1966,6 +2129,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 	dout("messenger_create %p\n", msgr);
 	return msgr;
 }
+EXPORT_SYMBOL(ceph_messenger_create);
 
 void ceph_messenger_destroy(struct ceph_messenger *msgr)
 {
@@ -1975,6 +2139,7 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
 	kfree(msgr);
 	dout("destroyed messenger %p\n", msgr);
 }
+EXPORT_SYMBOL(ceph_messenger_destroy);
 
 /*
  * Queue up an outgoing message on the given connection.
@@ -2011,6 +2176,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
 		queue_con(con);
 }
+EXPORT_SYMBOL(ceph_con_send);
 
 /*
  * Revoke a message that was previously queued for send
@@ -2076,6 +2242,7 @@ void ceph_con_keepalive(struct ceph_connection *con)
 	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
 		queue_con(con);
 }
+EXPORT_SYMBOL(ceph_con_keepalive);
 
 
 /*
@@ -2136,6 +2303,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
 	m->nr_pages = 0;
 	m->pages = NULL;
 	m->pagelist = NULL;
+	m->bio = NULL;
+	m->bio_iter = NULL;
+	m->bio_seg = 0;
+	m->trail = NULL;
 
 	dout("ceph_msg_new %p front %d\n", m, front_len);
 	return m;
@@ -2146,6 +2317,7 @@ out:
 	pr_err("msg_new can't create type %d front %d\n", type, front_len);
 	return NULL;
 }
+EXPORT_SYMBOL(ceph_msg_new);
 
 /*
  * Allocate "middle" portion of a message, if it is needed and wasn't
@@ -2250,11 +2422,14 @@ void ceph_msg_last_put(struct kref *kref)
 		m->pagelist = NULL;
 	}
 
+	m->trail = NULL;
+
 	if (m->pool)
 		ceph_msgpool_put(m->pool, m);
 	else
 		ceph_msg_kfree(m);
 }
+EXPORT_SYMBOL(ceph_msg_last_put);
 
 void ceph_msg_dump(struct ceph_msg *msg)
 {
@@ -2275,3 +2450,4 @@ void ceph_msg_dump(struct ceph_msg *msg)
 		       DUMP_PREFIX_OFFSET, 16, 1,
 		       &msg->footer, sizeof(msg->footer), true);
 }
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/fs/ceph/mon_client.c b/net/ceph/mon_client.c
index b2a5a3e4a67..8a079399174 100644
--- a/fs/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1,14 +1,16 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/sched.h>
 
-#include "mon_client.h"
-#include "super.h"
-#include "auth.h"
-#include "decode.h"
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/ceph/auth.h>
 
 /*
  * Interact with Ceph monitor cluster.  Handle requests for new map
@@ -74,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	     m->num_mon);
 	for (i = 0; i < m->num_mon; i++)
 		dout("monmap_decode  mon%d is %s\n", i,
-		     pr_addr(&m->mon_inst[i].addr.in_addr));
+		     ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
 	return m;
 
 bad:
@@ -191,30 +193,33 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		struct ceph_msg *msg = monc->m_subscribe;
 		struct ceph_mon_subscribe_item *i;
 		void *p, *end;
+		int num;
 
 		p = msg->front.iov_base;
 		end = p + msg->front_max;
 
-		dout("__send_subscribe to 'mdsmap' %u+\n",
-		     (unsigned)monc->have_mdsmap);
+		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+		ceph_encode_32(&p, num);
+
 		if (monc->want_next_osdmap) {
 			dout("__send_subscribe to 'osdmap' %u\n",
 			     (unsigned)monc->have_osdmap);
-			ceph_encode_32(&p, 3);
 			ceph_encode_string(&p, end, "osdmap", 6);
 			i = p;
 			i->have = cpu_to_le64(monc->have_osdmap);
 			i->onetime = 1;
 			p += sizeof(*i);
 			monc->want_next_osdmap = 2;  /* requested */
-		} else {
-			ceph_encode_32(&p, 2);
 		}
-		ceph_encode_string(&p, end, "mdsmap", 6);
-		i = p;
-		i->have = cpu_to_le64(monc->have_mdsmap);
-		i->onetime = 0;
-		p += sizeof(*i);
+		if (monc->want_mdsmap) {
+			dout("__send_subscribe to 'mdsmap' %u+\n",
+			     (unsigned)monc->have_mdsmap);
+			ceph_encode_string(&p, end, "mdsmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_mdsmap);
+			i->onetime = 0;
+			p += sizeof(*i);
+		}
 		ceph_encode_string(&p, end, "monmap", 6);
 		i = p;
 		i->have = 0;
@@ -243,7 +248,8 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 	mutex_lock(&monc->mutex);
 	if (monc->hunting) {
 		pr_info("mon%d %s session established\n",
-			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
+			monc->cur_mon,
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
 		monc->hunting = false;
 	}
 	dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -266,6 +272,7 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
 	mutex_unlock(&monc->mutex);
 	return 0;
 }
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
 
 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
 {
@@ -310,6 +317,7 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
 	mutex_unlock(&monc->mutex);
 	return 0;
 }
+EXPORT_SYMBOL(ceph_monc_open_session);
 
 /*
  * The monitor responds with mount ack indicate mount success.  The
@@ -540,6 +548,7 @@ out:
 	kref_put(&req->kref, release_generic_request);
 	return err;
 }
+EXPORT_SYMBOL(ceph_monc_do_statfs);
 
 /*
  * pool ops
@@ -651,6 +660,7 @@ int ceph_monc_create_snapid(struct ceph_mon_client *monc,
 				   pool, 0, (char *)snapid, sizeof(*snapid));
 
 }
+EXPORT_SYMBOL(ceph_monc_create_snapid);
 
 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 snapid)
@@ -708,9 +718,9 @@ static void delayed_work(struct work_struct *work)
  */
 static int build_initial_monmap(struct ceph_mon_client *monc)
 {
-	struct ceph_mount_args *args = monc->client->mount_args;
-	struct ceph_entity_addr *mon_addr = args->mon_addr;
-	int num_mon = args->num_mon;
+	struct ceph_options *opt = monc->client->options;
+	struct ceph_entity_addr *mon_addr = opt->mon_addr;
+	int num_mon = opt->num_mon;
 	int i;
 
 	/* build initial monmap */
@@ -728,11 +738,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 	}
 	monc->monmap->num_mon = num_mon;
 	monc->have_fsid = false;
-
-	/* release addr memory */
-	kfree(args->mon_addr);
-	args->mon_addr = NULL;
-	args->num_mon = 0;
 	return 0;
 }
 
@@ -753,8 +758,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->con = NULL;
 
 	/* authentication */
-	monc->auth = ceph_auth_init(cl->mount_args->name,
-				    cl->mount_args->secret);
+	monc->auth = ceph_auth_init(cl->options->name,
+				    cl->options->secret);
 	if (IS_ERR(monc->auth))
 		return PTR_ERR(monc->auth);
 	monc->auth->want_keys =
@@ -808,6 +813,7 @@ out_monmap:
 out:
 	return err;
 }
+EXPORT_SYMBOL(ceph_monc_init);
 
 void ceph_monc_stop(struct ceph_mon_client *monc)
 {
@@ -832,6 +838,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
 	kfree(monc->monmap);
 }
+EXPORT_SYMBOL(ceph_monc_stop);
 
 static void handle_auth_reply(struct ceph_mon_client *monc,
 			      struct ceph_msg *msg)
@@ -889,6 +896,7 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
 	mutex_unlock(&monc->mutex);
 	return ret;
 }
+EXPORT_SYMBOL(ceph_monc_validate_auth);
 
 /*
  * handle incoming message
@@ -922,15 +930,16 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		ceph_monc_handle_map(monc, msg);
 		break;
 
-	case CEPH_MSG_MDS_MAP:
-		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
-		break;
-
 	case CEPH_MSG_OSD_MAP:
 		ceph_osdc_handle_map(&monc->client->osdc, msg);
 		break;
 
 	default:
+		/* can the chained handler handle it? */
+		if (monc->client->extra_mon_dispatch &&
+		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+			break;
+			
 		pr_err("received unknown message type %d %s\n", type,
 		       ceph_msg_type_name(type));
 	}
@@ -994,7 +1003,7 @@ static void mon_fault(struct ceph_connection *con)
 	if (monc->con && !monc->hunting)
 		pr_info("mon%d %s session lost, "
 			"hunting for new mon\n", monc->cur_mon,
-			pr_addr(&monc->con->peer_addr.in_addr));
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
 
 	__close_session(monc);
 	if (!monc->hunting) {
diff --git a/fs/ceph/msgpool.c b/net/ceph/msgpool.c
index dd65a643813..d5f2d97ac05 100644
--- a/fs/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
-#include "msgpool.h"
+#include <linux/ceph/msgpool.h>
 
 static void *alloc_fn(gfp_t gfp_mask, void *arg)
 {
diff --git a/fs/ceph/osd_client.c b/net/ceph/osd_client.c
index 3b5571b8ce2..79391994b3e 100644
--- a/fs/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1,17 +1,22 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/err.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
 
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
 
 #define OSD_OP_FRONT_LEN	4096
 #define OSD_OPREPLY_FRONT_LEN	512
@@ -22,6 +27,59 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
 
+static int op_needs_trail(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+	case CEPH_OSD_OP_CALL:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int op_has_extent(int op)
+{
+	return (op == CEPH_OSD_OP_READ ||
+		op == CEPH_OSD_OP_WRITE);
+}
+
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	u64 orig_len = *plen;
+	u64 objoff, objlen;    /* extent in object */
+
+	reqhead->snapid = cpu_to_le64(snapid);
+
+	/* object extent? */
+	ceph_calc_file_object_mapping(layout, off, plen, bno,
+				      &objoff, &objlen);
+	if (*plen < orig_len)
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+
+	if (op_has_extent(op->op)) {
+		op->extent.offset = objoff;
+		op->extent.length = objlen;
+	}
+	req->r_num_pages = calc_pages_for(off, *plen);
+	if (op->op == CEPH_OSD_OP_WRITE)
+		op->payload_len = *plen;
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     *bno, objoff, objlen, req->r_num_pages);
+
+}
+EXPORT_SYMBOL(ceph_calc_raw_layout);
+
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -48,34 +106,19 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
  * fill osd op in request message.
  */
 static void calc_layout(struct ceph_osd_client *osdc,
-			struct ceph_vino vino, struct ceph_file_layout *layout,
+			struct ceph_vino vino,
+			struct ceph_file_layout *layout,
 			u64 off, u64 *plen,
-			struct ceph_osd_request *req)
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
 {
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_osd_op *op = (void *)(reqhead + 1);
-	u64 orig_len = *plen;
-	u64 objoff, objlen;    /* extent in object */
 	u64 bno;
 
-	reqhead->snapid = cpu_to_le64(vino.snap);
-
-	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, plen, &bno,
-				      &objoff, &objlen);
-	if (*plen < orig_len)
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - *plen, off, *plen);
+	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+			     plen, &bno, req, op);
 
 	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
 	req->r_oid_len = strlen(req->r_oid);
-
-	op->extent.offset = cpu_to_le64(objoff);
-	op->extent.length = cpu_to_le64(objlen);
-	req->r_num_pages = calc_pages_for(off, *plen);
-
-	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
-	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
 }
 
 /*
@@ -101,56 +144,66 @@ void ceph_osdc_release_request(struct kref *kref)
 	if (req->r_own_pages)
 		ceph_release_page_vector(req->r_pages,
 					 req->r_num_pages);
+#ifdef CONFIG_BLOCK
+	if (req->r_bio)
+		bio_put(req->r_bio);
+#endif
 	ceph_put_snap_context(req->r_snapc);
+	if (req->r_trail) {
+		ceph_pagelist_release(req->r_trail);
+		kfree(req->r_trail);
+	}
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else
 		kfree(req);
 }
+EXPORT_SYMBOL(ceph_osdc_release_request);
 
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-					       struct ceph_file_layout *layout,
-					       struct ceph_vino vino,
-					       u64 off, u64 *plen,
-					       int opcode, int flags,
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+	int i = 0;
+
+	if (needs_trail)
+		*needs_trail = 0;
+	while (ops[i].op) {
+		if (needs_trail && op_needs_trail(ops[i].op))
+			*needs_trail = 1;
+		i++;
+	}
+
+	return i;
+}
+
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
 					       struct ceph_snap_context *snapc,
-					       int do_sync,
-					       u32 truncate_seq,
-					       u64 truncate_size,
-					       struct timespec *mtime,
-					       bool use_mempool, int num_reply)
+					       struct ceph_osd_req_op *ops,
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages,
+					       struct bio *bio)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	struct ceph_osd_request_head *head;
-	struct ceph_osd_op *op;
-	void *p;
-	int num_op = 1 + do_sync;
-	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-	int i;
+	int needs_trail;
+	int num_op = get_num_ops(ops, &needs_trail);
+	size_t msg_size = sizeof(struct ceph_osd_request_head);
+
+	msg_size += num_op*sizeof(struct ceph_osd_op);
 
 	if (use_mempool) {
-		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+		req = mempool_alloc(osdc->req_mempool, gfp_flags);
 		memset(req, 0, sizeof(*req));
 	} else {
-		req = kzalloc(sizeof(*req), GFP_NOFS);
+		req = kzalloc(sizeof(*req), gfp_flags);
 	}
 	if (req == NULL)
 		return NULL;
 
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
+
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
@@ -164,13 +217,22 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-				   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
+				   OSD_OPREPLY_FRONT_LEN, gfp_flags);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
 	}
 	req->r_reply = msg;
 
+	/* allocate space for the trailing data */
+	if (needs_trail) {
+		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+		if (!req->r_trail) {
+			ceph_osdc_put_request(req);
+			return NULL;
+		}
+		ceph_pagelist_init(req->r_trail);
+	}
 	/* create request message; allow space for oid */
 	msg_size += 40;
 	if (snapc)
@@ -178,18 +240,115 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
 	}
+
 	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+	req->r_request = msg;
+	req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+	if (bio) {
+		req->r_bio = bio;
+		bio_get(req->r_bio);
+	}
+#endif
+
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+static void osd_req_encode_op(struct ceph_osd_request *req,
+			      struct ceph_osd_op *dst,
+			      struct ceph_osd_req_op *src)
+{
+	dst->op = cpu_to_le16(src->op);
+
+	switch (dst->op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		dst->extent.offset =
+			cpu_to_le64(src->extent.offset);
+		dst->extent.length =
+			cpu_to_le64(src->extent.length);
+		dst->extent.truncate_size =
+			cpu_to_le64(src->extent.truncate_size);
+		dst->extent.truncate_seq =
+			cpu_to_le32(src->extent.truncate_seq);
+		break;
+
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+		BUG_ON(!req->r_trail);
+
+		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+		dst->xattr.cmp_op = src->xattr.cmp_op;
+		dst->xattr.cmp_mode = src->xattr.cmp_mode;
+		ceph_pagelist_append(req->r_trail, src->xattr.name,
+				     src->xattr.name_len);
+		ceph_pagelist_append(req->r_trail, src->xattr.val,
+				     src->xattr.value_len);
+		break;
+	case CEPH_OSD_OP_CALL:
+		BUG_ON(!req->r_trail);
+
+		dst->cls.class_len = src->cls.class_len;
+		dst->cls.method_len = src->cls.method_len;
+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+		ceph_pagelist_append(req->r_trail, src->cls.class_name,
+				     src->cls.class_len);
+		ceph_pagelist_append(req->r_trail, src->cls.method_name,
+				     src->cls.method_len);
+		ceph_pagelist_append(req->r_trail, src->cls.indata,
+				     src->cls.indata_len);
+		break;
+	case CEPH_OSD_OP_ROLLBACK:
+		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+		break;
+	case CEPH_OSD_OP_STARTSYNC:
+		break;
+	default:
+		pr_err("unrecognized osd opcode %d\n", dst->op);
+		WARN_ON(1);
+		break;
+	}
+	dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
+/*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+			     u64 off, u64 *plen,
+			     struct ceph_osd_req_op *src_ops,
+			     struct ceph_snap_context *snapc,
+			     struct timespec *mtime,
+			     const char *oid,
+			     int oid_len)
+{
+	struct ceph_msg *msg = req->r_request;
+	struct ceph_osd_request_head *head;
+	struct ceph_osd_req_op *src_op;
+	struct ceph_osd_op *op;
+	void *p;
+	int num_op = get_num_ops(src_ops, NULL);
+	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	int flags = req->r_flags;
+	u64 data_len = 0;
+	int i;
+
 	head = msg->front.iov_base;
 	op = (void *)(head + 1);
 	p = (void *)(op + num_op);
 
-	req->r_request = msg;
 	req->r_snapc = ceph_get_snap_context(snapc);
 
 	head->client_inc = cpu_to_le32(1); /* always, for now. */
@@ -197,29 +356,23 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (flags & CEPH_OSD_FLAG_WRITE)
 		ceph_encode_timespec(&head->mtime, mtime);
 	head->num_ops = cpu_to_le16(num_op);
-	op->op = cpu_to_le16(opcode);
 
-	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req);
-	req->r_file_layout = *layout;  /* keep a copy */
-
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(*plen);
-		op->payload_len = cpu_to_le32(*plen);
-	}
-	op->extent.truncate_size = cpu_to_le64(truncate_size);
-	op->extent.truncate_seq = cpu_to_le32(truncate_seq);
 
 	/* fill in oid */
-	head->object_len = cpu_to_le32(req->r_oid_len);
-	memcpy(p, req->r_oid, req->r_oid_len);
-	p += req->r_oid_len;
-
-	if (do_sync) {
+	head->object_len = cpu_to_le32(oid_len);
+	memcpy(p, oid, oid_len);
+	p += oid_len;
+
+	src_op = src_ops;
+	while (src_op->op) {
+		osd_req_encode_op(req, op, src_op);
+		src_op++;
 		op++;
-		op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
 	}
+
+	if (req->r_trail)
+		data_len += req->r_trail->length;
+
 	if (snapc) {
 		head->snap_seq = cpu_to_le64(snapc->seq);
 		head->num_snaps = cpu_to_le32(snapc->num_snaps);
@@ -229,12 +382,79 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		}
 	}
 
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		req->r_request->hdr.data_off = cpu_to_le16(off);
+		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+	} else if (data_len) {
+		req->r_request->hdr.data_off = 0;
+		req->r_request->hdr.data_len = cpu_to_le32(data_len);
+	}
+
 	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
 	msg_size = p - msg->front.iov_base;
 	msg->front.iov_len = msg_size;
 	msg->hdr.front_len = cpu_to_le32(msg_size);
+	return;
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+					       struct ceph_file_layout *layout,
+					       struct ceph_vino vino,
+					       u64 off, u64 *plen,
+					       int opcode, int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       u32 truncate_seq,
+					       u64 truncate_size,
+					       struct timespec *mtime,
+					       bool use_mempool, int num_reply)
+{
+	struct ceph_osd_req_op ops[3];
+	struct ceph_osd_request *req;
+
+	ops[0].op = opcode;
+	ops[0].extent.truncate_seq = truncate_seq;
+	ops[0].extent.truncate_size = truncate_size;
+	ops[0].payload_len = 0;
+
+	if (do_sync) {
+		ops[1].op = CEPH_OSD_OP_STARTSYNC;
+		ops[1].payload_len = 0;
+		ops[2].op = 0;
+	} else
+		ops[1].op = 0;
+
+	req = ceph_osdc_alloc_request(osdc, flags,
+					 snapc, ops,
+					 use_mempool,
+					 GFP_NOFS, NULL, NULL);
+	if (IS_ERR(req))
+		return req;
+
+	/* calculate max write size */
+	calc_layout(osdc, vino, layout, off, plen, req, ops);
+	req->r_file_layout = *layout;  /* keep a copy */
+
+	ceph_osdc_build_request(req, off, plen, ops,
+				snapc,
+				mtime,
+				req->r_oid, req->r_oid_len);
+
 	return req;
 }
+EXPORT_SYMBOL(ceph_osdc_new_request);
 
 /*
  * We keep osd requests in an rbtree, sorted by ->r_tid.
@@ -389,7 +609,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
 	dout("__move_osd_to_lru %p\n", osd);
 	BUG_ON(!list_empty(&osd->o_osd_lru));
 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-	osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
+	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
 }
 
 static void __remove_osd_from_lru(struct ceph_osd *osd)
@@ -483,7 +703,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
 static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
 {
 	schedule_delayed_work(&osdc->timeout_work,
-			osdc->client->mount_args->osd_keepalive_timeout * HZ);
+			osdc->client->options->osd_keepalive_timeout * HZ);
 }
 
 static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -684,9 +904,9 @@ static void handle_timeout(struct work_struct *work)
 		container_of(work, struct ceph_osd_client, timeout_work.work);
 	struct ceph_osd_request *req, *last_req = NULL;
 	struct ceph_osd *osd;
-	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
+	unsigned long timeout = osdc->client->options->osd_timeout * HZ;
 	unsigned long keepalive =
-		osdc->client->mount_args->osd_keepalive_timeout * HZ;
+		osdc->client->options->osd_keepalive_timeout * HZ;
 	unsigned long last_stamp = 0;
 	struct rb_node *p;
 	struct list_head slow_osds;
@@ -773,7 +993,7 @@ static void handle_osds_timeout(struct work_struct *work)
 		container_of(work, struct ceph_osd_client,
 			     osds_timeout_work.work);
 	unsigned long delay =
-		osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
+		osdc->client->options->osd_idle_ttl * HZ >> 2;
 
 	dout("osds timeout\n");
 	down_read(&osdc->map_sem);
@@ -1104,6 +1324,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 
 	req->r_request->pages = req->r_pages;
 	req->r_request->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+	req->r_request->bio = req->r_bio;
+#endif
+	req->r_request->trail = req->r_trail;
 
 	register_request(osdc, req);
 
@@ -1131,6 +1355,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 	up_read(&osdc->map_sem);
 	return rc;
 }
+EXPORT_SYMBOL(ceph_osdc_start_request);
 
 /*
  * wait for a request to complete
@@ -1153,6 +1378,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
 	return req->r_result;
 }
+EXPORT_SYMBOL(ceph_osdc_wait_request);
 
 /*
  * sync - wait for all in-flight requests to flush.  avoid starvation.
@@ -1186,6 +1412,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
 	mutex_unlock(&osdc->request_mutex);
 	dout("sync done (thru tid %llu)\n", last_tid);
 }
+EXPORT_SYMBOL(ceph_osdc_sync);
 
 /*
  * init, shutdown
@@ -1211,7 +1438,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
 
 	schedule_delayed_work(&osdc->osds_timeout_work,
-	   round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
+	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
 
 	err = -ENOMEM;
 	osdc->req_mempool = mempool_create_kmalloc_pool(10,
@@ -1237,6 +1464,7 @@ out_mempool:
 out:
 	return err;
 }
+EXPORT_SYMBOL(ceph_osdc_init);
 
 void ceph_osdc_stop(struct ceph_osd_client *osdc)
 {
@@ -1251,6 +1479,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
 }
+EXPORT_SYMBOL(ceph_osdc_stop);
 
 /*
  * Read some contiguous pages.  If we cross a stripe boundary, shorten
@@ -1288,6 +1517,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 	dout("readpages result %d\n", rc);
 	return rc;
 }
+EXPORT_SYMBOL(ceph_osdc_readpages);
 
 /*
  * do a synchronous write on N pages
@@ -1330,6 +1560,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	dout("writepages result %d\n", rc);
 	return rc;
 }
+EXPORT_SYMBOL(ceph_osdc_writepages);
 
 /*
  * handle incoming message
@@ -1420,6 +1651,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		}
 		m->pages = req->r_pages;
 		m->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+		m->bio = req->r_bio;
+#endif
 	}
 	*skip = 0;
 	req->r_con_filling_msg = ceph_con_get(con);
diff --git a/fs/ceph/osdmap.c b/net/ceph/osdmap.c
index e31f118f139..d73f3f6efa3 100644
--- a/fs/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1,14 +1,15 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/div64.h>
 
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
 
 char *ceph_osdmap_state_str(char *str, int len, int state)
 {
@@ -417,6 +418,20 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+	struct rb_node *rbp;
+
+	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rbp, struct ceph_pg_pool_info, node);
+		if (pi->name && strcmp(pi->name, name) == 0)
+			return pi->id;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
 static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
 {
 	rb_erase(&pi->node, root);
@@ -966,6 +981,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
 }
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
 
 /*
  * calculate an object layout (i.e. pgid) from an oid,
@@ -1011,6 +1027,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
 	return 0;
 }
+EXPORT_SYMBOL(ceph_calc_object_layout);
 
 /*
  * Calculate raw osd vector for the given pgid.  Return pointer to osd
@@ -1108,3 +1125,4 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 			return osds[i];
 	return -1;
 }
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 00000000000..13cb409a7bb
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+	if (pl->mapped_tail) {
+		struct page *page = list_entry(pl->head.prev, struct page, lru);
+		kunmap(page);
+		pl->mapped_tail = NULL;
+	}
+}
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+	ceph_pagelist_unmap_tail(pl);
+	while (!list_empty(&pl->head)) {
+		struct page *page = list_first_entry(&pl->head, struct page,
+						     lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	ceph_pagelist_free_reserve(pl);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+	struct page *page;
+
+	if (!pl->num_pages_free) {
+		page = __page_cache_alloc(GFP_NOFS);
+	} else {
+		page = list_first_entry(&pl->free_list, struct page, lru);
+		list_del(&page->lru);
+		--pl->num_pages_free;
+	}
+	if (!page)
+		return -ENOMEM;
+	pl->room += PAGE_SIZE;
+	ceph_pagelist_unmap_tail(pl);
+	list_add_tail(&page->lru, &pl->head);
+	pl->mapped_tail = kmap(page);
+	return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+	while (pl->room < len) {
+		size_t bit = pl->room;
+		int ret;
+
+		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+		       buf, bit);
+		pl->length += bit;
+		pl->room -= bit;
+		buf += bit;
+		len -= bit;
+		ret = ceph_pagelist_addpage(pl);
+		if (ret)
+			return ret;
+	}
+
+	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+	pl->length += len;
+	pl->room -= len;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/**
+ * Allocate enough pages for a pagelist to append the given amount
+ * of data without without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+	if (space <= pl->room)
+		return 0;
+	space -= pl->room;
+	space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT;   /* conv to num pages */
+
+	while (space > pl->num_pages_free) {
+		struct page *page = __page_cache_alloc(GFP_NOFS);
+		if (!page)
+			return -ENOMEM;
+		list_add_tail(&page->lru, &pl->free_list);
+		++pl->num_pages_free;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/**
+ * Free any pages that have been preallocated.
+ */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+	while (!list_empty(&pl->free_list)) {
+		struct page *page = list_first_entry(&pl->free_list,
+						     struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+		--pl->num_pages_free;
+	}
+	BUG_ON(pl->num_pages_free);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
+
+/**
+ * Create a truncation point.
+ */
+void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+			      struct ceph_pagelist_cursor *c)
+{
+	c->pl = pl;
+	c->page_lru = pl->head.prev;
+	c->room = pl->room;
+}
+EXPORT_SYMBOL(ceph_pagelist_set_cursor);
+
+/**
+ * Truncate a pagelist to the given point. Move extra pages to reserve.
+ * This won't sleep.
+ * Returns: 0 on success,
+ *          -EINVAL if the pagelist doesn't match the trunc point pagelist
+ */
+int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+			   struct ceph_pagelist_cursor *c)
+{
+	struct page *page;
+
+	if (pl != c->pl)
+		return -EINVAL;
+	ceph_pagelist_unmap_tail(pl);
+	while (pl->head.prev != c->page_lru) {
+		page = list_entry(pl->head.prev, struct page, lru);
+		list_del(&page->lru);                /* remove from pagelist */
+		list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
+		++pl->num_pages_free;
+	}
+	pl->room = c->room;
+	if (!list_empty(&pl->head)) {
+		page = list_entry(pl->head.prev, struct page, lru);
+		pl->mapped_tail = kmap(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 00000000000..54caf068715
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/*
+ * build a vector of user pages
+ */
+struct page **ceph_get_direct_page_vector(const char __user *data,
+						 int num_pages,
+						 loff_t off, size_t len)
+{
+	struct page **pages;
+	int rc;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	down_read(&current->mm->mmap_sem);
+	rc = get_user_pages(current, current->mm, (unsigned long)data,
+			    num_pages, 0, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (rc < 0)
+		goto fail;
+	return pages;
+
+fail:
+	kfree(pages);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+void ceph_put_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		__free_pages(pages[i], 0);
+	kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+	struct page **pages;
+	int i;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, flags);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = __page_cache_alloc(flags);
+		if (pages[i] == NULL) {
+			ceph_release_page_vector(pages, i);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/*
+ * copy user data into a page vector
+ */
+int ceph_copy_user_to_page_vector(struct page **pages,
+					 const char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, PAGE_CACHE_SIZE-po, left);
+		bad = copy_from_user(page_address(pages[i]) + po, data, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		po += l - bad;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		memcpy(page_address(pages[i]) + po, data, l);
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		memcpy(data, page_address(pages[i]) + po, l);
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+int ceph_copy_page_vector_to_user(struct page **pages,
+					 char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, left, PAGE_CACHE_SIZE-po);
+		bad = copy_to_user(data, page_address(pages[i]) + po, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		if (po) {
+			po += l - bad;
+			if (po == PAGE_CACHE_SIZE)
+				po = 0;
+		}
+		i++;
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
+
+/*
+ * Zero an extent within a page vector.  Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+	int i = off >> PAGE_CACHE_SHIFT;
+
+	off &= ~PAGE_CACHE_MASK;
+
+	dout("zero_page_vector_page %u~%u\n", off, len);
+
+	/* leading partial page? */
+	if (off) {
+		int end = min((int)PAGE_CACHE_SIZE, off + len);
+		dout("zeroing %d %p head from %d\n", i, pages[i],
+		     (int)off);
+		zero_user_segment(pages[i], off, end);
+		len -= (end - off);
+		i++;
+	}
+	while (len >= PAGE_CACHE_SIZE) {
+		dout("zeroing %d %p len=%d\n", i, pages[i], len);
+		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		i++;
+	}
+	/* trailing partial page? */
+	if (len) {
+		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+		zero_user_segment(pages[i], 0, len);
+	}
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 251997a9548..282806ba7a5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 	unlock_sock_fast(sk, slow);
 
 	/* skb is now orphaned, can be freed outside of locked section */
+	trace_kfree_skb(skb, skb_free_datagram_locked);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(skb_free_datagram_locked);
diff --git a/net/core/dev.c b/net/core/dev.c
index 660dd41aaaa..7ec85e27bee 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,6 +128,8 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
 #include <linux/pci.h>
 
 #include "net-sysfs.h"
@@ -1978,6 +1980,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		rc = ops->ndo_start_xmit(skb, dev);
+		trace_net_dev_xmit(skb, rc);
 		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		return rc;
@@ -1998,6 +2001,7 @@ gso:
 			skb_dst_drop(nskb);
 
 		rc = ops->ndo_start_xmit(nskb, dev);
+		trace_net_dev_xmit(nskb, rc);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
 				goto out_kfree_gso_skb;
@@ -2186,6 +2190,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
 #endif
+	trace_net_dev_queue(skb);
 	if (q->enqueue) {
 		rc = __dev_xmit_skb(skb, q, dev, txq);
 		goto out;
@@ -2512,6 +2517,7 @@ int netif_rx(struct sk_buff *skb)
 	if (netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	trace_netif_rx(skb);
 #ifdef CONFIG_RPS
 	{
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2577,7 @@ static void net_tx_action(struct softirq_action *h)
 			clist = clist->next;
 
 			WARN_ON(atomic_read(&skb->users));
+			trace_kfree_skb(skb, net_tx_action);
 			__kfree_skb(skb);
 		}
 	}
@@ -2828,6 +2835,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	trace_netif_receive_skb(skb);
 	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
 		return NET_RX_SUCCESS;
 
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380ed88..7f1bb2aba03 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/skb.h>
+#include <trace/events/net.h>
 #include <trace/events/napi.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c83b421341c..56ba3c4e476 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb)
 		smp_rmb();
 	else if (likely(!atomic_dec_and_test(&skb->users)))
 		return;
+	trace_consume_skb(skb);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(consume_skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index ef30e9d286e..7d99e13148e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 #ifdef CONFIG_CGROUPS
 void sock_update_classid(struct sock *sk)
 {
-	u32 classid = task_cls_classid(current);
+	u32 classid;
 
+	rcu_read_lock();  /* doing current task, which cannot vanish. */
+	classid = task_cls_classid(current);
+	rcu_read_unlock();
 	if (classid && classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d6..37f8adb68c7 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/percpu.h>
+#include <linux/security.h>
 #include <net/net_namespace.h>
 
 #include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	int ret;
+	u32 len;
+	char *secctx;
+
+	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+	if (ret)
+		return ret;
+
+	ret = seq_printf(s, "secctx=%s ", secctx);
+
+	security_release_secctx(secctx, len);
+	return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif
+
 static int ct_seq_show(struct seq_file *s, void *v)
 {
 	struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		goto release;
 #endif
 
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-	if (seq_printf(s, "secmark=%u ", ct->secmark))
+	if (ct_show_secctx(s, ct))
 		goto release;
-#endif
 
 	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
 		goto release;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93..957c9241fb0 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
 static struct nf_conntrack_l3proto *l3proto __read_mostly;
 
 #define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
 						__read_mostly;
 
 static inline const struct nf_nat_protocol *
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d33bf..fdaec7daff1 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
 
 static DEFINE_MUTEX(afinfo_mutex);
 
-const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
 EXPORT_SYMBOL(nf_afinfo);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc7649476..5702de35e2b 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
 
 static DEFINE_MUTEX(nf_ct_ecache_mutex);
 
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
 
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
 EXPORT_SYMBOL_GPL(nf_expect_event_cb);
 
 /* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 8d9e4c949b9..bd82450c193 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
 #include <linux/skbuff.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 
-static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM];
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
 static DEFINE_MUTEX(nf_ct_ext_type_mutex);
 
 void __nf_ct_ext_destroy(struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5bae1cd15ee..146476c6441 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
 #include <linux/rculist_nulls.h>
 #include <linux/types.h>
 #include <linux/timer.h>
+#include <linux/security.h>
 #include <linux/skbuff.h>
 #include <linux/errno.h>
 #include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
 
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
 static inline int
-ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
 {
-	NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark));
-	return 0;
+	struct nlattr *nest_secctx;
+	int len, ret;
+	char *secctx;
+
+	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+	if (ret)
+		return ret;
+
+	ret = -1;
+	nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+	if (!nest_secctx)
+		goto nla_put_failure;
+
+	NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+	nla_nest_end(skb, nest_secctx);
 
+	ret = 0;
 nla_put_failure:
-	return -1;
+	security_release_secctx(secctx, len);
+	return ret;
 }
 #else
-#define ctnetlink_dump_secmark(a, b) (0)
+#define ctnetlink_dump_secctx(a, b) (0)
 #endif
 
 #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_mark(skb, ct) < 0 ||
-	    ctnetlink_dump_secmark(skb, ct) < 0 ||
+	    ctnetlink_dump_secctx(skb, ct) < 0 ||
 	    ctnetlink_dump_id(skb, ct) < 0 ||
 	    ctnetlink_dump_use(skb, ct) < 0 ||
 	    ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
 	       ;
 }
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
+{
+	int len;
+
+	security_secid_to_secctx(ct->secmark, NULL, &len);
+
+	return sizeof(char) * len;
+}
+#endif
+
 static inline size_t
 ctnetlink_nlmsg_size(const struct nf_conn *ct)
 {
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
 	       + nla_total_size(0) /* CTA_HELP */
 	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
-	       + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */
+	       + nla_total_size(0) /* CTA_SECCTX */
+	       + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
 #endif
 #ifdef CONFIG_NF_NAT_NEEDED
 	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
 		if ((events & (1 << IPCT_SECMARK) || ct->secmark)
-		    && ctnetlink_dump_secmark(skb, ct) < 0)
+		    && ctnetlink_dump_secctx(skb, ct) < 0)
 			goto nla_put_failure;
 #endif
 
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1d52a..ed6d9295802 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
 
-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
-struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ct_l3protos);
 
 static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index eb973fcd67a..0fb65705b44 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
 #include <linux/seq_file.h>
 #include <linux/percpu.h>
 #include <linux/netdevice.h>
+#include <linux/security.h>
 #include <net/net_namespace.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	int ret;
+	u32 len;
+	char *secctx;
+
+	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+	if (ret)
+		return ret;
+
+	ret = seq_printf(s, "secctx=%s ", secctx);
+
+	security_release_secctx(secctx, len);
+	return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif
+
 /* return 0 on success, 1 in case of error */
 static int ct_seq_show(struct seq_file *s, void *v)
 {
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		goto release;
 #endif
 
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-	if (seq_printf(s, "secmark=%u ", ct->secmark))
+	if (ct_show_secctx(s, ct))
 		goto release;
-#endif
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd786b..b07393eab88 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
 #define NF_LOG_PREFIXLEN		128
 #define NFLOGGER_NAME_LEN		64
 
-static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
 static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
 static DEFINE_MUTEX(nf_log_mutex);
 
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9c519..74aebed5bd2 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
  * long term mutex.  The handler must provide an an outfn() to accept packets
  * for queueing and must reinject all packets it receives, no matter what.
  */
-static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
 
 static DEFINE_MUTEX(queue_handler_mutex);
 
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 0cb6053f02f..782e51986a6 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/skbuff.h>
-#include <linux/selinux.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 23b2d6c486b..9faf5e050b7 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
+#include <linux/security.h>
 #include <linux/skbuff.h>
-#include <linux/selinux.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_SECMARK.h>
 
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	switch (mode) {
 	case SECMARK_MODE_SEL:
-		secmark = info->u.sel.selsid;
+		secmark = info->secid;
 		break;
-
 	default:
 		BUG();
 	}
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static int checkentry_selinux(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info *info)
 {
 	int err;
-	struct xt_secmark_target_selinux_info *sel = &info->u.sel;
 
-	sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0';
+	info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+	info->secid = 0;
 
-	err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+	err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+				       &info->secid);
 	if (err) {
 		if (err == -EINVAL)
-			pr_info("invalid SELinux context \'%s\'\n",
-				sel->selctx);
+			pr_info("invalid security context \'%s\'\n", info->secctx);
 		return err;
 	}
 
-	if (!sel->selsid) {
-		pr_info("unable to map SELinux context \'%s\'\n", sel->selctx);
+	if (!info->secid) {
+		pr_info("unable to map security context \'%s\'\n", info->secctx);
 		return -ENOENT;
 	}
 
-	err = selinux_secmark_relabel_packet_permission(sel->selsid);
+	err = security_secmark_relabel_packet(info->secid);
 	if (err) {
 		pr_info("unable to obtain relabeling permission\n");
 		return err;
 	}
 
-	selinux_secmark_refcount_inc();
+	security_secmark_refcount_inc();
 	return 0;
 }
 
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
 
 	switch (info->mode) {
 	case SECMARK_MODE_SEL:
-		err = checkentry_selinux(info);
-		if (err <= 0)
-			return err;
 		break;
-
 	default:
 		pr_info("invalid mode: %hu\n", info->mode);
 		return -EINVAL;
 	}
 
+	err = checkentry_lsm(info);
+	if (err)
+		return err;
+
 	if (!mode)
 		mode = info->mode;
 	return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
 {
 	switch (mode) {
 	case SECMARK_MODE_SEL:
-		selinux_secmark_refcount_dec();
+		security_secmark_refcount_dec();
 	}
 }
 
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 78ef2c5e130..37dff78e9cb 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 	 * calls by looking at the number of nested bh disable calls because
 	 * softirqs always disables bh.
 	 */
-	if (softirq_count() != SOFTIRQ_OFFSET) {
+	if (in_serving_softirq()) {
 		/* If there is an sk_classid we'll use that. */
 		if (!skb->sk)
 			return -1;
diff --git a/scripts/Makefile b/scripts/Makefile
index 842dbc2d5ae..2e088109fbd 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -11,6 +11,7 @@ hostprogs-$(CONFIG_KALLSYMS)     += kallsyms
 hostprogs-$(CONFIG_LOGO)         += pnmtologo
 hostprogs-$(CONFIG_VT)           += conmakehash
 hostprogs-$(CONFIG_IKCONFIG)     += bin2c
+hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount
 
 always		:= $(hostprogs-y) $(hostprogs-m)
 
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index a1a5cf95a68..843bd4f4ffc 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -209,12 +209,22 @@ cmd_modversions =								\
 endif
 
 ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ifdef BUILD_C_RECORDMCOUNT
+# Due to recursion, we must skip empty.o.
+# The empty.o file is created in the make process in order to determine
+#  the target endianness and word size. It is made before all other C
+#  files, including recordmcount.
+cmd_record_mcount = if [ $(@) != "scripts/mod/empty.o" ]; then			\
+			$(objtree)/scripts/recordmcount "$(@)";			\
+		    fi;
+else
 cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
 	"$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \
 	"$(if $(CONFIG_64BIT),64,32)" \
 	"$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \
 	"$(if $(part-of-module),1,0)" "$(@)";
 endif
+endif
 
 define rule_cc_o_c
 	$(call echo-cmd,checksrc) $(cmd_checksrc)			  \
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 54fd1b70013..7bfcf1a09ac 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -101,14 +101,6 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
                  -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
-#hash values
-ifdef CONFIG_DYNAMIC_DEBUG
-debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\
-              -D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))"
-else
-debug_flags =
-endif
-
 orig_c_flags   = $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(KBUILD_SUBDIR_CCFLAGS) \
                  $(ccflags-y) $(CFLAGS_$(basetarget).o)
 _c_flags       = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags))
@@ -152,8 +144,7 @@ endif
 
 c_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
 		 $(__c_flags) $(modkern_cflags)                           \
-		 -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags) \
-		  $(debug_flags)
+		 -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags)
 
 a_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
 		 $(__a_flags) $(modkern_aflags)
diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile
index 09559951df1..4c324a1f1e0 100644
--- a/scripts/basic/Makefile
+++ b/scripts/basic/Makefile
@@ -9,7 +9,7 @@
 # fixdep: 	 Used to generate dependency information during build process
 # docproc:	 Used in Documentation/DocBook
 
-hostprogs-y	:= fixdep docproc hash
+hostprogs-y	:= fixdep docproc
 always		:= $(hostprogs-y)
 
 # fixdep is needed to compile other host programs
diff --git a/scripts/basic/hash.c b/scripts/basic/hash.c
deleted file mode 100644
index 2ef5d3f666b..00000000000
--- a/scripts/basic/hash.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Jason Baron <jbaron@redhat.com>
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define DYNAMIC_DEBUG_HASH_BITS 6
-
-static const char *program;
-
-static void usage(void)
-{
-	printf("Usage: %s <djb2|r5> <modname>\n", program);
-	exit(1);
-}
-
-/* djb2 hashing algorithm by Dan Bernstein. From:
- * http://www.cse.yorku.ca/~oz/hash.html
- */
-
-static unsigned int djb2_hash(char *str)
-{
-	unsigned long hash = 5381;
-	int c;
-
-	c = *str;
-	while (c) {
-		hash = ((hash << 5) + hash) + c;
-		c = *++str;
-	}
-	return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1));
-}
-
-static unsigned int r5_hash(char *str)
-{
-	unsigned long hash = 0;
-	int c;
-
-	c = *str;
-	while (c) {
-		hash = (hash + (c << 4) + (c >> 4)) * 11;
-		c = *++str;
-	}
-	return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1));
-}
-
-int main(int argc, char *argv[])
-{
-	program = argv[0];
-
-	if (argc != 3)
-		usage();
-	if (!strcmp(argv[1], "djb2"))
-		printf("%d\n", djb2_hash(argv[2]));
-	else if (!strcmp(argv[1], "r5"))
-		printf("%d\n", r5_hash(argv[2]));
-	else
-		usage();
-	exit(0);
-}
-
diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh
new file mode 100644
index 00000000000..520d16b1ffa
--- /dev/null
+++ b/scripts/gcc-goto.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Test for gcc 'asm goto' suport
+# Copyright (C) 2010, Jason Baron <jbaron@redhat.com>
+
+echo "int main(void) { entry: asm goto (\"\"::::entry); return 0; }" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y"
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
new file mode 100644
index 00000000000..26e1271259b
--- /dev/null
+++ b/scripts/recordmcount.c
@@ -0,0 +1,363 @@
+/*
+ * recordmcount.c: construct a table of the locations of calls to 'mcount'
+ * so that ftrace can find them quickly.
+ * Copyright 2009 John F. Reiser <jreiser@BitWagon.com>.  All rights reserved.
+ * Licensed under the GNU General Public License, version 2 (GPLv2).
+ *
+ * Restructured to fit Linux format, as well as other updates:
+ *  Copyright 2010 Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
+ */
+
+/*
+ * Strategy: alter the .o file in-place.
+ *
+ * Append a new STRTAB that has the new section names, followed by a new array
+ * ElfXX_Shdr[] that has the new section headers, followed by the section
+ * contents for __mcount_loc and its relocations.  The old shstrtab strings,
+ * and the old ElfXX_Shdr[] array, remain as "garbage" (commonly, a couple
+ * kilobytes.)  Subsequent processing by /bin/ld (or the kernel module loader)
+ * will ignore the garbage regions, because they are not designated by the
+ * new .e_shoff nor the new ElfXX_Shdr[].  [In order to remove the garbage,
+ * then use "ld -r" to create a new file that omits the garbage.]
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <setjmp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static int fd_map;	/* File descriptor for file being modified. */
+static int mmap_failed; /* Boolean flag. */
+static void *ehdr_curr; /* current ElfXX_Ehdr *  for resource cleanup */
+static char gpfx;	/* prefix for global symbol name (sometimes '_') */
+static struct stat sb;	/* Remember .st_size, etc. */
+static jmp_buf jmpenv;	/* setjmp/longjmp per-file error escape */
+
+/* setjmp() return values */
+enum {
+	SJ_SETJMP = 0,  /* hardwired first return */
+	SJ_FAIL,
+	SJ_SUCCEED
+};
+
+/* Per-file resource cleanup when multiple files. */
+static void
+cleanup(void)
+{
+	if (!mmap_failed)
+		munmap(ehdr_curr, sb.st_size);
+	else
+		free(ehdr_curr);
+	close(fd_map);
+}
+
+static void __attribute__((noreturn))
+fail_file(void)
+{
+	cleanup();
+	longjmp(jmpenv, SJ_FAIL);
+}
+
+static void __attribute__((noreturn))
+succeed_file(void)
+{
+	cleanup();
+	longjmp(jmpenv, SJ_SUCCEED);
+}
+
+/* ulseek, uread, ...:  Check return value for errors. */
+
+static off_t
+ulseek(int const fd, off_t const offset, int const whence)
+{
+	off_t const w = lseek(fd, offset, whence);
+	if ((off_t)-1 == w) {
+		perror("lseek");
+		fail_file();
+	}
+	return w;
+}
+
+static size_t
+uread(int const fd, void *const buf, size_t const count)
+{
+	size_t const n = read(fd, buf, count);
+	if (n != count) {
+		perror("read");
+		fail_file();
+	}
+	return n;
+}
+
+static size_t
+uwrite(int const fd, void const *const buf, size_t const count)
+{
+	size_t const n = write(fd, buf, count);
+	if (n != count) {
+		perror("write");
+		fail_file();
+	}
+	return n;
+}
+
+static void *
+umalloc(size_t size)
+{
+	void *const addr = malloc(size);
+	if (0 == addr) {
+		fprintf(stderr, "malloc failed: %zu bytes\n", size);
+		fail_file();
+	}
+	return addr;
+}
+
+/*
+ * Get the whole file as a programming convenience in order to avoid
+ * malloc+lseek+read+free of many pieces.  If successful, then mmap
+ * avoids copying unused pieces; else just read the whole file.
+ * Open for both read and write; new info will be appended to the file.
+ * Use MAP_PRIVATE so that a few changes to the in-memory ElfXX_Ehdr
+ * do not propagate to the file until an explicit overwrite at the last.
+ * This preserves most aspects of consistency (all except .st_size)
+ * for simultaneous readers of the file while we are appending to it.
+ * However, multiple writers still are bad.  We choose not to use
+ * locking because it is expensive and the use case of kernel build
+ * makes multiple writers unlikely.
+ */
+static void *mmap_file(char const *fname)
+{
+	void *addr;
+
+	fd_map = open(fname, O_RDWR);
+	if (0 > fd_map || 0 > fstat(fd_map, &sb)) {
+		perror(fname);
+		fail_file();
+	}
+	if (!S_ISREG(sb.st_mode)) {
+		fprintf(stderr, "not a regular file: %s\n", fname);
+		fail_file();
+	}
+	addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE,
+		    fd_map, 0);
+	mmap_failed = 0;
+	if (MAP_FAILED == addr) {
+		mmap_failed = 1;
+		addr = umalloc(sb.st_size);
+		uread(fd_map, addr, sb.st_size);
+	}
+	return addr;
+}
+
+/* w8rev, w8nat, ...: Handle endianness. */
+
+static uint64_t w8rev(uint64_t const x)
+{
+	return   ((0xff & (x >> (0 * 8))) << (7 * 8))
+	       | ((0xff & (x >> (1 * 8))) << (6 * 8))
+	       | ((0xff & (x >> (2 * 8))) << (5 * 8))
+	       | ((0xff & (x >> (3 * 8))) << (4 * 8))
+	       | ((0xff & (x >> (4 * 8))) << (3 * 8))
+	       | ((0xff & (x >> (5 * 8))) << (2 * 8))
+	       | ((0xff & (x >> (6 * 8))) << (1 * 8))
+	       | ((0xff & (x >> (7 * 8))) << (0 * 8));
+}
+
+static uint32_t w4rev(uint32_t const x)
+{
+	return   ((0xff & (x >> (0 * 8))) << (3 * 8))
+	       | ((0xff & (x >> (1 * 8))) << (2 * 8))
+	       | ((0xff & (x >> (2 * 8))) << (1 * 8))
+	       | ((0xff & (x >> (3 * 8))) << (0 * 8));
+}
+
+static uint32_t w2rev(uint16_t const x)
+{
+	return   ((0xff & (x >> (0 * 8))) << (1 * 8))
+	       | ((0xff & (x >> (1 * 8))) << (0 * 8));
+}
+
+static uint64_t w8nat(uint64_t const x)
+{
+	return x;
+}
+
+static uint32_t w4nat(uint32_t const x)
+{
+	return x;
+}
+
+static uint32_t w2nat(uint16_t const x)
+{
+	return x;
+}
+
+static uint64_t (*w8)(uint64_t);
+static uint32_t (*w)(uint32_t);
+static uint32_t (*w2)(uint16_t);
+
+/* Names of the sections that could contain calls to mcount. */
+static int
+is_mcounted_section_name(char const *const txtname)
+{
+	return 0 == strcmp(".text",          txtname) ||
+		0 == strcmp(".sched.text",    txtname) ||
+		0 == strcmp(".spinlock.text", txtname) ||
+		0 == strcmp(".irqentry.text", txtname) ||
+		0 == strcmp(".text.unlikely", txtname);
+}
+
+/* 32 bit and 64 bit are very similar */
+#include "recordmcount.h"
+#define RECORD_MCOUNT_64
+#include "recordmcount.h"
+
+static void
+do_file(char const *const fname)
+{
+	Elf32_Ehdr *const ehdr = mmap_file(fname);
+	unsigned int reltype = 0;
+
+	ehdr_curr = ehdr;
+	w = w4nat;
+	w2 = w2nat;
+	w8 = w8nat;
+	switch (ehdr->e_ident[EI_DATA]) {
+		static unsigned int const endian = 1;
+	default: {
+		fprintf(stderr, "unrecognized ELF data encoding %d: %s\n",
+			ehdr->e_ident[EI_DATA], fname);
+		fail_file();
+	} break;
+	case ELFDATA2LSB: {
+		if (1 != *(unsigned char const *)&endian) {
+			/* main() is big endian, file.o is little endian. */
+			w = w4rev;
+			w2 = w2rev;
+			w8 = w8rev;
+		}
+	} break;
+	case ELFDATA2MSB: {
+		if (0 != *(unsigned char const *)&endian) {
+			/* main() is little endian, file.o is big endian. */
+			w = w4rev;
+			w2 = w2rev;
+			w8 = w8rev;
+		}
+	} break;
+	}  /* end switch */
+	if (0 != memcmp(ELFMAG, ehdr->e_ident, SELFMAG)
+	||  ET_REL != w2(ehdr->e_type)
+	||  EV_CURRENT != ehdr->e_ident[EI_VERSION]) {
+		fprintf(stderr, "unrecognized ET_REL file %s\n", fname);
+		fail_file();
+	}
+
+	gpfx = 0;
+	switch (w2(ehdr->e_machine)) {
+	default: {
+		fprintf(stderr, "unrecognized e_machine %d %s\n",
+			w2(ehdr->e_machine), fname);
+		fail_file();
+	} break;
+	case EM_386:	 reltype = R_386_32;                   break;
+	case EM_ARM:	 reltype = R_ARM_ABS32;                break;
+	case EM_IA_64:	 reltype = R_IA64_IMM64;   gpfx = '_'; break;
+	case EM_PPC:	 reltype = R_PPC_ADDR32;   gpfx = '_'; break;
+	case EM_PPC64:	 reltype = R_PPC64_ADDR64; gpfx = '_'; break;
+	case EM_S390:    /* reltype: e_class    */ gpfx = '_'; break;
+	case EM_SH:	 reltype = R_SH_DIR32;                 break;
+	case EM_SPARCV9: reltype = R_SPARC_64;     gpfx = '_'; break;
+	case EM_X86_64:	 reltype = R_X86_64_64;                break;
+	}  /* end switch */
+
+	switch (ehdr->e_ident[EI_CLASS]) {
+	default: {
+		fprintf(stderr, "unrecognized ELF class %d %s\n",
+			ehdr->e_ident[EI_CLASS], fname);
+		fail_file();
+	} break;
+	case ELFCLASS32: {
+		if (sizeof(Elf32_Ehdr) != w2(ehdr->e_ehsize)
+		||  sizeof(Elf32_Shdr) != w2(ehdr->e_shentsize)) {
+			fprintf(stderr,
+				"unrecognized ET_REL file: %s\n", fname);
+			fail_file();
+		}
+		if (EM_S390 == w2(ehdr->e_machine))
+			reltype = R_390_32;
+		do32(ehdr, fname, reltype);
+	} break;
+	case ELFCLASS64: {
+		Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr;
+		if (sizeof(Elf64_Ehdr) != w2(ghdr->e_ehsize)
+		||  sizeof(Elf64_Shdr) != w2(ghdr->e_shentsize)) {
+			fprintf(stderr,
+				"unrecognized ET_REL file: %s\n", fname);
+			fail_file();
+		}
+		if (EM_S390 == w2(ghdr->e_machine))
+			reltype = R_390_64;
+		do64(ghdr, fname, reltype);
+	} break;
+	}  /* end switch */
+
+	cleanup();
+}
+
+int
+main(int argc, char const *argv[])
+{
+	const char ftrace[] = "kernel/trace/ftrace.o";
+	int ftrace_size = sizeof(ftrace) - 1;
+	int n_error = 0;  /* gcc-4.3.0 false positive complaint */
+
+	if (argc <= 1) {
+		fprintf(stderr, "usage: recordmcount file.o...\n");
+		return 0;
+	}
+
+	/* Process each file in turn, allowing deep failure. */
+	for (--argc, ++argv; 0 < argc; --argc, ++argv) {
+		int const sjval = setjmp(jmpenv);
+		int len;
+
+		/*
+		 * The file kernel/trace/ftrace.o references the mcount
+		 * function but does not call it. Since ftrace.o should
+		 * not be traced anyway, we just skip it.
+		 */
+		len = strlen(argv[0]);
+		if (len >= ftrace_size &&
+		    strcmp(argv[0] + (len - ftrace_size), ftrace) == 0)
+			continue;
+
+		switch (sjval) {
+		default: {
+			fprintf(stderr, "internal error: %s\n", argv[0]);
+			exit(1);
+		} break;
+		case SJ_SETJMP: {  /* normal sequence */
+			/* Avoid problems if early cleanup() */
+			fd_map = -1;
+			ehdr_curr = NULL;
+			mmap_failed = 1;
+			do_file(argv[0]);
+		} break;
+		case SJ_FAIL: {  /* error in do_file or below */
+			++n_error;
+		} break;
+		case SJ_SUCCEED: {  /* premature success */
+			/* do nothing */
+		} break;
+		}  /* end switch */
+	}
+	return !!n_error;
+}
+
+
diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h
new file mode 100644
index 00000000000..7f39d0943d2
--- /dev/null
+++ b/scripts/recordmcount.h
@@ -0,0 +1,366 @@
+/*
+ * recordmcount.h
+ *
+ * This code was taken out of recordmcount.c written by
+ * Copyright 2009 John F. Reiser <jreiser@BitWagon.com>.  All rights reserved.
+ *
+ * The original code had the same algorithms for both 32bit
+ * and 64bit ELF files, but the code was duplicated to support
+ * the difference in structures that were used. This
+ * file creates a macro of everything that is different between
+ * the 64 and 32 bit code, such that by including this header
+ * twice we can create both sets of functions by including this
+ * header once with RECORD_MCOUNT_64 undefined, and again with
+ * it defined.
+ *
+ * This conversion to macros was done by:
+ * Copyright 2010 Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
+ *
+ * Licensed under the GNU General Public License, version 2 (GPLv2).
+ */
+#undef append_func
+#undef sift_rel_mcount
+#undef find_secsym_ndx
+#undef __has_rel_mcount
+#undef has_rel_mcount
+#undef tot_relsize
+#undef do_func
+#undef Elf_Ehdr
+#undef Elf_Shdr
+#undef Elf_Rel
+#undef Elf_Rela
+#undef Elf_Sym
+#undef ELF_R_SYM
+#undef ELF_R_INFO
+#undef ELF_ST_BIND
+#undef uint_t
+#undef _w
+#undef _align
+#undef _size
+
+#ifdef RECORD_MCOUNT_64
+# define append_func		append64
+# define sift_rel_mcount	sift64_rel_mcount
+# define find_secsym_ndx	find64_secsym_ndx
+# define __has_rel_mcount	__has64_rel_mcount
+# define has_rel_mcount		has64_rel_mcount
+# define tot_relsize		tot64_relsize
+# define do_func		do64
+# define Elf_Ehdr		Elf64_Ehdr
+# define Elf_Shdr		Elf64_Shdr
+# define Elf_Rel		Elf64_Rel
+# define Elf_Rela		Elf64_Rela
+# define Elf_Sym		Elf64_Sym
+# define ELF_R_SYM		ELF64_R_SYM
+# define ELF_R_INFO		ELF64_R_INFO
+# define ELF_ST_BIND		ELF64_ST_BIND
+# define uint_t			uint64_t
+# define _w			w8
+# define _align			7u
+# define _size			8
+#else
+# define append_func		append32
+# define sift_rel_mcount	sift32_rel_mcount
+# define find_secsym_ndx	find32_secsym_ndx
+# define __has_rel_mcount	__has32_rel_mcount
+# define has_rel_mcount		has32_rel_mcount
+# define tot_relsize		tot32_relsize
+# define do_func		do32
+# define Elf_Ehdr		Elf32_Ehdr
+# define Elf_Shdr		Elf32_Shdr
+# define Elf_Rel		Elf32_Rel
+# define Elf_Rela		Elf32_Rela
+# define Elf_Sym		Elf32_Sym
+# define ELF_R_SYM		ELF32_R_SYM
+# define ELF_R_INFO		ELF32_R_INFO
+# define ELF_ST_BIND		ELF32_ST_BIND
+# define uint_t			uint32_t
+# define _w			w
+# define _align			3u
+# define _size			4
+#endif
+
+/* Append the new shstrtab, Elf_Shdr[], __mcount_loc and its relocations. */
+static void append_func(Elf_Ehdr *const ehdr,
+			Elf_Shdr *const shstr,
+			uint_t const *const mloc0,
+			uint_t const *const mlocp,
+			Elf_Rel const *const mrel0,
+			Elf_Rel const *const mrelp,
+			unsigned int const rel_entsize,
+			unsigned int const symsec_sh_link)
+{
+	/* Begin constructing output file */
+	Elf_Shdr mcsec;
+	char const *mc_name = (sizeof(Elf_Rela) == rel_entsize)
+		? ".rela__mcount_loc"
+		:  ".rel__mcount_loc";
+	unsigned const old_shnum = w2(ehdr->e_shnum);
+	uint_t const old_shoff = _w(ehdr->e_shoff);
+	uint_t const old_shstr_sh_size   = _w(shstr->sh_size);
+	uint_t const old_shstr_sh_offset = _w(shstr->sh_offset);
+	uint_t t = 1 + strlen(mc_name) + _w(shstr->sh_size);
+	uint_t new_e_shoff;
+
+	shstr->sh_size = _w(t);
+	shstr->sh_offset = _w(sb.st_size);
+	t += sb.st_size;
+	t += (_align & -t);  /* word-byte align */
+	new_e_shoff = t;
+
+	/* body for new shstrtab */
+	ulseek(fd_map, sb.st_size, SEEK_SET);
+	uwrite(fd_map, old_shstr_sh_offset + (void *)ehdr, old_shstr_sh_size);
+	uwrite(fd_map, mc_name, 1 + strlen(mc_name));
+
+	/* old(modified) Elf_Shdr table, word-byte aligned */
+	ulseek(fd_map, t, SEEK_SET);
+	t += sizeof(Elf_Shdr) * old_shnum;
+	uwrite(fd_map, old_shoff + (void *)ehdr,
+	       sizeof(Elf_Shdr) * old_shnum);
+
+	/* new sections __mcount_loc and .rel__mcount_loc */
+	t += 2*sizeof(mcsec);
+	mcsec.sh_name = w((sizeof(Elf_Rela) == rel_entsize) + strlen(".rel")
+		+ old_shstr_sh_size);
+	mcsec.sh_type = w(SHT_PROGBITS);
+	mcsec.sh_flags = _w(SHF_ALLOC);
+	mcsec.sh_addr = 0;
+	mcsec.sh_offset = _w(t);
+	mcsec.sh_size = _w((void *)mlocp - (void *)mloc0);
+	mcsec.sh_link = 0;
+	mcsec.sh_info = 0;
+	mcsec.sh_addralign = _w(_size);
+	mcsec.sh_entsize = _w(_size);
+	uwrite(fd_map, &mcsec, sizeof(mcsec));
+
+	mcsec.sh_name = w(old_shstr_sh_size);
+	mcsec.sh_type = (sizeof(Elf_Rela) == rel_entsize)
+		? w(SHT_RELA)
+		: w(SHT_REL);
+	mcsec.sh_flags = 0;
+	mcsec.sh_addr = 0;
+	mcsec.sh_offset = _w((void *)mlocp - (void *)mloc0 + t);
+	mcsec.sh_size   = _w((void *)mrelp - (void *)mrel0);
+	mcsec.sh_link = w(symsec_sh_link);
+	mcsec.sh_info = w(old_shnum);
+	mcsec.sh_addralign = _w(_size);
+	mcsec.sh_entsize = _w(rel_entsize);
+	uwrite(fd_map, &mcsec, sizeof(mcsec));
+
+	uwrite(fd_map, mloc0, (void *)mlocp - (void *)mloc0);
+	uwrite(fd_map, mrel0, (void *)mrelp - (void *)mrel0);
+
+	ehdr->e_shoff = _w(new_e_shoff);
+	ehdr->e_shnum = w2(2 + w2(ehdr->e_shnum));  /* {.rel,}__mcount_loc */
+	ulseek(fd_map, 0, SEEK_SET);
+	uwrite(fd_map, ehdr, sizeof(*ehdr));
+}
+
+
+/*
+ * Look at the relocations in order to find the calls to mcount.
+ * Accumulate the section offsets that are found, and their relocation info,
+ * onto the end of the existing arrays.
+ */
+static uint_t *sift_rel_mcount(uint_t *mlocp,
+			       unsigned const offbase,
+			       Elf_Rel **const mrelpp,
+			       Elf_Shdr const *const relhdr,
+			       Elf_Ehdr const *const ehdr,
+			       unsigned const recsym,
+			       uint_t const recval,
+			       unsigned const reltype)
+{
+	uint_t *const mloc0 = mlocp;
+	Elf_Rel *mrelp = *mrelpp;
+	Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff)
+		+ (void *)ehdr);
+	unsigned const symsec_sh_link = w(relhdr->sh_link);
+	Elf_Shdr const *const symsec = &shdr0[symsec_sh_link];
+	Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset)
+		+ (void *)ehdr);
+
+	Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)];
+	char const *const str0 = (char const *)(_w(strsec->sh_offset)
+		+ (void *)ehdr);
+
+	Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset)
+		+ (void *)ehdr);
+	unsigned rel_entsize = _w(relhdr->sh_entsize);
+	unsigned const nrel = _w(relhdr->sh_size) / rel_entsize;
+	Elf_Rel const *relp = rel0;
+
+	unsigned mcountsym = 0;
+	unsigned t;
+
+	for (t = nrel; t; --t) {
+		if (!mcountsym) {
+			Elf_Sym const *const symp =
+				&sym0[ELF_R_SYM(_w(relp->r_info))];
+			char const *symname = &str0[w(symp->st_name)];
+
+			if ('.' == symname[0])
+				++symname;  /* ppc64 hack */
+			if (0 == strcmp((('_' == gpfx) ? "_mcount" : "mcount"),
+					symname))
+				mcountsym = ELF_R_SYM(_w(relp->r_info));
+		}
+
+		if (mcountsym == ELF_R_SYM(_w(relp->r_info))) {
+			uint_t const addend = _w(_w(relp->r_offset) - recval);
+
+			mrelp->r_offset = _w(offbase
+				+ ((void *)mlocp - (void *)mloc0));
+			mrelp->r_info = _w(ELF_R_INFO(recsym, reltype));
+			if (sizeof(Elf_Rela) == rel_entsize) {
+				((Elf_Rela *)mrelp)->r_addend = addend;
+				*mlocp++ = 0;
+			} else
+				*mlocp++ = addend;
+
+			mrelp = (Elf_Rel *)(rel_entsize + (void *)mrelp);
+		}
+		relp = (Elf_Rel const *)(rel_entsize + (void *)relp);
+	}
+	*mrelpp = mrelp;
+	return mlocp;
+}
+
+
+/*
+ * Find a symbol in the given section, to be used as the base for relocating
+ * the table of offsets of calls to mcount.  A local or global symbol suffices,
+ * but avoid a Weak symbol because it may be overridden; the change in value
+ * would invalidate the relocations of the offsets of the calls to mcount.
+ * Often the found symbol will be the unnamed local symbol generated by
+ * GNU 'as' for the start of each section.  For example:
+ *    Num:    Value  Size Type    Bind   Vis      Ndx Name
+ *      2: 00000000     0 SECTION LOCAL  DEFAULT    1
+ */
+static unsigned find_secsym_ndx(unsigned const txtndx,
+				char const *const txtname,
+				uint_t *const recvalp,
+				Elf_Shdr const *const symhdr,
+				Elf_Ehdr const *const ehdr)
+{
+	Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symhdr->sh_offset)
+		+ (void *)ehdr);
+	unsigned const nsym = _w(symhdr->sh_size) / _w(symhdr->sh_entsize);
+	Elf_Sym const *symp;
+	unsigned t;
+
+	for (symp = sym0, t = nsym; t; --t, ++symp) {
+		unsigned int const st_bind = ELF_ST_BIND(symp->st_info);
+
+		if (txtndx == w2(symp->st_shndx)
+			/* avoid STB_WEAK */
+		    && (STB_LOCAL == st_bind || STB_GLOBAL == st_bind)) {
+			*recvalp = _w(symp->st_value);
+			return symp - sym0;
+		}
+	}
+	fprintf(stderr, "Cannot find symbol for section %d: %s.\n",
+		txtndx, txtname);
+	fail_file();
+}
+
+
+/* Evade ISO C restriction: no declaration after statement in has_rel_mcount. */
+static char const *
+__has_rel_mcount(Elf_Shdr const *const relhdr,  /* is SHT_REL or SHT_RELA */
+		 Elf_Shdr const *const shdr0,
+		 char const *const shstrtab,
+		 char const *const fname)
+{
+	/* .sh_info depends on .sh_type == SHT_REL[,A] */
+	Elf_Shdr const *const txthdr = &shdr0[w(relhdr->sh_info)];
+	char const *const txtname = &shstrtab[w(txthdr->sh_name)];
+
+	if (0 == strcmp("__mcount_loc", txtname)) {
+		fprintf(stderr, "warning: __mcount_loc already exists: %s\n",
+			fname);
+		succeed_file();
+	}
+	if (SHT_PROGBITS != w(txthdr->sh_type) ||
+	    !is_mcounted_section_name(txtname))
+		return NULL;
+	return txtname;
+}
+
+static char const *has_rel_mcount(Elf_Shdr const *const relhdr,
+				  Elf_Shdr const *const shdr0,
+				  char const *const shstrtab,
+				  char const *const fname)
+{
+	if (SHT_REL  != w(relhdr->sh_type) && SHT_RELA != w(relhdr->sh_type))
+		return NULL;
+	return __has_rel_mcount(relhdr, shdr0, shstrtab, fname);
+}
+
+
+static unsigned tot_relsize(Elf_Shdr const *const shdr0,
+			    unsigned nhdr,
+			    const char *const shstrtab,
+			    const char *const fname)
+{
+	unsigned totrelsz = 0;
+	Elf_Shdr const *shdrp = shdr0;
+
+	for (; nhdr; --nhdr, ++shdrp) {
+		if (has_rel_mcount(shdrp, shdr0, shstrtab, fname))
+			totrelsz += _w(shdrp->sh_size);
+	}
+	return totrelsz;
+}
+
+
+/* Overall supervision for Elf32 ET_REL file. */
+static void
+do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype)
+{
+	Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff)
+		+ (void *)ehdr);
+	unsigned const nhdr = w2(ehdr->e_shnum);
+	Elf_Shdr *const shstr = &shdr0[w2(ehdr->e_shstrndx)];
+	char const *const shstrtab = (char const *)(_w(shstr->sh_offset)
+		+ (void *)ehdr);
+
+	Elf_Shdr const *relhdr;
+	unsigned k;
+
+	/* Upper bound on space: assume all relevant relocs are for mcount. */
+	unsigned const totrelsz = tot_relsize(shdr0, nhdr, shstrtab, fname);
+	Elf_Rel *const mrel0 = umalloc(totrelsz);
+	Elf_Rel *      mrelp = mrel0;
+
+	/* 2*sizeof(address) <= sizeof(Elf_Rel) */
+	uint_t *const mloc0 = umalloc(totrelsz>>1);
+	uint_t *      mlocp = mloc0;
+
+	unsigned rel_entsize = 0;
+	unsigned symsec_sh_link = 0;
+
+	for (relhdr = shdr0, k = nhdr; k; --k, ++relhdr) {
+		char const *const txtname = has_rel_mcount(relhdr, shdr0,
+			shstrtab, fname);
+		if (txtname) {
+			uint_t recval = 0;
+			unsigned const recsym = find_secsym_ndx(
+				w(relhdr->sh_info), txtname, &recval,
+				&shdr0[symsec_sh_link = w(relhdr->sh_link)],
+				ehdr);
+
+			rel_entsize = _w(relhdr->sh_entsize);
+			mlocp = sift_rel_mcount(mlocp,
+				(void *)mlocp - (void *)mloc0, &mrelp,
+				relhdr, ehdr, recsym, recval, reltype);
+		}
+	}
+	if (mloc0 != mlocp) {
+		append_func(ehdr, shstr, mloc0, mlocp, mrel0, mrelp,
+			    rel_entsize, symsec_sh_link);
+	}
+	free(mrel0);
+	free(mloc0);
+}
diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore
index 0a0a99f3b08..4d995aeaebc 100644
--- a/security/apparmor/.gitignore
+++ b/security/apparmor/.gitignore
@@ -3,3 +3,4 @@
 #
 af_names.h
 capability_names.h
+rlim_names.h
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 7320331b44a..544ff5837cb 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -29,7 +29,7 @@
  * aa_simple_write_to_buffer - common routine for getting policy from user
  * @op: operation doing the user buffer copy
  * @userbuf: user buffer to copy data from  (NOT NULL)
- * @alloc_size: size of user buffer
+ * @alloc_size: size of user buffer (REQUIRES: @alloc_size >= @copy_size)
  * @copy_size: size of data to copy from user buffer
  * @pos: position write is at in the file (NOT NULL)
  *
@@ -42,6 +42,8 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf,
 {
 	char *data;
 
+	BUG_ON(copy_size > alloc_size);
+
 	if (*pos != 0)
 		/* only writes from pos 0, that is complete writes */
 		return ERR_PTR(-ESPIPE);
diff --git a/security/capability.c b/security/capability.c
index 95a6599a37b..30ae00fbecd 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -677,7 +677,18 @@ static void cap_inet_conn_established(struct sock *sk, struct sk_buff *skb)
 {
 }
 
+static int cap_secmark_relabel_packet(u32 secid)
+{
+	return 0;
+}
 
+static void cap_secmark_refcount_inc(void)
+{
+}
+
+static void cap_secmark_refcount_dec(void)
+{
+}
 
 static void cap_req_classify_flow(const struct request_sock *req,
 				  struct flowi *fl)
@@ -777,7 +788,8 @@ static int cap_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 
 static int cap_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
 {
-	return -EOPNOTSUPP;
+	*secid = 0;
+	return 0;
 }
 
 static void cap_release_secctx(char *secdata, u32 seclen)
@@ -1018,6 +1030,9 @@ void __init security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, inet_conn_request);
 	set_to_cap_if_null(ops, inet_csk_clone);
 	set_to_cap_if_null(ops, inet_conn_established);
+	set_to_cap_if_null(ops, secmark_relabel_packet);
+	set_to_cap_if_null(ops, secmark_refcount_inc);
+	set_to_cap_if_null(ops, secmark_refcount_dec);
 	set_to_cap_if_null(ops, req_classify_flow);
 	set_to_cap_if_null(ops, tun_dev_create);
 	set_to_cap_if_null(ops, tun_dev_post_create);
diff --git a/security/commoncap.c b/security/commoncap.c
index 9d172e6e330..5e632b4857e 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -719,14 +719,11 @@ static int cap_safe_nice(struct task_struct *p)
 /**
  * cap_task_setscheduler - Detemine if scheduler policy change is permitted
  * @p: The task to affect
- * @policy: The policy to effect
- * @lp: The parameters to the scheduling policy
  *
  * Detemine if the requested scheduler policy change is permitted for the
  * specified task, returning 0 if permission is granted, -ve if denied.
  */
-int cap_task_setscheduler(struct task_struct *p, int policy,
-			   struct sched_param *lp)
+int cap_task_setscheduler(struct task_struct *p)
 {
 	return cap_safe_nice(p);
 }
diff --git a/security/security.c b/security/security.c
index c53949f17d9..b50f472061a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -89,20 +89,12 @@ __setup("security=", choose_lsm);
  * Return true if:
  *	-The passed LSM is the one chosen by user at boot time,
  *	-or the passed LSM is configured as the default and the user did not
- *	 choose an alternate LSM at boot time,
- *	-or there is no default LSM set and the user didn't specify a
- *	 specific LSM and we're the first to ask for registration permission,
- *	-or the passed LSM is currently loaded.
+ *	 choose an alternate LSM at boot time.
  * Otherwise, return false.
  */
 int __init security_module_enable(struct security_operations *ops)
 {
-	if (!*chosen_lsm)
-		strncpy(chosen_lsm, ops->name, SECURITY_NAME_MAX);
-	else if (strncmp(ops->name, chosen_lsm, SECURITY_NAME_MAX))
-		return 0;
-
-	return 1;
+	return !strcmp(ops->name, chosen_lsm);
 }
 
 /**
@@ -786,10 +778,9 @@ int security_task_setrlimit(struct task_struct *p, unsigned int resource,
 	return security_ops->task_setrlimit(p, resource, new_rlim);
 }
 
-int security_task_setscheduler(struct task_struct *p,
-				int policy, struct sched_param *lp)
+int security_task_setscheduler(struct task_struct *p)
 {
-	return security_ops->task_setscheduler(p, policy, lp);
+	return security_ops->task_setscheduler(p);
 }
 
 int security_task_getscheduler(struct task_struct *p)
@@ -1145,6 +1136,24 @@ void security_inet_conn_established(struct sock *sk,
 	security_ops->inet_conn_established(sk, skb);
 }
 
+int security_secmark_relabel_packet(u32 secid)
+{
+	return security_ops->secmark_relabel_packet(secid);
+}
+EXPORT_SYMBOL(security_secmark_relabel_packet);
+
+void security_secmark_refcount_inc(void)
+{
+	security_ops->secmark_refcount_inc();
+}
+EXPORT_SYMBOL(security_secmark_refcount_inc);
+
+void security_secmark_refcount_dec(void)
+{
+	security_ops->secmark_refcount_dec();
+}
+EXPORT_SYMBOL(security_secmark_refcount_dec);
+
 int security_tun_dev_create(void)
 {
 	return security_ops->tun_dev_create();
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index 58d80f3bd6f..ad5cd76ec23 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -2,25 +2,20 @@
 # Makefile for building the SELinux module as part of the kernel tree.
 #
 
-obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
-
-selinux-y := avc.o \
-	     hooks.o \
-	     selinuxfs.o \
-	     netlink.o \
-	     nlmsgtab.o \
-	     netif.o \
-	     netnode.o \
-	     netport.o \
-	     exports.o
+obj-$(CONFIG_SECURITY_SELINUX) := selinux.o
+
+selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \
+	     netnode.o netport.o exports.o \
+	     ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \
+	     ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o
 
 selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
 
 selinux-$(CONFIG_NETLABEL) += netlabel.o
 
-EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include
+ccflags-y := -Isecurity/selinux -Isecurity/selinux/include
 
-$(obj)/avc.o: $(obj)/flask.h
+$(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h
 
 quiet_cmd_flask = GEN     $(obj)/flask.h $(obj)/av_permissions.h
       cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index c0a454aee1e..90664385dea 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -11,58 +11,9 @@
  * it under the terms of the GNU General Public License version 2,
  * as published by the Free Software Foundation.
  */
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/selinux.h>
-#include <linux/fs.h>
-#include <linux/ipc.h>
-#include <asm/atomic.h>
 
 #include "security.h"
-#include "objsec.h"
-
-/* SECMARK reference count */
-extern atomic_t selinux_secmark_refcount;
-
-int selinux_string_to_sid(char *str, u32 *sid)
-{
-	if (selinux_enabled)
-		return security_context_to_sid(str, strlen(str), sid);
-	else {
-		*sid = 0;
-		return 0;
-	}
-}
-EXPORT_SYMBOL_GPL(selinux_string_to_sid);
-
-int selinux_secmark_relabel_packet_permission(u32 sid)
-{
-	if (selinux_enabled) {
-		const struct task_security_struct *__tsec;
-		u32 tsid;
-
-		__tsec = current_security();
-		tsid = __tsec->sid;
-
-		return avc_has_perm(tsid, sid, SECCLASS_PACKET,
-				    PACKET__RELABELTO, NULL);
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
-
-void selinux_secmark_refcount_inc(void)
-{
-	atomic_inc(&selinux_secmark_refcount);
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
-
-void selinux_secmark_refcount_dec(void)
-{
-	atomic_dec(&selinux_secmark_refcount);
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
 
 bool selinux_is_enabled(void)
 {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4796ddd4e72..d9154cf90ae 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3354,11 +3354,11 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
 	return 0;
 }
 
-static int selinux_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp)
+static int selinux_task_setscheduler(struct task_struct *p)
 {
 	int rc;
 
-	rc = cap_task_setscheduler(p, policy, lp);
+	rc = cap_task_setscheduler(p);
 	if (rc)
 		return rc;
 
@@ -4279,6 +4279,27 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
 	selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
 }
 
+static int selinux_secmark_relabel_packet(u32 sid)
+{
+	const struct task_security_struct *__tsec;
+	u32 tsid;
+
+	__tsec = current_security();
+	tsid = __tsec->sid;
+
+	return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL);
+}
+
+static void selinux_secmark_refcount_inc(void)
+{
+	atomic_inc(&selinux_secmark_refcount);
+}
+
+static void selinux_secmark_refcount_dec(void)
+{
+	atomic_dec(&selinux_secmark_refcount);
+}
+
 static void selinux_req_classify_flow(const struct request_sock *req,
 				      struct flowi *fl)
 {
@@ -5533,6 +5554,9 @@ static struct security_operations selinux_ops = {
 	.inet_conn_request =		selinux_inet_conn_request,
 	.inet_csk_clone =		selinux_inet_csk_clone,
 	.inet_conn_established =	selinux_inet_conn_established,
+	.secmark_relabel_packet =	selinux_secmark_relabel_packet,
+	.secmark_refcount_inc =		selinux_secmark_refcount_inc,
+	.secmark_refcount_dec =		selinux_secmark_refcount_dec,
 	.req_classify_flow =		selinux_req_classify_flow,
 	.tun_dev_create =		selinux_tun_dev_create,
 	.tun_dev_post_create = 		selinux_tun_dev_post_create,
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index b4c9eb4bd6f..8858d2b2d4b 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -17,7 +17,7 @@ struct security_class_mapping secclass_map[] = {
 	  { "compute_av", "compute_create", "compute_member",
 	    "check_context", "load_policy", "compute_relabel",
 	    "compute_user", "setenforce", "setbool", "setsecparam",
-	    "setcheckreqprot", NULL } },
+	    "setcheckreqprot", "read_policy", NULL } },
 	{ "process",
 	  { "fork", "transition", "sigchld", "sigkill",
 	    "sigstop", "signull", "signal", "ptrace", "getsched", "setsched",
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 1f7c2491d3d..671273eb111 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -9,6 +9,7 @@
 #define _SELINUX_SECURITY_H_
 
 #include <linux/magic.h>
+#include <linux/types.h>
 #include "flask.h"
 
 #define SECSID_NULL			0x00000000 /* unspecified SID */
@@ -82,6 +83,8 @@ extern int selinux_policycap_openperm;
 int security_mls_enabled(void);
 
 int security_load_policy(void *data, size_t len);
+int security_read_policy(void **data, ssize_t *len);
+size_t security_policydb_len(void);
 
 int security_policycap_supported(unsigned int req_cap);
 
@@ -191,5 +194,25 @@ static inline int security_netlbl_sid_to_secattr(u32 sid,
 
 const char *security_get_initial_sid_context(u32 sid);
 
+/*
+ * status notifier using mmap interface
+ */
+extern struct page *selinux_kernel_status_page(void);
+
+#define SELINUX_KERNEL_STATUS_VERSION	1
+struct selinux_kernel_status {
+	u32	version;	/* version number of thie structure */
+	u32	sequence;	/* sequence number of seqlock logic */
+	u32	enforcing;	/* current setting of enforcing mode */
+	u32	policyload;	/* times of policy reloaded */
+	u32	deny_unknown;	/* current setting of deny_unknown */
+	/*
+	 * The version > 0 supports above members.
+	 */
+} __attribute__((packed));
+
+extern void selinux_status_update_setenforce(int enforcing);
+extern void selinux_status_update_policyload(int seqno);
+
 #endif /* _SELINUX_SECURITY_H_ */
 
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 79a1bb63566..87e0556bae7 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -68,6 +68,8 @@ static int *bool_pending_values;
 static struct dentry *class_dir;
 static unsigned long last_class_ino;
 
+static char policy_opened;
+
 /* global data for policy capabilities */
 static struct dentry *policycap_dir;
 
@@ -110,6 +112,8 @@ enum sel_inos {
 	SEL_COMPAT_NET,	/* whether to use old compat network packet controls */
 	SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */
 	SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */
+	SEL_STATUS,	/* export current status using mmap() */
+	SEL_POLICY,	/* allow userspace to read the in kernel policy */
 	SEL_INO_NEXT,	/* The next inode number to use */
 };
 
@@ -171,6 +175,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf,
 		if (selinux_enforcing)
 			avc_ss_reset(0);
 		selnl_notify_setenforce(selinux_enforcing);
+		selinux_status_update_setenforce(selinux_enforcing);
 	}
 	length = count;
 out:
@@ -205,6 +210,59 @@ static const struct file_operations sel_handle_unknown_ops = {
 	.llseek		= generic_file_llseek,
 };
 
+static int sel_open_handle_status(struct inode *inode, struct file *filp)
+{
+	struct page    *status = selinux_kernel_status_page();
+
+	if (!status)
+		return -ENOMEM;
+
+	filp->private_data = status;
+
+	return 0;
+}
+
+static ssize_t sel_read_handle_status(struct file *filp, char __user *buf,
+				      size_t count, loff_t *ppos)
+{
+	struct page    *status = filp->private_data;
+
+	BUG_ON(!status);
+
+	return simple_read_from_buffer(buf, count, ppos,
+				       page_address(status),
+				       sizeof(struct selinux_kernel_status));
+}
+
+static int sel_mmap_handle_status(struct file *filp,
+				  struct vm_area_struct *vma)
+{
+	struct page    *status = filp->private_data;
+	unsigned long	size = vma->vm_end - vma->vm_start;
+
+	BUG_ON(!status);
+
+	/* only allows one page from the head */
+	if (vma->vm_pgoff > 0 || size != PAGE_SIZE)
+		return -EIO;
+	/* disallow writable mapping */
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+	/* disallow mprotect() turns it into writable */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       page_to_pfn(status),
+			       size, vma->vm_page_prot);
+}
+
+static const struct file_operations sel_handle_status_ops = {
+	.open		= sel_open_handle_status,
+	.read		= sel_read_handle_status,
+	.mmap		= sel_mmap_handle_status,
+	.llseek		= generic_file_llseek,
+};
+
 #ifdef CONFIG_SECURITY_SELINUX_DISABLE
 static ssize_t sel_write_disable(struct file *file, const char __user *buf,
 				 size_t count, loff_t *ppos)
@@ -296,6 +354,141 @@ static const struct file_operations sel_mls_ops = {
 	.llseek		= generic_file_llseek,
 };
 
+struct policy_load_memory {
+	size_t len;
+	void *data;
+};
+
+static int sel_open_policy(struct inode *inode, struct file *filp)
+{
+	struct policy_load_memory *plm = NULL;
+	int rc;
+
+	BUG_ON(filp->private_data);
+
+	mutex_lock(&sel_mutex);
+
+	rc = task_has_security(current, SECURITY__READ_POLICY);
+	if (rc)
+		goto err;
+
+	rc = -EBUSY;
+	if (policy_opened)
+		goto err;
+
+	rc = -ENOMEM;
+	plm = kzalloc(sizeof(*plm), GFP_KERNEL);
+	if (!plm)
+		goto err;
+
+	if (i_size_read(inode) != security_policydb_len()) {
+		mutex_lock(&inode->i_mutex);
+		i_size_write(inode, security_policydb_len());
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	rc = security_read_policy(&plm->data, &plm->len);
+	if (rc)
+		goto err;
+
+	policy_opened = 1;
+
+	filp->private_data = plm;
+
+	mutex_unlock(&sel_mutex);
+
+	return 0;
+err:
+	mutex_unlock(&sel_mutex);
+
+	if (plm)
+		vfree(plm->data);
+	kfree(plm);
+	return rc;
+}
+
+static int sel_release_policy(struct inode *inode, struct file *filp)
+{
+	struct policy_load_memory *plm = filp->private_data;
+
+	BUG_ON(!plm);
+
+	policy_opened = 0;
+
+	vfree(plm->data);
+	kfree(plm);
+
+	return 0;
+}
+
+static ssize_t sel_read_policy(struct file *filp, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct policy_load_memory *plm = filp->private_data;
+	int ret;
+
+	mutex_lock(&sel_mutex);
+
+	ret = task_has_security(current, SECURITY__READ_POLICY);
+	if (ret)
+		goto out;
+
+	ret = simple_read_from_buffer(buf, count, ppos, plm->data, plm->len);
+out:
+	mutex_unlock(&sel_mutex);
+	return ret;
+}
+
+static int sel_mmap_policy_fault(struct vm_area_struct *vma,
+				 struct vm_fault *vmf)
+{
+	struct policy_load_memory *plm = vma->vm_file->private_data;
+	unsigned long offset;
+	struct page *page;
+
+	if (vmf->flags & (FAULT_FLAG_MKWRITE | FAULT_FLAG_WRITE))
+		return VM_FAULT_SIGBUS;
+
+	offset = vmf->pgoff << PAGE_SHIFT;
+	if (offset >= roundup(plm->len, PAGE_SIZE))
+		return VM_FAULT_SIGBUS;
+
+	page = vmalloc_to_page(plm->data + offset);
+	get_page(page);
+
+	vmf->page = page;
+
+	return 0;
+}
+
+static struct vm_operations_struct sel_mmap_policy_ops = {
+	.fault = sel_mmap_policy_fault,
+	.page_mkwrite = sel_mmap_policy_fault,
+};
+
+int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED) {
+		/* do not allow mprotect to make mapping writable */
+		vma->vm_flags &= ~VM_MAYWRITE;
+
+		if (vma->vm_flags & VM_WRITE)
+			return -EACCES;
+	}
+
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &sel_mmap_policy_ops;
+
+	return 0;
+}
+
+static const struct file_operations sel_policy_ops = {
+	.open		= sel_open_policy,
+	.read		= sel_read_policy,
+	.mmap		= sel_mmap_policy,
+	.release	= sel_release_policy,
+};
+
 static ssize_t sel_write_load(struct file *file, const char __user *buf,
 			      size_t count, loff_t *ppos)
 
@@ -1612,6 +1805,8 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent)
 		[SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR},
 		[SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO},
 		[SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO},
+		[SEL_STATUS] = {"status", &sel_handle_status_ops, S_IRUGO},
+		[SEL_POLICY] = {"policy", &sel_policy_ops, S_IRUSR},
 		/* last one */ {""}
 	};
 	ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files);
diff --git a/security/selinux/ss/Makefile b/security/selinux/ss/Makefile
deleted file mode 100644
index 15d4e62917d..00000000000
--- a/security/selinux/ss/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for building the SELinux security server as part of the kernel tree.
-#
-
-EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include
-obj-y := ss.o
-
-ss-y := ebitmap.o hashtab.o symtab.o sidtab.o avtab.o policydb.o services.o conditional.o mls.o
-
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index 929480c6c43..a3dd9faa19c 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -266,8 +266,8 @@ int avtab_alloc(struct avtab *h, u32 nrules)
 	if (shift > 2)
 		shift = shift - 2;
 	nslot = 1 << shift;
-	if (nslot > MAX_AVTAB_SIZE)
-		nslot = MAX_AVTAB_SIZE;
+	if (nslot > MAX_AVTAB_HASH_BUCKETS)
+		nslot = MAX_AVTAB_HASH_BUCKETS;
 	mask = nslot - 1;
 
 	h->htable = kcalloc(nslot, sizeof(*(h->htable)), GFP_KERNEL);
@@ -501,6 +501,48 @@ bad:
 	goto out;
 }
 
+int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
+{
+	__le16 buf16[4];
+	__le32 buf32[1];
+	int rc;
+
+	buf16[0] = cpu_to_le16(cur->key.source_type);
+	buf16[1] = cpu_to_le16(cur->key.target_type);
+	buf16[2] = cpu_to_le16(cur->key.target_class);
+	buf16[3] = cpu_to_le16(cur->key.specified);
+	rc = put_entry(buf16, sizeof(u16), 4, fp);
+	if (rc)
+		return rc;
+	buf32[0] = cpu_to_le32(cur->datum.data);
+	rc = put_entry(buf32, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+	return 0;
+}
+
+int avtab_write(struct policydb *p, struct avtab *a, void *fp)
+{
+	unsigned int i;
+	int rc = 0;
+	struct avtab_node *cur;
+	__le32 buf[1];
+
+	buf[0] = cpu_to_le32(a->nel);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	for (i = 0; i < a->nslot; i++) {
+		for (cur = a->htable[i]; cur; cur = cur->next) {
+			rc = avtab_write_item(p, cur, fp);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return rc;
+}
 void avtab_cache_init(void)
 {
 	avtab_node_cachep = kmem_cache_create("avtab_node",
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h
index cd4f734e274..dff0c75345c 100644
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -71,6 +71,8 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
 		    void *p);
 
 int avtab_read(struct avtab *a, void *fp, struct policydb *pol);
+int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp);
+int avtab_write(struct policydb *p, struct avtab *a, void *fp);
 
 struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key,
 					  struct avtab_datum *datum);
@@ -85,7 +87,6 @@ void avtab_cache_destroy(void);
 #define MAX_AVTAB_HASH_BITS 11
 #define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS)
 #define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1)
-#define MAX_AVTAB_SIZE MAX_AVTAB_HASH_BUCKETS
 
 #endif	/* _SS_AVTAB_H_ */
 
diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c
index c91e150c308..655fe1c6cc6 100644
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -490,6 +490,129 @@ err:
 	return rc;
 }
 
+int cond_write_bool(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct cond_bool_datum *booldatum = datum;
+	struct policy_data *pd = ptr;
+	void *fp = pd->fp;
+	__le32 buf[3];
+	u32 len;
+	int rc;
+
+	len = strlen(key);
+	buf[0] = cpu_to_le32(booldatum->value);
+	buf[1] = cpu_to_le32(booldatum->state);
+	buf[2] = cpu_to_le32(len);
+	rc = put_entry(buf, sizeof(u32), 3, fp);
+	if (rc)
+		return rc;
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+	return 0;
+}
+
+/*
+ * cond_write_cond_av_list doesn't write out the av_list nodes.
+ * Instead it writes out the key/value pairs from the avtab. This
+ * is necessary because there is no way to uniquely identifying rules
+ * in the avtab so it is not possible to associate individual rules
+ * in the avtab with a conditional without saving them as part of
+ * the conditional. This means that the avtab with the conditional
+ * rules will not be saved but will be rebuilt on policy load.
+ */
+static int cond_write_av_list(struct policydb *p,
+			      struct cond_av_list *list, struct policy_file *fp)
+{
+	__le32 buf[1];
+	struct cond_av_list *cur_list;
+	u32 len;
+	int rc;
+
+	len = 0;
+	for (cur_list = list; cur_list != NULL; cur_list = cur_list->next)
+		len++;
+
+	buf[0] = cpu_to_le32(len);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	if (len == 0)
+		return 0;
+
+	for (cur_list = list; cur_list != NULL; cur_list = cur_list->next) {
+		rc = avtab_write_item(p, cur_list->node, fp);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+int cond_write_node(struct policydb *p, struct cond_node *node,
+		    struct policy_file *fp)
+{
+	struct cond_expr *cur_expr;
+	__le32 buf[2];
+	int rc;
+	u32 len = 0;
+
+	buf[0] = cpu_to_le32(node->cur_state);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next)
+		len++;
+
+	buf[0] = cpu_to_le32(len);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next) {
+		buf[0] = cpu_to_le32(cur_expr->expr_type);
+		buf[1] = cpu_to_le32(cur_expr->bool);
+		rc = put_entry(buf, sizeof(u32), 2, fp);
+		if (rc)
+			return rc;
+	}
+
+	rc = cond_write_av_list(p, node->true_list, fp);
+	if (rc)
+		return rc;
+	rc = cond_write_av_list(p, node->false_list, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+int cond_write_list(struct policydb *p, struct cond_node *list, void *fp)
+{
+	struct cond_node *cur;
+	u32 len;
+	__le32 buf[1];
+	int rc;
+
+	len = 0;
+	for (cur = list; cur != NULL; cur = cur->next)
+		len++;
+	buf[0] = cpu_to_le32(len);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	for (cur = list; cur != NULL; cur = cur->next) {
+		rc = cond_write_node(p, cur, fp);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
 /* Determine whether additional permissions are granted by the conditional
  * av table, and if so, add them to the result
  */
diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h
index 53ddb013ae5..3f209c63529 100644
--- a/security/selinux/ss/conditional.h
+++ b/security/selinux/ss/conditional.h
@@ -69,6 +69,8 @@ int cond_index_bool(void *key, void *datum, void *datap);
 
 int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp);
 int cond_read_list(struct policydb *p, void *fp);
+int cond_write_bool(void *key, void *datum, void *ptr);
+int cond_write_list(struct policydb *p, struct cond_node *list, void *fp);
 
 void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd);
 
diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c
index 04b6145d767..d42951fcbe8 100644
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -22,6 +22,8 @@
 #include "ebitmap.h"
 #include "policydb.h"
 
+#define BITS_PER_U64	(sizeof(u64) * 8)
+
 int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2)
 {
 	struct ebitmap_node *n1, *n2;
@@ -363,10 +365,10 @@ int ebitmap_read(struct ebitmap *e, void *fp)
 	e->highbit = le32_to_cpu(buf[1]);
 	count = le32_to_cpu(buf[2]);
 
-	if (mapunit != sizeof(u64) * 8) {
+	if (mapunit != BITS_PER_U64) {
 		printk(KERN_ERR "SELinux: ebitmap: map size %u does not "
 		       "match my size %Zd (high bit was %d)\n",
-		       mapunit, sizeof(u64) * 8, e->highbit);
+		       mapunit, BITS_PER_U64, e->highbit);
 		goto bad;
 	}
 
@@ -446,3 +448,78 @@ bad:
 	ebitmap_destroy(e);
 	goto out;
 }
+
+int ebitmap_write(struct ebitmap *e, void *fp)
+{
+	struct ebitmap_node *n;
+	u32 count;
+	__le32 buf[3];
+	u64 map;
+	int bit, last_bit, last_startbit, rc;
+
+	buf[0] = cpu_to_le32(BITS_PER_U64);
+
+	count = 0;
+	last_bit = 0;
+	last_startbit = -1;
+	ebitmap_for_each_positive_bit(e, n, bit) {
+		if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
+			count++;
+			last_startbit = rounddown(bit, BITS_PER_U64);
+		}
+		last_bit = roundup(bit + 1, BITS_PER_U64);
+	}
+	buf[1] = cpu_to_le32(last_bit);
+	buf[2] = cpu_to_le32(count);
+
+	rc = put_entry(buf, sizeof(u32), 3, fp);
+	if (rc)
+		return rc;
+
+	map = 0;
+	last_startbit = INT_MIN;
+	ebitmap_for_each_positive_bit(e, n, bit) {
+		if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
+			__le64 buf64[1];
+
+			/* this is the very first bit */
+			if (!map) {
+				last_startbit = rounddown(bit, BITS_PER_U64);
+				map = (u64)1 << (bit - last_startbit);
+				continue;
+			}
+
+			/* write the last node */
+			buf[0] = cpu_to_le32(last_startbit);
+			rc = put_entry(buf, sizeof(u32), 1, fp);
+			if (rc)
+				return rc;
+
+			buf64[0] = cpu_to_le64(map);
+			rc = put_entry(buf64, sizeof(u64), 1, fp);
+			if (rc)
+				return rc;
+
+			/* set up for the next node */
+			map = 0;
+			last_startbit = rounddown(bit, BITS_PER_U64);
+		}
+		map |= (u64)1 << (bit - last_startbit);
+	}
+	/* write the last node */
+	if (map) {
+		__le64 buf64[1];
+
+		/* write the last node */
+		buf[0] = cpu_to_le32(last_startbit);
+		rc = put_entry(buf, sizeof(u32), 1, fp);
+		if (rc)
+			return rc;
+
+		buf64[0] = cpu_to_le64(map);
+		rc = put_entry(buf64, sizeof(u64), 1, fp);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h
index f283b4367f5..1f4e93c2ae8 100644
--- a/security/selinux/ss/ebitmap.h
+++ b/security/selinux/ss/ebitmap.h
@@ -123,6 +123,7 @@ int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
 int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
 void ebitmap_destroy(struct ebitmap *e);
 int ebitmap_read(struct ebitmap *e, void *fp);
+int ebitmap_write(struct ebitmap *e, void *fp);
 
 #ifdef CONFIG_NETLABEL
 int ebitmap_netlbl_export(struct ebitmap *ebmap,
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 3a29704be8c..94f630d93a5 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -37,6 +37,7 @@
 #include "policydb.h"
 #include "conditional.h"
 #include "mls.h"
+#include "services.h"
 
 #define _DEBUG_HASHES
 
@@ -185,9 +186,19 @@ static u32 rangetr_hash(struct hashtab *h, const void *k)
 static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2)
 {
 	const struct range_trans *key1 = k1, *key2 = k2;
-	return (key1->source_type != key2->source_type ||
-		key1->target_type != key2->target_type ||
-		key1->target_class != key2->target_class);
+	int v;
+
+	v = key1->source_type - key2->source_type;
+	if (v)
+		return v;
+
+	v = key1->target_type - key2->target_type;
+	if (v)
+		return v;
+
+	v = key1->target_class - key2->target_class;
+
+	return v;
 }
 
 /*
@@ -1624,11 +1635,11 @@ static int role_bounds_sanity_check(void *key, void *datum, void *datap)
 
 static int type_bounds_sanity_check(void *key, void *datum, void *datap)
 {
-	struct type_datum *upper, *type;
+	struct type_datum *upper;
 	struct policydb *p = datap;
 	int depth = 0;
 
-	upper = type = datum;
+	upper = datum;
 	while (upper->bounds) {
 		if (++depth == POLICYDB_BOUNDS_MAXDEPTH) {
 			printk(KERN_ERR "SELinux: type %s: "
@@ -2306,3 +2317,843 @@ bad:
 	policydb_destroy(p);
 	goto out;
 }
+
+/*
+ * Write a MLS level structure to a policydb binary
+ * representation file.
+ */
+static int mls_write_level(struct mls_level *l, void *fp)
+{
+	__le32 buf[1];
+	int rc;
+
+	buf[0] = cpu_to_le32(l->sens);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	rc = ebitmap_write(&l->cat, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+/*
+ * Write a MLS range structure to a policydb binary
+ * representation file.
+ */
+static int mls_write_range_helper(struct mls_range *r, void *fp)
+{
+	__le32 buf[3];
+	size_t items;
+	int rc, eq;
+
+	eq = mls_level_eq(&r->level[1], &r->level[0]);
+
+	if (eq)
+		items = 2;
+	else
+		items = 3;
+	buf[0] = cpu_to_le32(items-1);
+	buf[1] = cpu_to_le32(r->level[0].sens);
+	if (!eq)
+		buf[2] = cpu_to_le32(r->level[1].sens);
+
+	BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+
+	rc = put_entry(buf, sizeof(u32), items, fp);
+	if (rc)
+		return rc;
+
+	rc = ebitmap_write(&r->level[0].cat, fp);
+	if (rc)
+		return rc;
+	if (!eq) {
+		rc = ebitmap_write(&r->level[1].cat, fp);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static int sens_write(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct level_datum *levdatum = datum;
+	struct policy_data *pd = ptr;
+	void *fp = pd->fp;
+	__le32 buf[2];
+	size_t len;
+	int rc;
+
+	len = strlen(key);
+	buf[0] = cpu_to_le32(len);
+	buf[1] = cpu_to_le32(levdatum->isalias);
+	rc = put_entry(buf, sizeof(u32), 2, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	rc = mls_write_level(levdatum->level, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int cat_write(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct cat_datum *catdatum = datum;
+	struct policy_data *pd = ptr;
+	void *fp = pd->fp;
+	__le32 buf[3];
+	size_t len;
+	int rc;
+
+	len = strlen(key);
+	buf[0] = cpu_to_le32(len);
+	buf[1] = cpu_to_le32(catdatum->value);
+	buf[2] = cpu_to_le32(catdatum->isalias);
+	rc = put_entry(buf, sizeof(u32), 3, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int role_trans_write(struct role_trans *r, void *fp)
+{
+	struct role_trans *tr;
+	u32 buf[3];
+	size_t nel;
+	int rc;
+
+	nel = 0;
+	for (tr = r; tr; tr = tr->next)
+		nel++;
+	buf[0] = cpu_to_le32(nel);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+	for (tr = r; tr; tr = tr->next) {
+		buf[0] = cpu_to_le32(tr->role);
+		buf[1] = cpu_to_le32(tr->type);
+		buf[2] = cpu_to_le32(tr->new_role);
+		rc = put_entry(buf, sizeof(u32), 3, fp);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static int role_allow_write(struct role_allow *r, void *fp)
+{
+	struct role_allow *ra;
+	u32 buf[2];
+	size_t nel;
+	int rc;
+
+	nel = 0;
+	for (ra = r; ra; ra = ra->next)
+		nel++;
+	buf[0] = cpu_to_le32(nel);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+	for (ra = r; ra; ra = ra->next) {
+		buf[0] = cpu_to_le32(ra->role);
+		buf[1] = cpu_to_le32(ra->new_role);
+		rc = put_entry(buf, sizeof(u32), 2, fp);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
+/*
+ * Write a security context structure
+ * to a policydb binary representation file.
+ */
+static int context_write(struct policydb *p, struct context *c,
+			 void *fp)
+{
+	int rc;
+	__le32 buf[3];
+
+	buf[0] = cpu_to_le32(c->user);
+	buf[1] = cpu_to_le32(c->role);
+	buf[2] = cpu_to_le32(c->type);
+
+	rc = put_entry(buf, sizeof(u32), 3, fp);
+	if (rc)
+		return rc;
+
+	rc = mls_write_range_helper(&c->range, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+/*
+ * The following *_write functions are used to
+ * write the symbol data to a policy database
+ * binary representation file.
+ */
+
+static int perm_write(void *vkey, void *datum, void *fp)
+{
+	char *key = vkey;
+	struct perm_datum *perdatum = datum;
+	__le32 buf[2];
+	size_t len;
+	int rc;
+
+	len = strlen(key);
+	buf[0] = cpu_to_le32(len);
+	buf[1] = cpu_to_le32(perdatum->value);
+	rc = put_entry(buf, sizeof(u32), 2, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int common_write(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct common_datum *comdatum = datum;
+	struct policy_data *pd = ptr;
+	void *fp = pd->fp;
+	__le32 buf[4];
+	size_t len;
+	int rc;
+
+	len = strlen(key);
+	buf[0] = cpu_to_le32(len);
+	buf[1] = cpu_to_le32(comdatum->value);
+	buf[2] = cpu_to_le32(comdatum->permissions.nprim);
+	buf[3] = cpu_to_le32(comdatum->permissions.table->nel);
+	rc = put_entry(buf, sizeof(u32), 4, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	rc = hashtab_map(comdatum->permissions.table, perm_write, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int write_cons_helper(struct policydb *p, struct constraint_node *node,
+			     void *fp)
+{
+	struct constraint_node *c;
+	struct constraint_expr *e;
+	__le32 buf[3];
+	u32 nel;
+	int rc;
+
+	for (c = node; c; c = c->next) {
+		nel = 0;
+		for (e = c->expr; e; e = e->next)
+			nel++;
+		buf[0] = cpu_to_le32(c->permissions);
+		buf[1] = cpu_to_le32(nel);
+		rc = put_entry(buf, sizeof(u32), 2, fp);
+		if (rc)
+			return rc;
+		for (e = c->expr; e; e = e->next) {
+			buf[0] = cpu_to_le32(e->expr_type);
+			buf[1] = cpu_to_le32(e->attr);
+			buf[2] = cpu_to_le32(e->op);
+			rc = put_entry(buf, sizeof(u32), 3, fp);
+			if (rc)
+				return rc;
+
+			switch (e->expr_type) {
+			case CEXPR_NAMES:
+				rc = ebitmap_write(&e->names, fp);
+				if (rc)
+					return rc;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int class_write(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct class_datum *cladatum = datum;
+	struct policy_data *pd = ptr;
+	void *fp = pd->fp;
+	struct policydb *p = pd->p;
+	struct constraint_node *c;
+	__le32 buf[6];
+	u32 ncons;
+	size_t len, len2;
+	int rc;
+
+	len = strlen(key);
+	if (cladatum->comkey)
+		len2 = strlen(cladatum->comkey);
+	else
+		len2 = 0;
+
+	ncons = 0;
+	for (c = cladatum->constraints; c; c = c->next)
+		ncons++;
+
+	buf[0] = cpu_to_le32(len);
+	buf[1] = cpu_to_le32(len2);
+	buf[2] = cpu_to_le32(cladatum->value);
+	buf[3] = cpu_to_le32(cladatum->permissions.nprim);
+	if (cladatum->permissions.table)
+		buf[4] = cpu_to_le32(cladatum->permissions.table->nel);
+	else
+		buf[4] = 0;
+	buf[5] = cpu_to_le32(ncons);
+	rc = put_entry(buf, sizeof(u32), 6, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	if (cladatum->comkey) {
+		rc = put_entry(cladatum->comkey, 1, len2, fp);
+		if (rc)
+			return rc;
+	}
+
+	rc = hashtab_map(cladatum->permissions.table, perm_write, fp);
+	if (rc)
+		return rc;
+
+	rc = write_cons_helper(p, cladatum->constraints, fp);
+	if (rc)
+		return rc;
+
+	/* write out the validatetrans rule */
+	ncons = 0;
+	for (c = cladatum->validatetrans; c; c = c->next)
+		ncons++;
+
+	buf[0] = cpu_to_le32(ncons);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	rc = write_cons_helper(p, cladatum->validatetrans, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int role_write(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct role_datum *role = datum;
+	struct policy_data *pd = ptr;
+	void *fp = pd->fp;
+	struct policydb *p = pd->p;
+	__le32 buf[3];
+	size_t items, len;
+	int rc;
+
+	len = strlen(key);
+	items = 0;
+	buf[items++] = cpu_to_le32(len);
+	buf[items++] = cpu_to_le32(role->value);
+	if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
+		buf[items++] = cpu_to_le32(role->bounds);
+
+	BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+
+	rc = put_entry(buf, sizeof(u32), items, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	rc = ebitmap_write(&role->dominates, fp);
+	if (rc)
+		return rc;
+
+	rc = ebitmap_write(&role->types, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int type_write(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct type_datum *typdatum = datum;
+	struct policy_data *pd = ptr;
+	struct policydb *p = pd->p;
+	void *fp = pd->fp;
+	__le32 buf[4];
+	int rc;
+	size_t items, len;
+
+	len = strlen(key);
+	items = 0;
+	buf[items++] = cpu_to_le32(len);
+	buf[items++] = cpu_to_le32(typdatum->value);
+	if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) {
+		u32 properties = 0;
+
+		if (typdatum->primary)
+			properties |= TYPEDATUM_PROPERTY_PRIMARY;
+
+		if (typdatum->attribute)
+			properties |= TYPEDATUM_PROPERTY_ATTRIBUTE;
+
+		buf[items++] = cpu_to_le32(properties);
+		buf[items++] = cpu_to_le32(typdatum->bounds);
+	} else {
+		buf[items++] = cpu_to_le32(typdatum->primary);
+	}
+	BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+	rc = put_entry(buf, sizeof(u32), items, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int user_write(void *vkey, void *datum, void *ptr)
+{
+	char *key = vkey;
+	struct user_datum *usrdatum = datum;
+	struct policy_data *pd = ptr;
+	struct policydb *p = pd->p;
+	void *fp = pd->fp;
+	__le32 buf[3];
+	size_t items, len;
+	int rc;
+
+	len = strlen(key);
+	items = 0;
+	buf[items++] = cpu_to_le32(len);
+	buf[items++] = cpu_to_le32(usrdatum->value);
+	if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
+		buf[items++] = cpu_to_le32(usrdatum->bounds);
+	BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+	rc = put_entry(buf, sizeof(u32), items, fp);
+	if (rc)
+		return rc;
+
+	rc = put_entry(key, 1, len, fp);
+	if (rc)
+		return rc;
+
+	rc = ebitmap_write(&usrdatum->roles, fp);
+	if (rc)
+		return rc;
+
+	rc = mls_write_range_helper(&usrdatum->range, fp);
+	if (rc)
+		return rc;
+
+	rc = mls_write_level(&usrdatum->dfltlevel, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int (*write_f[SYM_NUM]) (void *key, void *datum,
+				void *datap) =
+{
+	common_write,
+	class_write,
+	role_write,
+	type_write,
+	user_write,
+	cond_write_bool,
+	sens_write,
+	cat_write,
+};
+
+static int ocontext_write(struct policydb *p, struct policydb_compat_info *info,
+			  void *fp)
+{
+	unsigned int i, j, rc;
+	size_t nel, len;
+	__le32 buf[3];
+	u32 nodebuf[8];
+	struct ocontext *c;
+	for (i = 0; i < info->ocon_num; i++) {
+		nel = 0;
+		for (c = p->ocontexts[i]; c; c = c->next)
+			nel++;
+		buf[0] = cpu_to_le32(nel);
+		rc = put_entry(buf, sizeof(u32), 1, fp);
+		if (rc)
+			return rc;
+		for (c = p->ocontexts[i]; c; c = c->next) {
+			switch (i) {
+			case OCON_ISID:
+				buf[0] = cpu_to_le32(c->sid[0]);
+				rc = put_entry(buf, sizeof(u32), 1, fp);
+				if (rc)
+					return rc;
+				rc = context_write(p, &c->context[0], fp);
+				if (rc)
+					return rc;
+				break;
+			case OCON_FS:
+			case OCON_NETIF:
+				len = strlen(c->u.name);
+				buf[0] = cpu_to_le32(len);
+				rc = put_entry(buf, sizeof(u32), 1, fp);
+				if (rc)
+					return rc;
+				rc = put_entry(c->u.name, 1, len, fp);
+				if (rc)
+					return rc;
+				rc = context_write(p, &c->context[0], fp);
+				if (rc)
+					return rc;
+				rc = context_write(p, &c->context[1], fp);
+				if (rc)
+					return rc;
+				break;
+			case OCON_PORT:
+				buf[0] = cpu_to_le32(c->u.port.protocol);
+				buf[1] = cpu_to_le32(c->u.port.low_port);
+				buf[2] = cpu_to_le32(c->u.port.high_port);
+				rc = put_entry(buf, sizeof(u32), 3, fp);
+				if (rc)
+					return rc;
+				rc = context_write(p, &c->context[0], fp);
+				if (rc)
+					return rc;
+				break;
+			case OCON_NODE:
+				nodebuf[0] = c->u.node.addr; /* network order */
+				nodebuf[1] = c->u.node.mask; /* network order */
+				rc = put_entry(nodebuf, sizeof(u32), 2, fp);
+				if (rc)
+					return rc;
+				rc = context_write(p, &c->context[0], fp);
+				if (rc)
+					return rc;
+				break;
+			case OCON_FSUSE:
+				buf[0] = cpu_to_le32(c->v.behavior);
+				len = strlen(c->u.name);
+				buf[1] = cpu_to_le32(len);
+				rc = put_entry(buf, sizeof(u32), 2, fp);
+				if (rc)
+					return rc;
+				rc = put_entry(c->u.name, 1, len, fp);
+				if (rc)
+					return rc;
+				rc = context_write(p, &c->context[0], fp);
+				if (rc)
+					return rc;
+				break;
+			case OCON_NODE6:
+				for (j = 0; j < 4; j++)
+					nodebuf[j] = c->u.node6.addr[j]; /* network order */
+				for (j = 0; j < 4; j++)
+					nodebuf[j + 4] = c->u.node6.mask[j]; /* network order */
+				rc = put_entry(nodebuf, sizeof(u32), 8, fp);
+				if (rc)
+					return rc;
+				rc = context_write(p, &c->context[0], fp);
+				if (rc)
+					return rc;
+				break;
+			}
+		}
+	}
+	return 0;
+}
+
+static int genfs_write(struct policydb *p, void *fp)
+{
+	struct genfs *genfs;
+	struct ocontext *c;
+	size_t len;
+	__le32 buf[1];
+	int rc;
+
+	len = 0;
+	for (genfs = p->genfs; genfs; genfs = genfs->next)
+		len++;
+	buf[0] = cpu_to_le32(len);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+	for (genfs = p->genfs; genfs; genfs = genfs->next) {
+		len = strlen(genfs->fstype);
+		buf[0] = cpu_to_le32(len);
+		rc = put_entry(buf, sizeof(u32), 1, fp);
+		if (rc)
+			return rc;
+		rc = put_entry(genfs->fstype, 1, len, fp);
+		if (rc)
+			return rc;
+		len = 0;
+		for (c = genfs->head; c; c = c->next)
+			len++;
+		buf[0] = cpu_to_le32(len);
+		rc = put_entry(buf, sizeof(u32), 1, fp);
+		if (rc)
+			return rc;
+		for (c = genfs->head; c; c = c->next) {
+			len = strlen(c->u.name);
+			buf[0] = cpu_to_le32(len);
+			rc = put_entry(buf, sizeof(u32), 1, fp);
+			if (rc)
+				return rc;
+			rc = put_entry(c->u.name, 1, len, fp);
+			if (rc)
+				return rc;
+			buf[0] = cpu_to_le32(c->v.sclass);
+			rc = put_entry(buf, sizeof(u32), 1, fp);
+			if (rc)
+				return rc;
+			rc = context_write(p, &c->context[0], fp);
+			if (rc)
+				return rc;
+		}
+	}
+	return 0;
+}
+
+static int range_count(void *key, void *data, void *ptr)
+{
+	int *cnt = ptr;
+	*cnt = *cnt + 1;
+
+	return 0;
+}
+
+static int range_write_helper(void *key, void *data, void *ptr)
+{
+	__le32 buf[2];
+	struct range_trans *rt = key;
+	struct mls_range *r = data;
+	struct policy_data *pd = ptr;
+	void *fp = pd->fp;
+	struct policydb *p = pd->p;
+	int rc;
+
+	buf[0] = cpu_to_le32(rt->source_type);
+	buf[1] = cpu_to_le32(rt->target_type);
+	rc = put_entry(buf, sizeof(u32), 2, fp);
+	if (rc)
+		return rc;
+	if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) {
+		buf[0] = cpu_to_le32(rt->target_class);
+		rc = put_entry(buf, sizeof(u32), 1, fp);
+		if (rc)
+			return rc;
+	}
+	rc = mls_write_range_helper(r, fp);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int range_write(struct policydb *p, void *fp)
+{
+	size_t nel;
+	__le32 buf[1];
+	int rc;
+	struct policy_data pd;
+
+	pd.p = p;
+	pd.fp = fp;
+
+	/* count the number of entries in the hashtab */
+	nel = 0;
+	rc = hashtab_map(p->range_tr, range_count, &nel);
+	if (rc)
+		return rc;
+
+	buf[0] = cpu_to_le32(nel);
+	rc = put_entry(buf, sizeof(u32), 1, fp);
+	if (rc)
+		return rc;
+
+	/* actually write all of the entries */
+	rc = hashtab_map(p->range_tr, range_write_helper, &pd);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+/*
+ * Write the configuration data in a policy database
+ * structure to a policy database binary representation
+ * file.
+ */
+int policydb_write(struct policydb *p, void *fp)
+{
+	unsigned int i, num_syms;
+	int rc;
+	__le32 buf[4];
+	u32 config;
+	size_t len;
+	struct policydb_compat_info *info;
+
+	/*
+	 * refuse to write policy older than compressed avtab
+	 * to simplify the writer.  There are other tests dropped
+	 * since we assume this throughout the writer code.  Be
+	 * careful if you ever try to remove this restriction
+	 */
+	if (p->policyvers < POLICYDB_VERSION_AVTAB) {
+		printk(KERN_ERR "SELinux: refusing to write policy version %d."
+		       "  Because it is less than version %d\n", p->policyvers,
+		       POLICYDB_VERSION_AVTAB);
+		return -EINVAL;
+	}
+
+	config = 0;
+	if (p->mls_enabled)
+		config |= POLICYDB_CONFIG_MLS;
+
+	if (p->reject_unknown)
+		config |= REJECT_UNKNOWN;
+	if (p->allow_unknown)
+		config |= ALLOW_UNKNOWN;
+
+	/* Write the magic number and string identifiers. */
+	buf[0] = cpu_to_le32(POLICYDB_MAGIC);
+	len = strlen(POLICYDB_STRING);
+	buf[1] = cpu_to_le32(len);
+	rc = put_entry(buf, sizeof(u32), 2, fp);
+	if (rc)
+		return rc;
+	rc = put_entry(POLICYDB_STRING, 1, len, fp);
+	if (rc)
+		return rc;
+
+	/* Write the version, config, and table sizes. */
+	info = policydb_lookup_compat(p->policyvers);
+	if (!info) {
+		printk(KERN_ERR "SELinux: compatibility lookup failed for policy "
+		    "version %d", p->policyvers);
+		return rc;
+	}
+
+	buf[0] = cpu_to_le32(p->policyvers);
+	buf[1] = cpu_to_le32(config);
+	buf[2] = cpu_to_le32(info->sym_num);
+	buf[3] = cpu_to_le32(info->ocon_num);
+
+	rc = put_entry(buf, sizeof(u32), 4, fp);
+	if (rc)
+		return rc;
+
+	if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
+		rc = ebitmap_write(&p->policycaps, fp);
+		if (rc)
+			return rc;
+	}
+
+	if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
+		rc = ebitmap_write(&p->permissive_map, fp);
+		if (rc)
+			return rc;
+	}
+
+	num_syms = info->sym_num;
+	for (i = 0; i < num_syms; i++) {
+		struct policy_data pd;
+
+		pd.fp = fp;
+		pd.p = p;
+
+		buf[0] = cpu_to_le32(p->symtab[i].nprim);
+		buf[1] = cpu_to_le32(p->symtab[i].table->nel);
+
+		rc = put_entry(buf, sizeof(u32), 2, fp);
+		if (rc)
+			return rc;
+		rc = hashtab_map(p->symtab[i].table, write_f[i], &pd);
+		if (rc)
+			return rc;
+	}
+
+	rc = avtab_write(p, &p->te_avtab, fp);
+	if (rc)
+		return rc;
+
+	rc = cond_write_list(p, p->cond_list, fp);
+	if (rc)
+		return rc;
+
+	rc = role_trans_write(p->role_tr, fp);
+	if (rc)
+		return rc;
+
+	rc = role_allow_write(p->role_allow, fp);
+	if (rc)
+		return rc;
+
+	rc = ocontext_write(p, info, fp);
+	if (rc)
+		return rc;
+
+	rc = genfs_write(p, fp);
+	if (rc)
+		return rc;
+
+	rc = range_write(p, fp);
+	if (rc)
+		return rc;
+
+	for (i = 0; i < p->p_types.nprim; i++) {
+		struct ebitmap *e = flex_array_get(p->type_attr_map_array, i);
+
+		BUG_ON(!e);
+		rc = ebitmap_write(e, fp);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index 310e94442cb..95d3d7de361 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -254,6 +254,9 @@ struct policydb {
 
 	struct ebitmap permissive_map;
 
+	/* length of this policy when it was loaded */
+	size_t len;
+
 	unsigned int policyvers;
 
 	unsigned int reject_unknown : 1;
@@ -270,6 +273,7 @@ extern int policydb_class_isvalid(struct policydb *p, unsigned int class);
 extern int policydb_type_isvalid(struct policydb *p, unsigned int type);
 extern int policydb_role_isvalid(struct policydb *p, unsigned int role);
 extern int policydb_read(struct policydb *p, void *fp);
+extern int policydb_write(struct policydb *p, void *fp);
 
 #define PERM_SYMTAB_SIZE 32
 
@@ -290,6 +294,11 @@ struct policy_file {
 	size_t len;
 };
 
+struct policy_data {
+	struct policydb *p;
+	void *fp;
+};
+
 static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
 {
 	if (bytes > fp->len)
@@ -301,6 +310,17 @@ static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
 	return 0;
 }
 
+static inline int put_entry(void *buf, size_t bytes, int num, struct policy_file *fp)
+{
+	size_t len = bytes * num;
+
+	memcpy(fp->data, buf, len);
+	fp->data += len;
+	fp->len -= len;
+
+	return 0;
+}
+
 extern u16 string_to_security_class(struct policydb *p, const char *name);
 extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name);
 
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 9ea2feca3cd..223c1ff6ef2 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -51,6 +51,7 @@
 #include <linux/mutex.h>
 #include <linux/selinux.h>
 #include <linux/flex_array.h>
+#include <linux/vmalloc.h>
 #include <net/netlabel.h>
 
 #include "flask.h"
@@ -991,7 +992,8 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
 {
 	char *scontextp;
 
-	*scontext = NULL;
+	if (scontext)
+		*scontext = NULL;
 	*scontext_len = 0;
 
 	if (context->len) {
@@ -1008,6 +1010,9 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
 	*scontext_len += strlen(policydb.p_type_val_to_name[context->type - 1]) + 1;
 	*scontext_len += mls_compute_context_len(context);
 
+	if (!scontext)
+		return 0;
+
 	/* Allocate space for the context; caller must free this space. */
 	scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
 	if (!scontextp)
@@ -1047,7 +1052,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
 	struct context *context;
 	int rc = 0;
 
-	*scontext = NULL;
+	if (scontext)
+		*scontext = NULL;
 	*scontext_len  = 0;
 
 	if (!ss_initialized) {
@@ -1055,6 +1061,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
 			char *scontextp;
 
 			*scontext_len = strlen(initial_sid_to_string[sid]) + 1;
+			if (!scontext)
+				goto out;
 			scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
 			if (!scontextp) {
 				rc = -ENOMEM;
@@ -1769,6 +1777,7 @@ int security_load_policy(void *data, size_t len)
 			return rc;
 		}
 
+		policydb.len = len;
 		rc = selinux_set_mapping(&policydb, secclass_map,
 					 &current_mapping,
 					 &current_mapping_size);
@@ -1791,6 +1800,7 @@ int security_load_policy(void *data, size_t len)
 		selinux_complete_init();
 		avc_ss_reset(seqno);
 		selnl_notify_policyload(seqno);
+		selinux_status_update_policyload(seqno);
 		selinux_netlbl_cache_invalidate();
 		selinux_xfrm_notify_policyload();
 		return 0;
@@ -1804,6 +1814,7 @@ int security_load_policy(void *data, size_t len)
 	if (rc)
 		return rc;
 
+	newpolicydb.len = len;
 	/* If switching between different policy types, log MLS status */
 	if (policydb.mls_enabled && !newpolicydb.mls_enabled)
 		printk(KERN_INFO "SELinux: Disabling MLS support...\n");
@@ -1870,6 +1881,7 @@ int security_load_policy(void *data, size_t len)
 
 	avc_ss_reset(seqno);
 	selnl_notify_policyload(seqno);
+	selinux_status_update_policyload(seqno);
 	selinux_netlbl_cache_invalidate();
 	selinux_xfrm_notify_policyload();
 
@@ -1883,6 +1895,17 @@ err:
 
 }
 
+size_t security_policydb_len(void)
+{
+	size_t len;
+
+	read_lock(&policy_rwlock);
+	len = policydb.len;
+	read_unlock(&policy_rwlock);
+
+	return len;
+}
+
 /**
  * security_port_sid - Obtain the SID for a port.
  * @protocol: protocol number
@@ -2374,6 +2397,7 @@ out:
 	if (!rc) {
 		avc_ss_reset(seqno);
 		selnl_notify_policyload(seqno);
+		selinux_status_update_policyload(seqno);
 		selinux_xfrm_notify_policyload();
 	}
 	return rc;
@@ -3129,3 +3153,38 @@ netlbl_sid_to_secattr_failure:
 	return rc;
 }
 #endif /* CONFIG_NETLABEL */
+
+/**
+ * security_read_policy - read the policy.
+ * @data: binary policy data
+ * @len: length of data in bytes
+ *
+ */
+int security_read_policy(void **data, ssize_t *len)
+{
+	int rc;
+	struct policy_file fp;
+
+	if (!ss_initialized)
+		return -EINVAL;
+
+	*len = security_policydb_len();
+
+	*data = vmalloc_user(*len);
+	if (!*data)
+		return -ENOMEM;
+
+	fp.data = *data;
+	fp.len = *len;
+
+	read_lock(&policy_rwlock);
+	rc = policydb_write(&policydb, &fp);
+	read_unlock(&policy_rwlock);
+
+	if (rc)
+		return rc;
+
+	*len = (unsigned long)fp.data - (unsigned long)*data;
+	return 0;
+
+}
diff --git a/security/selinux/ss/status.c b/security/selinux/ss/status.c
new file mode 100644
index 00000000000..d982365f9d1
--- /dev/null
+++ b/security/selinux/ss/status.c
@@ -0,0 +1,126 @@
+/*
+ * mmap based event notifications for SELinux
+ *
+ * Author: KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * Copyright (C) 2010 NEC corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include "avc.h"
+#include "services.h"
+
+/*
+ * The selinux_status_page shall be exposed to userspace applications
+ * using mmap interface on /selinux/status.
+ * It enables to notify applications a few events that will cause reset
+ * of userspace access vector without context switching.
+ *
+ * The selinux_kernel_status structure on the head of status page is
+ * protected from concurrent accesses using seqlock logic, so userspace
+ * application should reference the status page according to the seqlock
+ * logic.
+ *
+ * Typically, application checks status->sequence at the head of access
+ * control routine. If it is odd-number, kernel is updating the status,
+ * so please wait for a moment. If it is changed from the last sequence
+ * number, it means something happen, so application will reset userspace
+ * avc, if needed.
+ * In most cases, application shall confirm the kernel status is not
+ * changed without any system call invocations.
+ */
+static struct page *selinux_status_page;
+static DEFINE_MUTEX(selinux_status_lock);
+
+/*
+ * selinux_kernel_status_page
+ *
+ * It returns a reference to selinux_status_page. If the status page is
+ * not allocated yet, it also tries to allocate it at the first time.
+ */
+struct page *selinux_kernel_status_page(void)
+{
+	struct selinux_kernel_status   *status;
+	struct page		       *result = NULL;
+
+	mutex_lock(&selinux_status_lock);
+	if (!selinux_status_page) {
+		selinux_status_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+
+		if (selinux_status_page) {
+			status = page_address(selinux_status_page);
+
+			status->version = SELINUX_KERNEL_STATUS_VERSION;
+			status->sequence = 0;
+			status->enforcing = selinux_enforcing;
+			/*
+			 * NOTE: the next policyload event shall set
+			 * a positive value on the status->policyload,
+			 * although it may not be 1, but never zero.
+			 * So, application can know it was updated.
+			 */
+			status->policyload = 0;
+			status->deny_unknown = !security_get_allow_unknown();
+		}
+	}
+	result = selinux_status_page;
+	mutex_unlock(&selinux_status_lock);
+
+	return result;
+}
+
+/*
+ * selinux_status_update_setenforce
+ *
+ * It updates status of the current enforcing/permissive mode.
+ */
+void selinux_status_update_setenforce(int enforcing)
+{
+	struct selinux_kernel_status   *status;
+
+	mutex_lock(&selinux_status_lock);
+	if (selinux_status_page) {
+		status = page_address(selinux_status_page);
+
+		status->sequence++;
+		smp_wmb();
+
+		status->enforcing = enforcing;
+
+		smp_wmb();
+		status->sequence++;
+	}
+	mutex_unlock(&selinux_status_lock);
+}
+
+/*
+ * selinux_status_update_policyload
+ *
+ * It updates status of the times of policy reloaded, and current
+ * setting of deny_unknown.
+ */
+void selinux_status_update_policyload(int seqno)
+{
+	struct selinux_kernel_status   *status;
+
+	mutex_lock(&selinux_status_lock);
+	if (selinux_status_page) {
+		status = page_address(selinux_status_page);
+
+		status->sequence++;
+		smp_wmb();
+
+		status->policyload = seqno;
+		status->deny_unknown = !security_get_allow_unknown();
+
+		smp_wmb();
+		status->sequence++;
+	}
+	mutex_unlock(&selinux_status_lock);
+}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index c448d57ae2b..bc39f4067af 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1281,12 +1281,11 @@ static int smack_task_getioprio(struct task_struct *p)
  *
  * Return 0 if read access is permitted
  */
-static int smack_task_setscheduler(struct task_struct *p, int policy,
-				   struct sched_param *lp)
+static int smack_task_setscheduler(struct task_struct *p)
 {
 	int rc;
 
-	rc = cap_task_setscheduler(p, policy, lp);
+	rc = cap_task_setscheduler(p);
 	if (rc == 0)
 		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
@@ -3005,7 +3004,8 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
 	char *sp = smack_from_secid(secid);
 
-	*secdata = sp;
+	if (secdata)
+		*secdata = sp;
 	*seclen = strlen(sp);
 	return 0;
 }
diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index c668b447c72..7556315c197 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -768,8 +768,10 @@ static bool tomoyo_select_one(struct tomoyo_io_buffer *head, const char *data)
 		return true; /* Do nothing if open(O_WRONLY). */
 	memset(&head->r, 0, sizeof(head->r));
 	head->r.print_this_domain_only = true;
-	head->r.eof = !domain;
-	head->r.domain = &domain->list;
+	if (domain)
+		head->r.domain = &domain->list;
+	else
+		head->r.eof = 1;
 	tomoyo_io_printf(head, "# select %s\n", data);
 	if (domain && domain->is_deleted)
 		tomoyo_io_printf(head, "# This is a deleted domain.\n");
@@ -2051,13 +2053,22 @@ void tomoyo_check_profile(void)
 		const u8 profile = domain->profile;
 		if (tomoyo_profile_ptr[profile])
 			continue;
+		printk(KERN_ERR "You need to define profile %u before using it.\n",
+		       profile);
+		printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+		       "for more information.\n");
 		panic("Profile %u (used by '%s') not defined.\n",
 		      profile, domain->domainname->name);
 	}
 	tomoyo_read_unlock(idx);
-	if (tomoyo_profile_version != 20090903)
+	if (tomoyo_profile_version != 20090903) {
+		printk(KERN_ERR "You need to install userland programs for "
+		       "TOMOYO 2.3 and initialize policy configuration.\n");
+		printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+		       "for more information.\n");
 		panic("Profile version %u is not supported.\n",
 		      tomoyo_profile_version);
+	}
 	printk(KERN_INFO "TOMOYO: 2.3.0\n");
 	printk(KERN_INFO "Mandatory Access Control activated.\n");
 }
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index 5164a655c39..b2c63309a65 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -8,7 +8,7 @@ perf-annotate - Read perf.data (created by perf record) and display annotated co
 SYNOPSIS
 --------
 [verse]
-'perf annotate' [-i <file> | --input=file] symbol_name
+'perf annotate' [-i <file> | --input=file] [symbol_name]
 
 DESCRIPTION
 -----------
@@ -24,6 +24,13 @@ OPTIONS
 --input=::
         Input file name. (default: perf.data)
 
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface Use of --tui requires a tty, if one is not
+	present, as when piping to other commands, the stdio interface is
+	used. This interfaces starts by centering on the line with more
+	samples, TAB/UNTAB cycles thru the lines with more samples.
+
 SEE ALSO
 --------
-linkperf:perf-record[1]
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index abfabe9147a..12052c9ed0b 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -65,6 +65,13 @@ OPTIONS
 		 the tree is considered as a new profiled object. +
 	Default: fractal,0.5.
 
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface, that is integrated with annotate and allows
+        zooming into DSOs or threads, among other features. Use of --tui
+	requires a tty, if one is not present, as when piping to other
+	commands, the stdio interface is used.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 1950e19af1c..d1db0f676a4 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -313,6 +313,9 @@ TEST_PROGRAMS =
 
 SCRIPT_SH += perf-archive.sh
 
+grep-libs = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
 #
 # No Perl scripts right now:
 #
@@ -588,14 +591,17 @@ endif
 ifdef NO_LIBPERL
 	BASIC_CFLAGS += -DNO_LIBPERL
 else
-	PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null`
+       PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
+       PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
+       PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
 	PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
 	FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
 	ifneq ($(call try-cc,$(SOURCE_PERL_EMBED),$(FLAGS_PERL_EMBED)),y)
 		BASIC_CFLAGS += -DNO_LIBPERL
 	else
-		ALL_LDFLAGS += $(PERL_EMBED_LDOPTS)
+               ALL_LDFLAGS += $(PERL_EMBED_LDFLAGS)
+               EXTLIBS += $(PERL_EMBED_LIBADD)
 		LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
 		LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
 	endif
@@ -604,13 +610,16 @@ endif
 ifdef NO_LIBPYTHON
 	BASIC_CFLAGS += -DNO_LIBPYTHON
 else
-	PYTHON_EMBED_LDOPTS = `python-config --ldflags 2>/dev/null`
+       PYTHON_EMBED_LDOPTS = $(shell python-config --ldflags 2>/dev/null)
+       PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
+       PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
 	PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null`
 	FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 	ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y)
 		BASIC_CFLAGS += -DNO_LIBPYTHON
 	else
-		ALL_LDFLAGS += $(PYTHON_EMBED_LDOPTS)
+               ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
+               EXTLIBS += $(PYTHON_EMBED_LIBADD)
 		LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
 		LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
 	endif
@@ -653,6 +662,15 @@ else
 	endif
 endif
 
+
+ifdef NO_STRLCPY
+	BASIC_CFLAGS += -DNO_STRLCPY
+else
+	ifneq ($(call try-cc,$(SOURCE_STRLCPY),),y)
+		BASIC_CFLAGS += -DNO_STRLCPY
+	endif
+endif
+
 ifndef CC_LD_DYNPATH
 	ifdef NO_R_TO_GCC_LINKER
 		# Some gcc does not accept and pass -R to the linker to specify
@@ -910,8 +928,8 @@ $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
 		$(ALL_CFLAGS) -c $(filter %.c,$^) -o $@
 
 $(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(OUTPUT)perf.o \
-		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \
+               $(BUILTIN_OBJS) $(LIBS) -o $@
 
 $(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 1478dc64bf1..6d5604d8df9 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -28,7 +28,7 @@
 
 static char		const *input_name = "perf.data";
 
-static bool		force;
+static bool		force, use_tui, use_stdio;
 
 static bool		full_paths;
 
@@ -321,7 +321,7 @@ static int hist_entry__tty_annotate(struct hist_entry *he)
 
 static void hists__find_annotations(struct hists *self)
 {
-	struct rb_node *first = rb_first(&self->entries), *nd = first;
+	struct rb_node *nd = rb_first(&self->entries), *next;
 	int key = KEY_RIGHT;
 
 	while (nd) {
@@ -343,20 +343,19 @@ find_next:
 
 		if (use_browser > 0) {
 			key = hist_entry__tui_annotate(he);
-			if (is_exit_key(key))
-				break;
 			switch (key) {
 			case KEY_RIGHT:
-			case '\t':
-				nd = rb_next(nd);
+				next = rb_next(nd);
 				break;
 			case KEY_LEFT:
-				if (nd == first)
-					continue;
-				nd = rb_prev(nd);
-			default:
+				next = rb_prev(nd);
 				break;
+			default:
+				return;
 			}
+
+			if (next != NULL)
+				nd = next;
 		} else {
 			hist_entry__tty_annotate(he);
 			nd = rb_next(nd);
@@ -428,6 +427,8 @@ static const struct option options[] = {
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+	OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
 	OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
@@ -443,6 +444,11 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
 {
 	argc = parse_options(argc, argv, options, annotate_usage, 0);
 
+	if (use_stdio)
+		use_browser = 0;
+	else if (use_tui)
+		use_browser = 1;
+
 	setup_browser();
 
 	symbol_conf.priv_size = sizeof(struct sym_priv);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 55fc1f46892..5de405d4523 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -32,7 +32,7 @@
 
 static char		const *input_name = "perf.data";
 
-static bool		force;
+static bool		force, use_tui, use_stdio;
 static bool		hide_unresolved;
 static bool		dont_use_callchains;
 
@@ -107,7 +107,8 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 		goto out_free_syms;
 	err = 0;
 	if (symbol_conf.use_callchain) {
-		err = append_chain(he->callchain, data->callchain, syms, data->period);
+		err = callchain_append(he->callchain, data->callchain, syms,
+				       data->period);
 		if (err)
 			goto out_free_syms;
 	}
@@ -450,6 +451,8 @@ static const struct option options[] = {
 		    "Show per-thread event counters"),
 	OPT_STRING(0, "pretty", &pretty_printing_style, "key",
 		   "pretty printing style key: normal raw"),
+	OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+	OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): pid, comm, dso, symbol, parent"),
 	OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
@@ -482,8 +485,15 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 {
 	argc = parse_options(argc, argv, options, report_usage, 0);
 
+	if (use_stdio)
+		use_browser = 0;
+	else if (use_tui)
+		use_browser = 1;
+
 	if (strcmp(input_name, "-") != 0)
 		setup_browser();
+	else
+		use_browser = 0;
 	/*
 	 * Only in the newt browser we are doing integrated annotation,
 	 * so don't allocate extra space that won't be used in the stdio
diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak
index 7a7b6085905..b253db634f0 100644
--- a/tools/perf/feature-tests.mak
+++ b/tools/perf/feature-tests.mak
@@ -110,6 +110,17 @@ int main(void)
 }
 endef
 
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+	strlcpy(NULL, NULL, 0);
+	return 0;
+}
+endef
+
 # try-cc
 # Usage: option = $(call try-cc, source-to-build, cc-options)
 try-cc = $(shell sh -c						  \
diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record
new file mode 100644
index 00000000000..d931a828126
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-record
@@ -0,0 +1,8 @@
+#!/bin/bash
+perf record -a -e net:net_dev_xmit -e net:net_dev_queue		\
+		-e net:netif_receive_skb -e net:netif_rx		\
+		-e skb:consume_skb -e skb:kfree_skb			\
+		-e skb:skb_copy_datagram_iovec -e napi:napi_poll	\
+		-e irq:irq_handler_entry -e irq:irq_handler_exit	\
+		-e irq:softirq_entry -e irq:softirq_exit		\
+		-e irq:softirq_raise $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report
new file mode 100644
index 00000000000..c3d0a638123
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: display a process of packet and processing time
+# args: [tx] [rx] [dev=] [debug]
+
+perf trace -s ~/libexec/perf-core/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py
new file mode 100644
index 00000000000..9aa0a32972e
--- /dev/null
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -0,0 +1,464 @@
+# Display a process of packets and processed time.
+# It helps us to investigate networking or network device.
+#
+# options
+# tx: show only tx chart
+# rx: show only rx chart
+# dev=: show only thing related to specified device
+# debug: work with debug mode. It shows buffer status.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+all_event_list = []; # insert all tracepoint event related with this script
+irq_dic = {}; # key is cpu and value is a list which stacks irqs
+              # which raise NET_RX softirq
+net_rx_dic = {}; # key is cpu and value include time of NET_RX softirq-entry
+		 # and a list which stacks receive
+receive_hunk_list = []; # a list which include a sequence of receive events
+rx_skb_list = []; # received packet list for matching
+		       # skb_copy_datagram_iovec
+
+buffer_budget = 65536; # the budget of rx_skb_list, tx_queue_list and
+		       # tx_xmit_list
+of_count_rx_skb_list = 0; # overflow count
+
+tx_queue_list = []; # list of packets which pass through dev_queue_xmit
+of_count_tx_queue_list = 0; # overflow count
+
+tx_xmit_list = [];  # list of packets which pass through dev_hard_start_xmit
+of_count_tx_xmit_list = 0; # overflow count
+
+tx_free_list = [];  # list of packets which is freed
+
+# options
+show_tx = 0;
+show_rx = 0;
+dev = 0; # store a name of device specified by option "dev="
+debug = 0;
+
+# indices of event_info tuple
+EINFO_IDX_NAME=   0
+EINFO_IDX_CONTEXT=1
+EINFO_IDX_CPU=    2
+EINFO_IDX_TIME=   3
+EINFO_IDX_PID=    4
+EINFO_IDX_COMM=   5
+
+# Calculate a time interval(msec) from src(nsec) to dst(nsec)
+def diff_msec(src, dst):
+	return (dst - src) / 1000000.0
+
+# Display a process of transmitting a packet
+def print_transmit(hunk):
+	if dev != 0 and hunk['dev'].find(dev) < 0:
+		return
+	print "%7s %5d %6d.%06dsec %12.3fmsec      %12.3fmsec" % \
+		(hunk['dev'], hunk['len'],
+		nsecs_secs(hunk['queue_t']),
+		nsecs_nsecs(hunk['queue_t'])/1000,
+		diff_msec(hunk['queue_t'], hunk['xmit_t']),
+		diff_msec(hunk['xmit_t'], hunk['free_t']))
+
+# Format for displaying rx packet processing
+PF_IRQ_ENTRY= "  irq_entry(+%.3fmsec irq=%d:%s)"
+PF_SOFT_ENTRY="  softirq_entry(+%.3fmsec)"
+PF_NAPI_POLL= "  napi_poll_exit(+%.3fmsec %s)"
+PF_JOINT=     "         |"
+PF_WJOINT=    "         |            |"
+PF_NET_RECV=  "         |---netif_receive_skb(+%.3fmsec skb=%x len=%d)"
+PF_NET_RX=    "         |---netif_rx(+%.3fmsec skb=%x)"
+PF_CPY_DGRAM= "         |      skb_copy_datagram_iovec(+%.3fmsec %d:%s)"
+PF_KFREE_SKB= "         |      kfree_skb(+%.3fmsec location=%x)"
+PF_CONS_SKB=  "         |      consume_skb(+%.3fmsec)"
+
+# Display a process of received packets and interrputs associated with
+# a NET_RX softirq
+def print_receive(hunk):
+	show_hunk = 0
+	irq_list = hunk['irq_list']
+	cpu = irq_list[0]['cpu']
+	base_t = irq_list[0]['irq_ent_t']
+	# check if this hunk should be showed
+	if dev != 0:
+		for i in range(len(irq_list)):
+			if irq_list[i]['name'].find(dev) >= 0:
+				show_hunk = 1
+				break
+	else:
+		show_hunk = 1
+	if show_hunk == 0:
+		return
+
+	print "%d.%06dsec cpu=%d" % \
+		(nsecs_secs(base_t), nsecs_nsecs(base_t)/1000, cpu)
+	for i in range(len(irq_list)):
+		print PF_IRQ_ENTRY % \
+			(diff_msec(base_t, irq_list[i]['irq_ent_t']),
+			irq_list[i]['irq'], irq_list[i]['name'])
+		print PF_JOINT
+		irq_event_list = irq_list[i]['event_list']
+		for j in range(len(irq_event_list)):
+			irq_event = irq_event_list[j]
+			if irq_event['event'] == 'netif_rx':
+				print PF_NET_RX % \
+					(diff_msec(base_t, irq_event['time']),
+					irq_event['skbaddr'])
+				print PF_JOINT
+	print PF_SOFT_ENTRY % \
+		diff_msec(base_t, hunk['sirq_ent_t'])
+	print PF_JOINT
+	event_list = hunk['event_list']
+	for i in range(len(event_list)):
+		event = event_list[i]
+		if event['event_name'] == 'napi_poll':
+			print PF_NAPI_POLL % \
+			    (diff_msec(base_t, event['event_t']), event['dev'])
+			if i == len(event_list) - 1:
+				print ""
+			else:
+				print PF_JOINT
+		else:
+			print PF_NET_RECV % \
+			    (diff_msec(base_t, event['event_t']), event['skbaddr'],
+				event['len'])
+			if 'comm' in event.keys():
+				print PF_WJOINT
+				print PF_CPY_DGRAM % \
+					(diff_msec(base_t, event['comm_t']),
+					event['pid'], event['comm'])
+			elif 'handle' in event.keys():
+				print PF_WJOINT
+				if event['handle'] == "kfree_skb":
+					print PF_KFREE_SKB % \
+						(diff_msec(base_t,
+						event['comm_t']),
+						event['location'])
+				elif event['handle'] == "consume_skb":
+					print PF_CONS_SKB % \
+						diff_msec(base_t,
+							event['comm_t'])
+			print PF_JOINT
+
+def trace_begin():
+	global show_tx
+	global show_rx
+	global dev
+	global debug
+
+	for i in range(len(sys.argv)):
+		if i == 0:
+			continue
+		arg = sys.argv[i]
+		if arg == 'tx':
+			show_tx = 1
+		elif arg =='rx':
+			show_rx = 1
+		elif arg.find('dev=',0, 4) >= 0:
+			dev = arg[4:]
+		elif arg == 'debug':
+			debug = 1
+	if show_tx == 0  and show_rx == 0:
+		show_tx = 1
+		show_rx = 1
+
+def trace_end():
+	# order all events in time
+	all_event_list.sort(lambda a,b :cmp(a[EINFO_IDX_TIME],
+					    b[EINFO_IDX_TIME]))
+	# process all events
+	for i in range(len(all_event_list)):
+		event_info = all_event_list[i]
+		name = event_info[EINFO_IDX_NAME]
+		if name == 'irq__softirq_exit':
+			handle_irq_softirq_exit(event_info)
+		elif name == 'irq__softirq_entry':
+			handle_irq_softirq_entry(event_info)
+		elif name == 'irq__softirq_raise':
+			handle_irq_softirq_raise(event_info)
+		elif name == 'irq__irq_handler_entry':
+			handle_irq_handler_entry(event_info)
+		elif name == 'irq__irq_handler_exit':
+			handle_irq_handler_exit(event_info)
+		elif name == 'napi__napi_poll':
+			handle_napi_poll(event_info)
+		elif name == 'net__netif_receive_skb':
+			handle_netif_receive_skb(event_info)
+		elif name == 'net__netif_rx':
+			handle_netif_rx(event_info)
+		elif name == 'skb__skb_copy_datagram_iovec':
+			handle_skb_copy_datagram_iovec(event_info)
+		elif name == 'net__net_dev_queue':
+			handle_net_dev_queue(event_info)
+		elif name == 'net__net_dev_xmit':
+			handle_net_dev_xmit(event_info)
+		elif name == 'skb__kfree_skb':
+			handle_kfree_skb(event_info)
+		elif name == 'skb__consume_skb':
+			handle_consume_skb(event_info)
+	# display receive hunks
+	if show_rx:
+		for i in range(len(receive_hunk_list)):
+			print_receive(receive_hunk_list[i])
+	# display transmit hunks
+	if show_tx:
+		print "   dev    len      Qdisc        " \
+			"       netdevice             free"
+		for i in range(len(tx_free_list)):
+			print_transmit(tx_free_list[i])
+	if debug:
+		print "debug buffer status"
+		print "----------------------------"
+		print "xmit Qdisc:remain:%d overflow:%d" % \
+			(len(tx_queue_list), of_count_tx_queue_list)
+		print "xmit netdevice:remain:%d overflow:%d" % \
+			(len(tx_xmit_list), of_count_tx_xmit_list)
+		print "receive:remain:%d overflow:%d" % \
+			(len(rx_skb_list), of_count_rx_skb_list)
+
+# called from perf, when it finds a correspoinding event
+def irq__softirq_entry(name, context, cpu, sec, nsec, pid, comm, vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+	all_event_list.append(event_info)
+
+def irq__softirq_exit(name, context, cpu, sec, nsec, pid, comm, vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+	all_event_list.append(event_info)
+
+def irq__softirq_raise(name, context, cpu, sec, nsec, pid, comm, vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+	all_event_list.append(event_info)
+
+def irq__irq_handler_entry(name, context, cpu, sec, nsec, pid, comm,
+			irq, irq_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			irq, irq_name)
+	all_event_list.append(event_info)
+
+def irq__irq_handler_exit(name, context, cpu, sec, nsec, pid, comm, irq, ret):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, irq, ret)
+	all_event_list.append(event_info)
+
+def napi__napi_poll(name, context, cpu, sec, nsec, pid, comm, napi, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			napi, dev_name)
+	all_event_list.append(event_info)
+
+def net__netif_receive_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+			skblen, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, dev_name)
+	all_event_list.append(event_info)
+
+def net__netif_rx(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+			skblen, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, dev_name)
+	all_event_list.append(event_info)
+
+def net__net_dev_queue(name, context, cpu, sec, nsec, pid, comm,
+			skbaddr, skblen, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, dev_name)
+	all_event_list.append(event_info)
+
+def net__net_dev_xmit(name, context, cpu, sec, nsec, pid, comm,
+			skbaddr, skblen, rc, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, rc ,dev_name)
+	all_event_list.append(event_info)
+
+def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm,
+			skbaddr, protocol, location):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, protocol, location)
+	all_event_list.append(event_info)
+
+def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr)
+	all_event_list.append(event_info)
+
+def skb__skb_copy_datagram_iovec(name, context, cpu, sec, nsec, pid, comm,
+	skbaddr, skblen):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen)
+	all_event_list.append(event_info)
+
+def handle_irq_handler_entry(event_info):
+	(name, context, cpu, time, pid, comm, irq, irq_name) = event_info
+	if cpu not in irq_dic.keys():
+		irq_dic[cpu] = []
+	irq_record = {'irq':irq, 'name':irq_name, 'cpu':cpu, 'irq_ent_t':time}
+	irq_dic[cpu].append(irq_record)
+
+def handle_irq_handler_exit(event_info):
+	(name, context, cpu, time, pid, comm, irq, ret) = event_info
+	if cpu not in irq_dic.keys():
+		return
+	irq_record = irq_dic[cpu].pop()
+	if irq != irq_record['irq']:
+		return
+	irq_record.update({'irq_ext_t':time})
+	# if an irq doesn't include NET_RX softirq, drop.
+	if 'event_list' in irq_record.keys():
+		irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_raise(event_info):
+	(name, context, cpu, time, pid, comm, vec) = event_info
+	if cpu not in irq_dic.keys() \
+	or len(irq_dic[cpu]) == 0:
+		return
+	irq_record = irq_dic[cpu].pop()
+	if 'event_list' in irq_record.keys():
+		irq_event_list = irq_record['event_list']
+	else:
+		irq_event_list = []
+	irq_event_list.append({'time':time, 'event':'sirq_raise'})
+	irq_record.update({'event_list':irq_event_list})
+	irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_entry(event_info):
+	(name, context, cpu, time, pid, comm, vec) = event_info
+	net_rx_dic[cpu] = {'sirq_ent_t':time, 'event_list':[]}
+
+def handle_irq_softirq_exit(event_info):
+	(name, context, cpu, time, pid, comm, vec) = event_info
+	irq_list = []
+	event_list = 0
+	if cpu in irq_dic.keys():
+		irq_list = irq_dic[cpu]
+		del irq_dic[cpu]
+	if cpu in net_rx_dic.keys():
+		sirq_ent_t = net_rx_dic[cpu]['sirq_ent_t']
+		event_list = net_rx_dic[cpu]['event_list']
+		del net_rx_dic[cpu]
+	if irq_list == [] or event_list == 0:
+		return
+	rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
+		    'irq_list':irq_list, 'event_list':event_list}
+	# merge information realted to a NET_RX softirq
+	receive_hunk_list.append(rec_data)
+
+def handle_napi_poll(event_info):
+	(name, context, cpu, time, pid, comm, napi, dev_name) = event_info
+	if cpu in net_rx_dic.keys():
+		event_list = net_rx_dic[cpu]['event_list']
+		rec_data = {'event_name':'napi_poll',
+				'dev':dev_name, 'event_t':time}
+		event_list.append(rec_data)
+
+def handle_netif_rx(event_info):
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, dev_name) = event_info
+	if cpu not in irq_dic.keys() \
+	or len(irq_dic[cpu]) == 0:
+		return
+	irq_record = irq_dic[cpu].pop()
+	if 'event_list' in irq_record.keys():
+		irq_event_list = irq_record['event_list']
+	else:
+		irq_event_list = []
+	irq_event_list.append({'time':time, 'event':'netif_rx',
+		'skbaddr':skbaddr, 'skblen':skblen, 'dev_name':dev_name})
+	irq_record.update({'event_list':irq_event_list})
+	irq_dic[cpu].append(irq_record)
+
+def handle_netif_receive_skb(event_info):
+	global of_count_rx_skb_list
+
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, dev_name) = event_info
+	if cpu in net_rx_dic.keys():
+		rec_data = {'event_name':'netif_receive_skb',
+			    'event_t':time, 'skbaddr':skbaddr, 'len':skblen}
+		event_list = net_rx_dic[cpu]['event_list']
+		event_list.append(rec_data)
+		rx_skb_list.insert(0, rec_data)
+		if len(rx_skb_list) > buffer_budget:
+			rx_skb_list.pop()
+			of_count_rx_skb_list += 1
+
+def handle_net_dev_queue(event_info):
+	global of_count_tx_queue_list
+
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, dev_name) = event_info
+	skb = {'dev':dev_name, 'skbaddr':skbaddr, 'len':skblen, 'queue_t':time}
+	tx_queue_list.insert(0, skb)
+	if len(tx_queue_list) > buffer_budget:
+		tx_queue_list.pop()
+		of_count_tx_queue_list += 1
+
+def handle_net_dev_xmit(event_info):
+	global of_count_tx_xmit_list
+
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, rc, dev_name) = event_info
+	if rc == 0: # NETDEV_TX_OK
+		for i in range(len(tx_queue_list)):
+			skb = tx_queue_list[i]
+			if skb['skbaddr'] == skbaddr:
+				skb['xmit_t'] = time
+				tx_xmit_list.insert(0, skb)
+				del tx_queue_list[i]
+				if len(tx_xmit_list) > buffer_budget:
+					tx_xmit_list.pop()
+					of_count_tx_xmit_list += 1
+				return
+
+def handle_kfree_skb(event_info):
+	(name, context, cpu, time, pid, comm,
+		skbaddr, protocol, location) = event_info
+	for i in range(len(tx_queue_list)):
+		skb = tx_queue_list[i]
+		if skb['skbaddr'] == skbaddr:
+			del tx_queue_list[i]
+			return
+	for i in range(len(tx_xmit_list)):
+		skb = tx_xmit_list[i]
+		if skb['skbaddr'] == skbaddr:
+			skb['free_t'] = time
+			tx_free_list.append(skb)
+			del tx_xmit_list[i]
+			return
+	for i in range(len(rx_skb_list)):
+		rec_data = rx_skb_list[i]
+		if rec_data['skbaddr'] == skbaddr:
+			rec_data.update({'handle':"kfree_skb",
+					'comm':comm, 'pid':pid, 'comm_t':time})
+			del rx_skb_list[i]
+			return
+
+def handle_consume_skb(event_info):
+	(name, context, cpu, time, pid, comm, skbaddr) = event_info
+	for i in range(len(tx_xmit_list)):
+		skb = tx_xmit_list[i]
+		if skb['skbaddr'] == skbaddr:
+			skb['free_t'] = time
+			tx_free_list.append(skb)
+			del tx_xmit_list[i]
+			return
+
+def handle_skb_copy_datagram_iovec(event_info):
+	(name, context, cpu, time, pid, comm, skbaddr, skblen) = event_info
+	for i in range(len(rx_skb_list)):
+		rec_data = rx_skb_list[i]
+		if skbaddr == rec_data['skbaddr']:
+			rec_data.update({'handle':"skb_copy_datagram_iovec",
+					'comm':comm, 'pid':pid, 'comm_t':time})
+			del rx_skb_list[i]
+			return
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 27e9ebe4076..a7729797fd9 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -82,6 +82,8 @@ extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2
 extern char *perf_pathdup(const char *fmt, ...)
 	__attribute__((format (printf, 1, 2)));
 
+#ifdef NO_STRLCPY
 extern size_t strlcpy(char *dest, const char *src, size_t size);
+#endif
 
 #endif /* __PERF_CACHE_H */
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index f231f43424d..e12d539417b 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -28,6 +28,9 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event)
 #define chain_for_each_child(child, parent)	\
 	list_for_each_entry(child, &parent->children, brothers)
 
+#define chain_for_each_child_safe(child, next, parent)	\
+	list_for_each_entry_safe(child, next, &parent->children, brothers)
+
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
 		    enum chain_mode mode)
@@ -86,10 +89,10 @@ __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
  * sort them by hit
  */
 static void
-sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
+sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root,
 		u64 min_hit, struct callchain_param *param __used)
 {
-	__sort_chain_flat(rb_root, node, min_hit);
+	__sort_chain_flat(rb_root, &root->node, min_hit);
 }
 
 static void __sort_chain_graph_abs(struct callchain_node *node,
@@ -108,11 +111,11 @@ static void __sort_chain_graph_abs(struct callchain_node *node,
 }
 
 static void
-sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_node *chain_root,
+sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root,
 		     u64 min_hit, struct callchain_param *param __used)
 {
-	__sort_chain_graph_abs(chain_root, min_hit);
-	rb_root->rb_node = chain_root->rb_root.rb_node;
+	__sort_chain_graph_abs(&chain_root->node, min_hit);
+	rb_root->rb_node = chain_root->node.rb_root.rb_node;
 }
 
 static void __sort_chain_graph_rel(struct callchain_node *node,
@@ -133,11 +136,11 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
 }
 
 static void
-sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_node *chain_root,
+sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root,
 		     u64 min_hit __used, struct callchain_param *param)
 {
-	__sort_chain_graph_rel(chain_root, param->min_percent / 100.0);
-	rb_root->rb_node = chain_root->rb_root.rb_node;
+	__sort_chain_graph_rel(&chain_root->node, param->min_percent / 100.0);
+	rb_root->rb_node = chain_root->node.rb_root.rb_node;
 }
 
 int register_callchain_param(struct callchain_param *param)
@@ -284,19 +287,18 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
 }
 
 static int
-__append_chain(struct callchain_node *root, struct resolved_chain *chain,
-	       unsigned int start, u64 period);
+append_chain(struct callchain_node *root, struct resolved_chain *chain,
+	     unsigned int start, u64 period);
 
 static void
-__append_chain_children(struct callchain_node *root,
-			struct resolved_chain *chain,
-			unsigned int start, u64 period)
+append_chain_children(struct callchain_node *root, struct resolved_chain *chain,
+		      unsigned int start, u64 period)
 {
 	struct callchain_node *rnode;
 
 	/* lookup in childrens */
 	chain_for_each_child(rnode, root) {
-		unsigned int ret = __append_chain(rnode, chain, start, period);
+		unsigned int ret = append_chain(rnode, chain, start, period);
 
 		if (!ret)
 			goto inc_children_hit;
@@ -309,8 +311,8 @@ inc_children_hit:
 }
 
 static int
-__append_chain(struct callchain_node *root, struct resolved_chain *chain,
-	       unsigned int start, u64 period)
+append_chain(struct callchain_node *root, struct resolved_chain *chain,
+	     unsigned int start, u64 period)
 {
 	struct callchain_list *cnode;
 	unsigned int i = start;
@@ -357,7 +359,7 @@ __append_chain(struct callchain_node *root, struct resolved_chain *chain,
 	}
 
 	/* We match the node and still have a part remaining */
-	__append_chain_children(root, chain, i, period);
+	append_chain_children(root, chain, i, period);
 
 	return 0;
 }
@@ -380,8 +382,8 @@ static void filter_context(struct ip_callchain *old, struct resolved_chain *new,
 }
 
 
-int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-		 struct map_symbol *syms, u64 period)
+int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
+		     struct map_symbol *syms, u64 period)
 {
 	struct resolved_chain *filtered;
 
@@ -398,9 +400,65 @@ int append_chain(struct callchain_node *root, struct ip_callchain *chain,
 	if (!filtered->nr)
 		goto end;
 
-	__append_chain_children(root, filtered, 0, period);
+	append_chain_children(&root->node, filtered, 0, period);
+
+	if (filtered->nr > root->max_depth)
+		root->max_depth = filtered->nr;
 end:
 	free(filtered);
 
 	return 0;
 }
+
+static int
+merge_chain_branch(struct callchain_node *dst, struct callchain_node *src,
+		   struct resolved_chain *chain)
+{
+	struct callchain_node *child, *next_child;
+	struct callchain_list *list, *next_list;
+	int old_pos = chain->nr;
+	int err = 0;
+
+	list_for_each_entry_safe(list, next_list, &src->val, list) {
+		chain->ips[chain->nr].ip = list->ip;
+		chain->ips[chain->nr].ms = list->ms;
+		chain->nr++;
+		list_del(&list->list);
+		free(list);
+	}
+
+	if (src->hit)
+		append_chain_children(dst, chain, 0, src->hit);
+
+	chain_for_each_child_safe(child, next_child, src) {
+		err = merge_chain_branch(dst, child, chain);
+		if (err)
+			break;
+
+		list_del(&child->brothers);
+		free(child);
+	}
+
+	chain->nr = old_pos;
+
+	return err;
+}
+
+int callchain_merge(struct callchain_root *dst, struct callchain_root *src)
+{
+	struct resolved_chain *chain;
+	int err;
+
+	chain = malloc(sizeof(*chain) +
+		       src->max_depth * sizeof(struct resolved_ip));
+	if (!chain)
+		return -ENOMEM;
+
+	chain->nr = 0;
+
+	err = merge_chain_branch(&dst->node, &src->node, chain);
+
+	free(chain);
+
+	return err;
+}
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 6de4313924f..c15fb8c24ad 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -26,9 +26,14 @@ struct callchain_node {
 	u64			children_hit;
 };
 
+struct callchain_root {
+	u64			max_depth;
+	struct callchain_node	node;
+};
+
 struct callchain_param;
 
-typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_node *,
+typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_root *,
 				 u64, struct callchain_param *);
 
 struct callchain_param {
@@ -44,15 +49,16 @@ struct callchain_list {
 	struct list_head	list;
 };
 
-static inline void callchain_init(struct callchain_node *node)
+static inline void callchain_init(struct callchain_root *root)
 {
-	INIT_LIST_HEAD(&node->brothers);
-	INIT_LIST_HEAD(&node->children);
-	INIT_LIST_HEAD(&node->val);
+	INIT_LIST_HEAD(&root->node.brothers);
+	INIT_LIST_HEAD(&root->node.children);
+	INIT_LIST_HEAD(&root->node.val);
 
-	node->children_hit = 0;
-	node->parent = NULL;
-	node->hit = 0;
+	root->node.parent = NULL;
+	root->node.hit = 0;
+	root->node.children_hit = 0;
+	root->max_depth = 0;
 }
 
 static inline u64 cumul_hits(struct callchain_node *node)
@@ -61,8 +67,9 @@ static inline u64 cumul_hits(struct callchain_node *node)
 }
 
 int register_callchain_param(struct callchain_param *param);
-int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-		 struct map_symbol *syms, u64 period);
+int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
+		     struct map_symbol *syms, u64 period);
+int callchain_merge(struct callchain_root *dst, struct callchain_root *src);
 
 bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event);
 #endif	/* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index be22ae6ef05..2022e874099 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -87,7 +87,7 @@ static void hist_entry__add_cpumode_period(struct hist_entry *self,
 
 static struct hist_entry *hist_entry__new(struct hist_entry *template)
 {
-	size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_node) : 0;
+	size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_root) : 0;
 	struct hist_entry *self = malloc(sizeof(*self) + callchain_size);
 
 	if (self != NULL) {
@@ -226,6 +226,8 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 
 		if (!cmp) {
 			iter->period += he->period;
+			if (symbol_conf.use_callchain)
+				callchain_merge(iter->callchain, he->callchain);
 			hist_entry__free(he);
 			return false;
 		}
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c
index 58a470d036d..bd749771142 100644
--- a/tools/perf/util/path.c
+++ b/tools/perf/util/path.c
@@ -22,6 +22,7 @@ static const char *get_perf_dir(void)
 	return ".";
 }
 
+#ifdef NO_STRLCPY
 size_t strlcpy(char *dest, const char *src, size_t size)
 {
 	size_t ret = strlen(src);
@@ -33,7 +34,7 @@ size_t strlcpy(char *dest, const char *src, size_t size)
 	}
 	return ret;
 }
-
+#endif
 
 static char *get_pathname(void)
 {
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 46e531d09e8..0b91053a7d1 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -70,7 +70,7 @@ struct hist_entry {
 		struct hist_entry *pair;
 		struct rb_root	  sorted_chain;
 	};
-	struct callchain_node	callchain[0];
+	struct callchain_root	callchain[0];
 };
 
 enum sort_type {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index b2f5ae97f33..b39f499e575 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -388,6 +388,20 @@ size_t dso__fprintf_buildid(struct dso *self, FILE *fp)
 	return fprintf(fp, "%s", sbuild_id);
 }
 
+size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+	struct symbol_name_rb_node *pos;
+
+	for (nd = rb_first(&self->symbol_names[type]); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
+		fprintf(fp, "%s\n", pos->sym.name);
+	}
+
+	return ret;
+}
+
 size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp)
 {
 	struct rb_node *nd;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index ea95c2756f0..038f2201ee0 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -182,6 +182,7 @@ size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp);
 size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits);
 
 size_t dso__fprintf_buildid(struct dso *self, FILE *fp);
+size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp);
 size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp);
 
 enum dso_origin {
diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c
index 66f2d583d8c..6d0df809a2e 100644
--- a/tools/perf/util/ui/browser.c
+++ b/tools/perf/util/ui/browser.c
@@ -1,16 +1,6 @@
-#define _GNU_SOURCE
-#include <stdio.h>
-#undef _GNU_SOURCE
-/*
- * slang versions <= 2.0.6 have a "#if HAVE_LONG_LONG" that breaks
- * the build if it isn't defined. Use the equivalent one that glibc
- * has on features.h.
- */
-#include <features.h>
-#ifndef HAVE_LONG_LONG
-#define HAVE_LONG_LONG __GLIBC_HAVE_LONG_LONG
-#endif
 #include <slang.h>
+#include "libslang.h"
+#include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/rbtree.h>
 #include <stdlib.h>
@@ -19,17 +9,9 @@
 #include "helpline.h"
 #include "../color.h"
 #include "../util.h"
+#include <stdio.h>
 
-#if SLANG_VERSION < 20104
-#define sltt_set_color(obj, name, fg, bg) \
-	SLtt_set_color(obj,(char *)name, (char *)fg, (char *)bg)
-#else
-#define sltt_set_color SLtt_set_color
-#endif
-
-newtComponent newt_form__new(void);
-
-int ui_browser__percent_color(double percent, bool current)
+static int ui_browser__percent_color(double percent, bool current)
 {
 	if (current)
 		return HE_COLORSET_SELECTED;
@@ -40,6 +22,23 @@ int ui_browser__percent_color(double percent, bool current)
 	return HE_COLORSET_NORMAL;
 }
 
+void ui_browser__set_color(struct ui_browser *self __used, int color)
+{
+	SLsmg_set_color(color);
+}
+
+void ui_browser__set_percent_color(struct ui_browser *self,
+				   double percent, bool current)
+{
+	 int color = ui_browser__percent_color(percent, current);
+	 ui_browser__set_color(self, color);
+}
+
+void ui_browser__gotorc(struct ui_browser *self, int y, int x)
+{
+	SLsmg_gotorc(self->y + y, self->x + x);
+}
+
 void ui_browser__list_head_seek(struct ui_browser *self, off_t offset, int whence)
 {
 	struct list_head *head = self->entries;
@@ -111,7 +110,7 @@ unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self)
 	nd = self->top;
 
 	while (nd != NULL) {
-		SLsmg_gotorc(self->y + row, self->x);
+		ui_browser__gotorc(self, row, 0);
 		self->write(self, nd, row);
 		if (++row == self->height)
 			break;
@@ -131,13 +130,10 @@ void ui_browser__refresh_dimensions(struct ui_browser *self)
 	int cols, rows;
 	newtGetScreenSize(&cols, &rows);
 
-	if (self->width > cols - 4)
-		self->width = cols - 4;
-	self->height = rows - 5;
-	if (self->height > self->nr_entries)
-		self->height = self->nr_entries;
-	self->y  = (rows - self->height) / 2;
-	self->x = (cols - self->width) / 2;
+	self->width = cols - 1;
+	self->height = rows - 2;
+	self->y = 1;
+	self->x = 0;
 }
 
 void ui_browser__reset_index(struct ui_browser *self)
@@ -146,34 +142,48 @@ void ui_browser__reset_index(struct ui_browser *self)
 	self->seek(self, 0, SEEK_SET);
 }
 
+void ui_browser__add_exit_key(struct ui_browser *self, int key)
+{
+	newtFormAddHotKey(self->form, key);
+}
+
+void ui_browser__add_exit_keys(struct ui_browser *self, int keys[])
+{
+	int i = 0;
+
+	while (keys[i] && i < 64) {
+		ui_browser__add_exit_key(self, keys[i]);
+		++i;
+	}
+}
+
 int ui_browser__show(struct ui_browser *self, const char *title,
 		     const char *helpline, ...)
 {
 	va_list ap;
+	int keys[] = { NEWT_KEY_UP, NEWT_KEY_DOWN, NEWT_KEY_PGUP,
+		       NEWT_KEY_PGDN, NEWT_KEY_HOME, NEWT_KEY_END, ' ',
+		       NEWT_KEY_LEFT, NEWT_KEY_ESCAPE, 'q', CTRL('c'), 0 };
 
-	if (self->form != NULL) {
+	if (self->form != NULL)
 		newtFormDestroy(self->form);
-		newtPopWindow();
-	}
+
 	ui_browser__refresh_dimensions(self);
-	newtCenteredWindow(self->width, self->height, title);
-	self->form = newt_form__new();
+	self->form = newtForm(NULL, NULL, 0);
 	if (self->form == NULL)
 		return -1;
 
-	self->sb = newtVerticalScrollbar(self->width, 0, self->height,
+	self->sb = newtVerticalScrollbar(self->width, 1, self->height,
 					 HE_COLORSET_NORMAL,
 					 HE_COLORSET_SELECTED);
 	if (self->sb == NULL)
 		return -1;
 
-	newtFormAddHotKey(self->form, NEWT_KEY_UP);
-	newtFormAddHotKey(self->form, NEWT_KEY_DOWN);
-	newtFormAddHotKey(self->form, NEWT_KEY_PGUP);
-	newtFormAddHotKey(self->form, NEWT_KEY_PGDN);
-	newtFormAddHotKey(self->form, NEWT_KEY_HOME);
-	newtFormAddHotKey(self->form, NEWT_KEY_END);
-	newtFormAddHotKey(self->form, ' ');
+	SLsmg_gotorc(0, 0);
+	ui_browser__set_color(self, NEWT_COLORSET_ROOT);
+	slsmg_write_nstring(title, self->width);
+
+	ui_browser__add_exit_keys(self, keys);
 	newtFormAddComponent(self->form, self->sb);
 
 	va_start(ap, helpline);
@@ -185,7 +195,6 @@ int ui_browser__show(struct ui_browser *self, const char *title,
 void ui_browser__hide(struct ui_browser *self)
 {
 	newtFormDestroy(self->form);
-	newtPopWindow();
 	self->form = NULL;
 	ui_helpline__pop();
 }
@@ -196,28 +205,28 @@ int ui_browser__refresh(struct ui_browser *self)
 
 	newtScrollbarSet(self->sb, self->index, self->nr_entries - 1);
 	row = self->refresh(self);
-	SLsmg_set_color(HE_COLORSET_NORMAL);
+	ui_browser__set_color(self, HE_COLORSET_NORMAL);
 	SLsmg_fill_region(self->y + row, self->x,
 			  self->height - row, self->width, ' ');
 
 	return 0;
 }
 
-int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es)
+int ui_browser__run(struct ui_browser *self)
 {
+	struct newtExitStruct es;
+
 	if (ui_browser__refresh(self) < 0)
 		return -1;
 
 	while (1) {
 		off_t offset;
 
-		newtFormRun(self->form, es);
+		newtFormRun(self->form, &es);
 
-		if (es->reason != NEWT_EXIT_HOTKEY)
+		if (es.reason != NEWT_EXIT_HOTKEY)
 			break;
-		if (is_exit_key(es->u.key))
-			return es->u.key;
-		switch (es->u.key) {
+		switch (es.u.key) {
 		case NEWT_KEY_DOWN:
 			if (self->index == self->nr_entries - 1)
 				break;
@@ -274,12 +283,12 @@ int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es)
 			self->seek(self, -offset, SEEK_END);
 			break;
 		default:
-			return es->u.key;
+			return es.u.key;
 		}
 		if (ui_browser__refresh(self) < 0)
 			return -1;
 	}
-	return 0;
+	return -1;
 }
 
 unsigned int ui_browser__list_head_refresh(struct ui_browser *self)
@@ -294,7 +303,7 @@ unsigned int ui_browser__list_head_refresh(struct ui_browser *self)
 	pos = self->top;
 
 	list_for_each_from(pos, head) {
-		SLsmg_gotorc(self->y + row, self->x);
+		ui_browser__gotorc(self, row, 0);
 		self->write(self, pos, row);
 		if (++row == self->height)
 			break;
diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h
index 0b9f829214f..0dc7e4da36f 100644
--- a/tools/perf/util/ui/browser.h
+++ b/tools/perf/util/ui/browser.h
@@ -25,16 +25,21 @@ struct ui_browser {
 };
 
 
-int ui_browser__percent_color(double percent, bool current);
+void ui_browser__set_color(struct ui_browser *self, int color);
+void ui_browser__set_percent_color(struct ui_browser *self,
+				   double percent, bool current);
 bool ui_browser__is_current_entry(struct ui_browser *self, unsigned row);
 void ui_browser__refresh_dimensions(struct ui_browser *self);
 void ui_browser__reset_index(struct ui_browser *self);
 
+void ui_browser__gotorc(struct ui_browser *self, int y, int x);
+void ui_browser__add_exit_key(struct ui_browser *self, int key);
+void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]);
 int ui_browser__show(struct ui_browser *self, const char *title,
 		     const char *helpline, ...);
 void ui_browser__hide(struct ui_browser *self);
 int ui_browser__refresh(struct ui_browser *self);
-int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es);
+int ui_browser__run(struct ui_browser *self);
 
 void ui_browser__rb_tree_seek(struct ui_browser *self, off_t offset, int whence);
 unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self);
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index a90273e63f4..82b78f99251 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -40,14 +40,12 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
 
 	if (ol->offset != -1) {
 		struct objdump_line_rb_node *olrb = objdump_line__rb(ol);
-		int color = ui_browser__percent_color(olrb->percent, current_entry);
-		SLsmg_set_color(color);
+		ui_browser__set_percent_color(self, olrb->percent, current_entry);
 		slsmg_printf(" %7.2f ", olrb->percent);
 		if (!current_entry)
-			SLsmg_set_color(HE_COLORSET_CODE);
+			ui_browser__set_color(self, HE_COLORSET_CODE);
 	} else {
-		int color = ui_browser__percent_color(0, current_entry);
-		SLsmg_set_color(color);
+		ui_browser__set_percent_color(self, 0, current_entry);
 		slsmg_write_nstring(" ", 9);
 	}
 
@@ -135,32 +133,31 @@ static void annotate_browser__set_top(struct annotate_browser *self,
 	self->curr_hot = nd;
 }
 
-static int annotate_browser__run(struct annotate_browser *self,
-				 struct newtExitStruct *es)
+static int annotate_browser__run(struct annotate_browser *self)
 {
 	struct rb_node *nd;
 	struct hist_entry *he = self->b.priv;
+	int key;
 
 	if (ui_browser__show(&self->b, he->ms.sym->name,
-			     "<- or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
+			     "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
 		return -1;
-
-	newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-	newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT);
+	/*
+	 * To allow builtin-annotate to cycle thru multiple symbols by
+	 * examining the exit key for this function.
+	 */
+	ui_browser__add_exit_key(&self->b, NEWT_KEY_RIGHT);
 
 	nd = self->curr_hot;
 	if (nd) {
-		newtFormAddHotKey(self->b.form, NEWT_KEY_TAB);
-		newtFormAddHotKey(self->b.form, NEWT_KEY_UNTAB);
+		int tabs[] = { NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0 };
+		ui_browser__add_exit_keys(&self->b, tabs);
 	}
 
 	while (1) {
-		ui_browser__run(&self->b, es);
-
-		if (es->reason != NEWT_EXIT_HOTKEY)
-			break;
+		key = ui_browser__run(&self->b);
 
-		switch (es->u.key) {
+		switch (key) {
 		case NEWT_KEY_TAB:
 			nd = rb_prev(nd);
 			if (nd == NULL)
@@ -179,12 +176,11 @@ static int annotate_browser__run(struct annotate_browser *self,
 	}
 out:
 	ui_browser__hide(&self->b);
-	return es->u.key;
+	return key;
 }
 
 int hist_entry__tui_annotate(struct hist_entry *self)
 {
-	struct newtExitStruct es;
 	struct objdump_line *pos, *n;
 	struct objdump_line_rb_node *rbpos;
 	LIST_HEAD(head);
@@ -232,7 +228,7 @@ int hist_entry__tui_annotate(struct hist_entry *self)
 		annotate_browser__set_top(&browser, browser.curr_hot);
 
 	browser.b.width += 18; /* Percentage */
-	ret = annotate_browser__run(&browser, &es);
+	ret = annotate_browser__run(&browser);
 	list_for_each_entry_safe(pos, n, &head, node) {
 		list_del(&pos->node);
 		objdump_line__free(pos);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 6866aa4c41e..ebda8c3fde9 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -58,6 +58,11 @@ static char callchain_list__folded(const struct callchain_list *self)
 	return map_symbol__folded(&self->ms);
 }
 
+static void map_symbol__set_folding(struct map_symbol *self, bool unfold)
+{
+	self->unfolded = unfold ? self->has_children : false;
+}
+
 static int callchain_node__count_rows_rb_tree(struct callchain_node *self)
 {
 	int n = 0;
@@ -129,16 +134,16 @@ static void callchain_node__init_have_children_rb_tree(struct callchain_node *se
 	for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) {
 		struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node);
 		struct callchain_list *chain;
-		int first = true;
+		bool first = true;
 
 		list_for_each_entry(chain, &child->val, list) {
 			if (first) {
 				first = false;
 				chain->ms.has_children = chain->list.next != &child->val ||
-							 rb_first(&child->rb_root) != NULL;
+							 !RB_EMPTY_ROOT(&child->rb_root);
 			} else
 				chain->ms.has_children = chain->list.next == &child->val &&
-							 rb_first(&child->rb_root) != NULL;
+							 !RB_EMPTY_ROOT(&child->rb_root);
 		}
 
 		callchain_node__init_have_children_rb_tree(child);
@@ -150,7 +155,7 @@ static void callchain_node__init_have_children(struct callchain_node *self)
 	struct callchain_list *chain;
 
 	list_for_each_entry(chain, &self->val, list)
-		chain->ms.has_children = rb_first(&self->rb_root) != NULL;
+		chain->ms.has_children = !RB_EMPTY_ROOT(&self->rb_root);
 
 	callchain_node__init_have_children_rb_tree(self);
 }
@@ -168,6 +173,7 @@ static void callchain__init_have_children(struct rb_root *self)
 static void hist_entry__init_have_children(struct hist_entry *self)
 {
 	if (!self->init_have_children) {
+		self->ms.has_children = !RB_EMPTY_ROOT(&self->sorted_chain);
 		callchain__init_have_children(&self->sorted_chain);
 		self->init_have_children = true;
 	}
@@ -195,43 +201,114 @@ static bool hist_browser__toggle_fold(struct hist_browser *self)
 	return false;
 }
 
-static int hist_browser__run(struct hist_browser *self, const char *title,
-			     struct newtExitStruct *es)
+static int callchain_node__set_folding_rb_tree(struct callchain_node *self, bool unfold)
+{
+	int n = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) {
+		struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node);
+		struct callchain_list *chain;
+		bool has_children = false;
+
+		list_for_each_entry(chain, &child->val, list) {
+			++n;
+			map_symbol__set_folding(&chain->ms, unfold);
+			has_children = chain->ms.has_children;
+		}
+
+		if (has_children)
+			n += callchain_node__set_folding_rb_tree(child, unfold);
+	}
+
+	return n;
+}
+
+static int callchain_node__set_folding(struct callchain_node *node, bool unfold)
+{
+	struct callchain_list *chain;
+	bool has_children = false;
+	int n = 0;
+
+	list_for_each_entry(chain, &node->val, list) {
+		++n;
+		map_symbol__set_folding(&chain->ms, unfold);
+		has_children = chain->ms.has_children;
+	}
+
+	if (has_children)
+		n += callchain_node__set_folding_rb_tree(node, unfold);
+
+	return n;
+}
+
+static int callchain__set_folding(struct rb_root *chain, bool unfold)
+{
+	struct rb_node *nd;
+	int n = 0;
+
+	for (nd = rb_first(chain); nd; nd = rb_next(nd)) {
+		struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node);
+		n += callchain_node__set_folding(node, unfold);
+	}
+
+	return n;
+}
+
+static void hist_entry__set_folding(struct hist_entry *self, bool unfold)
+{
+	hist_entry__init_have_children(self);
+	map_symbol__set_folding(&self->ms, unfold);
+
+	if (self->ms.has_children) {
+		int n = callchain__set_folding(&self->sorted_chain, unfold);
+		self->nr_rows = unfold ? n : 0;
+	} else
+		self->nr_rows = 0;
+}
+
+static void hists__set_folding(struct hists *self, bool unfold)
+{
+	struct rb_node *nd;
+
+	self->nr_entries = 0;
+
+	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+		struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
+		hist_entry__set_folding(he, unfold);
+		self->nr_entries += 1 + he->nr_rows;
+	}
+}
+
+static void hist_browser__set_folding(struct hist_browser *self, bool unfold)
+{
+	hists__set_folding(self->hists, unfold);
+	self->b.nr_entries = self->hists->nr_entries;
+	/* Go to the start, we may be way after valid entries after a collapse */
+	ui_browser__reset_index(&self->b);
+}
+
+static int hist_browser__run(struct hist_browser *self, const char *title)
 {
-	char str[256], unit;
-	unsigned long nr_events = self->hists->stats.nr_events[PERF_RECORD_SAMPLE];
+	int key;
+	int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't',
+			    NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, 0, };
 
 	self->b.entries = &self->hists->entries;
 	self->b.nr_entries = self->hists->nr_entries;
 
 	hist_browser__refresh_dimensions(self);
 
-	nr_events = convert_unit(nr_events, &unit);
-	snprintf(str, sizeof(str), "Events: %lu%c                            ",
-		 nr_events, unit);
-	newtDrawRootText(0, 0, str);
-
 	if (ui_browser__show(&self->b, title,
 			     "Press '?' for help on key bindings") < 0)
 		return -1;
 
-	newtFormAddHotKey(self->b.form, 'a');
-	newtFormAddHotKey(self->b.form, '?');
-	newtFormAddHotKey(self->b.form, 'h');
-	newtFormAddHotKey(self->b.form, 'd');
-	newtFormAddHotKey(self->b.form, 'D');
-	newtFormAddHotKey(self->b.form, 't');
-
-	newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-	newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT);
-	newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER);
+	ui_browser__add_exit_keys(&self->b, exit_keys);
 
 	while (1) {
-		ui_browser__run(&self->b, es);
+		key = ui_browser__run(&self->b);
 
-		if (es->reason != NEWT_EXIT_HOTKEY)
-			break;
-		switch (es->u.key) {
+		switch (key) {
 		case 'D': { /* Debug */
 			static int seq;
 			struct hist_entry *h = rb_entry(self->b.top,
@@ -245,18 +322,26 @@ static int hist_browser__run(struct hist_browser *self, const char *title,
 					   self->b.top_idx,
 					   h->row_offset, h->nr_rows);
 		}
-			continue;
+			break;
+		case 'C':
+			/* Collapse the whole world. */
+			hist_browser__set_folding(self, false);
+			break;
+		case 'E':
+			/* Expand the whole world. */
+			hist_browser__set_folding(self, true);
+			break;
 		case NEWT_KEY_ENTER:
 			if (hist_browser__toggle_fold(self))
 				break;
 			/* fall thru */
 		default:
-			return 0;
+			goto out;
 		}
 	}
-
+out:
 	ui_browser__hide(&self->b);
-	return 0;
+	return key;
 }
 
 static char *callchain_list__sym_name(struct callchain_list *self,
@@ -306,15 +391,10 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
 			int color;
 			bool was_first = first;
 
-			if (first) {
+			if (first)
 				first = false;
-				chain->ms.has_children = chain->list.next != &child->val ||
-							 rb_first(&child->rb_root) != NULL;
-			} else {
+			else
 				extra_offset = LEVEL_OFFSET_STEP;
-				chain->ms.has_children = chain->list.next == &child->val &&
-							 rb_first(&child->rb_root) != NULL;
-			}
 
 			folded_sign = callchain_list__folded(chain);
 			if (*row_offset != 0) {
@@ -341,8 +421,8 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
 				*is_current_entry = true;
 			}
 
-			SLsmg_set_color(color);
-			SLsmg_gotorc(self->b.y + row, self->b.x);
+			ui_browser__set_color(&self->b, color);
+			ui_browser__gotorc(&self->b, row, 0);
 			slsmg_write_nstring(" ", offset + extra_offset);
 			slsmg_printf("%c ", folded_sign);
 			slsmg_write_nstring(str, width);
@@ -384,12 +464,7 @@ static int hist_browser__show_callchain_node(struct hist_browser *self,
 	list_for_each_entry(chain, &node->val, list) {
 		char ipstr[BITS_PER_LONG / 4 + 1], *s;
 		int color;
-		/*
-		 * FIXME: This should be moved to somewhere else,
-		 * probably when the callchain is created, so as not to
-		 * traverse it all over again
-		 */
-		chain->ms.has_children = rb_first(&node->rb_root) != NULL;
+
 		folded_sign = callchain_list__folded(chain);
 
 		if (*row_offset != 0) {
@@ -405,8 +480,8 @@ static int hist_browser__show_callchain_node(struct hist_browser *self,
 		}
 
 		s = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
-		SLsmg_gotorc(self->b.y + row, self->b.x);
-		SLsmg_set_color(color);
+		ui_browser__gotorc(&self->b, row, 0);
+		ui_browser__set_color(&self->b, color);
 		slsmg_write_nstring(" ", offset);
 		slsmg_printf("%c ", folded_sign);
 		slsmg_write_nstring(s, width - 2);
@@ -465,7 +540,7 @@ static int hist_browser__show_entry(struct hist_browser *self,
 	}
 
 	if (symbol_conf.use_callchain) {
-		entry->ms.has_children = !RB_EMPTY_ROOT(&entry->sorted_chain);
+		hist_entry__init_have_children(entry);
 		folded_sign = hist_entry__folded(entry);
 	}
 
@@ -484,8 +559,8 @@ static int hist_browser__show_entry(struct hist_browser *self,
 				color = HE_COLORSET_NORMAL;
 		}
 
-		SLsmg_set_color(color);
-		SLsmg_gotorc(self->b.y + row, self->b.x);
+		ui_browser__set_color(&self->b, color);
+		ui_browser__gotorc(&self->b, row, 0);
 		if (symbol_conf.use_callchain) {
 			slsmg_printf("%c ", folded_sign);
 			width -= 2;
@@ -687,8 +762,6 @@ static struct hist_browser *hist_browser__new(struct hists *hists)
 
 static void hist_browser__delete(struct hist_browser *self)
 {
-	newtFormDestroy(self->b.form);
-	newtPopWindow();
 	free(self);
 }
 
@@ -702,21 +775,26 @@ static struct thread *hist_browser__selected_thread(struct hist_browser *self)
 	return self->he_selection->thread;
 }
 
-static int hist_browser__title(char *bf, size_t size, const char *ev_name,
-			       const struct dso *dso, const struct thread *thread)
+static int hists__browser_title(struct hists *self, char *bf, size_t size,
+				const char *ev_name, const struct dso *dso,
+				const struct thread *thread)
 {
-	int printed = 0;
+	char unit;
+	int printed;
+	unsigned long nr_events = self->stats.nr_events[PERF_RECORD_SAMPLE];
+
+	nr_events = convert_unit(nr_events, &unit);
+	printed = snprintf(bf, size, "Events: %lu%c %s", nr_events, unit, ev_name);
 
 	if (thread)
 		printed += snprintf(bf + printed, size - printed,
-				    "Thread: %s(%d)",
-				    (thread->comm_set ?  thread->comm : ""),
+				    ", Thread: %s(%d)",
+				    (thread->comm_set ? thread->comm : ""),
 				    thread->pid);
 	if (dso)
 		printed += snprintf(bf + printed, size - printed,
-				    "%sDSO: %s", thread ? " " : "",
-				    dso->short_name);
-	return printed ?: snprintf(bf, size, "Event: %s", ev_name);
+				    ", DSO: %s", dso->short_name);
+	return printed;
 }
 
 int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
@@ -725,7 +803,6 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 	struct pstack *fstack;
 	const struct thread *thread_filter = NULL;
 	const struct dso *dso_filter = NULL;
-	struct newtExitStruct es;
 	char msg[160];
 	int key = -1;
 
@@ -738,9 +815,8 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 
 	ui_helpline__push(helpline);
 
-	hist_browser__title(msg, sizeof(msg), ev_name,
-			    dso_filter, thread_filter);
-
+	hists__browser_title(self, msg, sizeof(msg), ev_name,
+			     dso_filter, thread_filter);
 	while (1) {
 		const struct thread *thread;
 		const struct dso *dso;
@@ -749,70 +825,63 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
 		    browse_map = -2;
 
-		if (hist_browser__run(browser, msg, &es))
-			break;
+		key = hist_browser__run(browser, msg);
 
 		thread = hist_browser__selected_thread(browser);
 		dso = browser->selection->map ? browser->selection->map->dso : NULL;
 
-		if (es.reason == NEWT_EXIT_HOTKEY) {
-			key = es.u.key;
-
-			switch (key) {
-			case NEWT_KEY_F1:
-				goto do_help;
-			case NEWT_KEY_TAB:
-			case NEWT_KEY_UNTAB:
-				/*
-				 * Exit the browser, let hists__browser_tree
-				 * go to the next or previous
-				 */
-				goto out_free_stack;
-			default:;
-			}
-
-			switch (key) {
-			case 'a':
-				if (browser->selection->map == NULL ||
-				    browser->selection->map->dso->annotate_warned)
-					continue;
-				goto do_annotate;
-			case 'd':
-				goto zoom_dso;
-			case 't':
-				goto zoom_thread;
-			case 'h':
-			case '?':
-do_help:
-				ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
-						"<-        Zoom out\n"
-						"a         Annotate current symbol\n"
-						"h/?/F1    Show this window\n"
-						"d         Zoom into current DSO\n"
-						"t         Zoom into current Thread\n"
-						"q/CTRL+C  Exit browser");
+		switch (key) {
+		case NEWT_KEY_TAB:
+		case NEWT_KEY_UNTAB:
+			/*
+			 * Exit the browser, let hists__browser_tree
+			 * go to the next or previous
+			 */
+			goto out_free_stack;
+		case 'a':
+			if (browser->selection->map == NULL &&
+			    browser->selection->map->dso->annotate_warned)
 				continue;
-			default:;
-			}
-			if (is_exit_key(key)) {
-				if (key == NEWT_KEY_ESCAPE &&
-				    !ui__dialog_yesno("Do you really want to exit?"))
-					continue;
-				break;
-			}
-
-			if (es.u.key == NEWT_KEY_LEFT) {
-				const void *top;
+			goto do_annotate;
+		case 'd':
+			goto zoom_dso;
+		case 't':
+			goto zoom_thread;
+		case NEWT_KEY_F1:
+		case 'h':
+		case '?':
+			ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
+					"<-        Zoom out\n"
+					"a         Annotate current symbol\n"
+					"h/?/F1    Show this window\n"
+					"C         Collapse all callchains\n"
+					"E         Expand all callchains\n"
+					"d         Zoom into current DSO\n"
+					"t         Zoom into current Thread\n"
+					"q/CTRL+C  Exit browser");
+			continue;
+		case NEWT_KEY_ENTER:
+		case NEWT_KEY_RIGHT:
+			/* menu */
+			break;
+		case NEWT_KEY_LEFT: {
+			const void *top;
 
-				if (pstack__empty(fstack))
-					continue;
-				top = pstack__pop(fstack);
-				if (top == &dso_filter)
-					goto zoom_out_dso;
-				if (top == &thread_filter)
-					goto zoom_out_thread;
+			if (pstack__empty(fstack))
 				continue;
-			}
+			top = pstack__pop(fstack);
+			if (top == &dso_filter)
+				goto zoom_out_dso;
+			if (top == &thread_filter)
+				goto zoom_out_thread;
+			continue;
+		}
+		case NEWT_KEY_ESCAPE:
+			if (!ui__dialog_yesno("Do you really want to exit?"))
+				continue;
+			/* Fall thru */
+		default:
+			goto out_free_stack;
 		}
 
 		if (browser->selection->sym != NULL &&
@@ -885,8 +954,8 @@ zoom_out_dso:
 				pstack__push(fstack, &dso_filter);
 			}
 			hists__filter_by_dso(self, dso_filter);
-			hist_browser__title(msg, sizeof(msg), ev_name,
-					    dso_filter, thread_filter);
+			hists__browser_title(self, msg, sizeof(msg), ev_name,
+					     dso_filter, thread_filter);
 			hist_browser__reset(browser);
 		} else if (choice == zoom_thread) {
 zoom_thread:
@@ -903,8 +972,8 @@ zoom_out_thread:
 				pstack__push(fstack, &thread_filter);
 			}
 			hists__filter_by_thread(self, thread_filter);
-			hist_browser__title(msg, sizeof(msg), ev_name,
-					    dso_filter, thread_filter);
+			hists__browser_title(self, msg, sizeof(msg), ev_name,
+					     dso_filter, thread_filter);
 			hist_browser__reset(browser);
 		}
 	}
@@ -925,10 +994,6 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)
 		const char *ev_name = __event_name(hists->type, hists->config);
 
 		key = hists__browse(hists, help, ev_name);
-
-		if (is_exit_key(key))
-			break;
-
 		switch (key) {
 		case NEWT_KEY_TAB:
 			next = rb_next(nd);
@@ -940,7 +1005,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)
 				continue;
 			nd = rb_prev(nd);
 		default:
-			break;
+			return key;
 		}
 	}
 
diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c
index 142b825b42b..e35437dfa5b 100644
--- a/tools/perf/util/ui/browsers/map.c
+++ b/tools/perf/util/ui/browsers/map.c
@@ -1,6 +1,5 @@
 #include "../libslang.h"
 #include <elf.h>
-#include <newt.h>
 #include <sys/ttydefaults.h>
 #include <ctype.h>
 #include <string.h>
@@ -47,7 +46,6 @@ out_free_form:
 struct map_browser {
 	struct ui_browser b;
 	struct map	  *map;
-	u16		  namelen;
 	u8		  addrlen;
 };
 
@@ -56,14 +54,16 @@ static void map_browser__write(struct ui_browser *self, void *nd, int row)
 	struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
 	struct map_browser *mb = container_of(self, struct map_browser, b);
 	bool current_entry = ui_browser__is_current_entry(self, row);
-	int color = ui_browser__percent_color(0, current_entry);
+	int width;
 
-	SLsmg_set_color(color);
+	ui_browser__set_percent_color(self, 0, current_entry);
 	slsmg_printf("%*llx %*llx %c ",
 		     mb->addrlen, sym->start, mb->addrlen, sym->end,
 		     sym->binding == STB_GLOBAL ? 'g' :
 		     sym->binding == STB_LOCAL  ? 'l' : 'w');
-	slsmg_write_nstring(sym->name, mb->namelen);
+	width = self->width - ((mb->addrlen * 2) + 4);
+	if (width > 0)
+		slsmg_write_nstring(sym->name, width);
 }
 
 /* FIXME uber-kludgy, see comment on cmd_report... */
@@ -98,31 +98,29 @@ static int map_browser__search(struct map_browser *self)
 	return 0;
 }
 
-static int map_browser__run(struct map_browser *self, struct newtExitStruct *es)
+static int map_browser__run(struct map_browser *self)
 {
+	int key;
+
 	if (ui_browser__show(&self->b, self->map->dso->long_name,
 			     "Press <- or ESC to exit, %s / to search",
 			     verbose ? "" : "restart with -v to use") < 0)
 		return -1;
 
-	newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-	newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER);
 	if (verbose)
-		newtFormAddHotKey(self->b.form, '/');
+		ui_browser__add_exit_key(&self->b, '/');
 
 	while (1) {
-		ui_browser__run(&self->b, es);
+		key = ui_browser__run(&self->b);
 
-		if (es->reason != NEWT_EXIT_HOTKEY)
-			break;
-		if (verbose && es->u.key == '/')
+		if (verbose && key == '/')
 			map_browser__search(self);
 		else
 			break;
 	}
 
 	ui_browser__hide(&self->b);
-	return 0;
+	return key;
 }
 
 int map__browse(struct map *self)
@@ -136,7 +134,6 @@ int map__browse(struct map *self)
 		},
 		.map = self,
 	};
-	struct newtExitStruct es;
 	struct rb_node *nd;
 	char tmp[BITS_PER_LONG / 4];
 	u64 maxaddr = 0;
@@ -144,8 +141,6 @@ int map__browse(struct map *self)
 	for (nd = rb_first(mb.b.entries); nd; nd = rb_next(nd)) {
 		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
 
-		if (mb.namelen < pos->namelen)
-			mb.namelen = pos->namelen;
 		if (maxaddr < pos->end)
 			maxaddr = pos->end;
 		if (verbose) {
@@ -156,6 +151,5 @@ int map__browse(struct map *self)
 	}
 
 	mb.addrlen = snprintf(tmp, sizeof(tmp), "%llx", maxaddr);
-	mb.b.width += mb.addrlen * 2 + 4 + mb.namelen;
-	return map_browser__run(&mb, &es);
+	return map_browser__run(&mb);
 }
diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c
index 04600e26cee..9706d9d4027 100644
--- a/tools/perf/util/ui/util.c
+++ b/tools/perf/util/ui/util.c
@@ -11,8 +11,6 @@
 #include "helpline.h"
 #include "util.h"
 
-newtComponent newt_form__new(void);
-
 static void newt_form__set_exit_keys(newtComponent self)
 {
 	newtFormAddHotKey(self, NEWT_KEY_LEFT);
@@ -22,7 +20,7 @@ static void newt_form__set_exit_keys(newtComponent self)
 	newtFormAddHotKey(self, CTRL('c'));
 }
 
-newtComponent newt_form__new(void)
+static newtComponent newt_form__new(void)
 {
 	newtComponent self = newtForm(NULL, NULL, 0);
 	if (self)
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index f380fed7435..7562707ddd1 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -266,19 +266,6 @@ bool strglobmatch(const char *str, const char *pat);
 bool strlazymatch(const char *str, const char *pat);
 unsigned long convert_unit(unsigned long value, char *unit);
 
-#ifndef ESC
-#define ESC 27
-#endif
-
-static inline bool is_exit_key(int key)
-{
-	char up;
-	if (key == CTRL('c') || key == ESC)
-		return true;
-	up = toupper(key);
-	return up == 'Q';
-}
-
 #define _STR(x) #x
 #define STR(x) _STR(x)