From 86069782d62e731b4835a0cf8eb7d1d0e17cf306 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:20:56 +0200
Subject: x86: add a list for custom page fault handlers.

Provides kernel modules a way to register custom page fault handlers.
On every page fault this will call a list of registered functions. The
functions may handle the fault and force do_page_fault() to return
immediately.

This functionality is similar to the now removed page fault notifiers.
Custom page fault handlers are used by debugging and reverse engineering
tools. Mmiotrace is one such tool and a patch to add it into the tree
will follow.

The custom page fault handlers are called earlier in do_page_fault()
than the page fault notifiers were.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/fault.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75..343f5c1aacc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -49,6 +49,60 @@
 #define PF_RSVD		(1<<3)
 #define PF_INSTR	(1<<4)
 
+#ifdef CONFIG_PAGE_FAULT_HANDLERS
+static HLIST_HEAD(pf_handlers); /* protected by RCU */
+static DEFINE_SPINLOCK(pf_handlers_writer);
+
+void register_page_fault_handler(struct pf_handler *new_pfh)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pf_handlers_writer, flags);
+	hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers);
+	spin_unlock_irqrestore(&pf_handlers_writer, flags);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_handler);
+
+/**
+ * unregister_page_fault_handler:
+ * The caller must ensure @old_pfh is not in use anymore before freeing it.
+ * This function does not guarantee it. The list of handlers is protected by
+ * RCU, so you can do this by e.g. calling synchronize_rcu().
+ */
+void unregister_page_fault_handler(struct pf_handler *old_pfh)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pf_handlers_writer, flags);
+	hlist_del_rcu(&old_pfh->hlist);
+	spin_unlock_irqrestore(&pf_handlers_writer, flags);
+}
+EXPORT_SYMBOL_GPL(unregister_page_fault_handler);
+#endif
+
+/* returns non-zero if do_page_fault() should return */
+static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code,
+							unsigned long address)
+{
+#ifdef CONFIG_PAGE_FAULT_HANDLERS
+	int ret = 0;
+	struct pf_handler *cur;
+	struct hlist_node *ncur;
+
+	if (hlist_empty(&pf_handlers))
+		return 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) {
+		ret = cur->handler(regs, error_code, address);
+		if (ret)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+#else
+	return 0;
+#endif
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 #ifdef CONFIG_KPROBES
@@ -601,6 +655,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (notify_page_fault(regs))
 		return;
+	if (handle_custom_pf(regs, error_code, address))
+		return;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
-- 
cgit v1.2.3-70-g09d2


From 10c43d2eb50c9a5ad60388b9d3c41c31150049e6 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:20:57 +0200
Subject: x86: explicit call to mmiotrace in do_page_fault()

The custom page fault handler list is replaced with a single function
pointer. All related functions and variables are renamed for
mmiotrace.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: pq@iki.fi
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig.debug            | 14 ++++-----
 arch/x86/kernel/mmiotrace/kmmio.c | 14 ++++-----
 arch/x86/mm/fault.c               | 66 ++++++++++++++++++++-------------------
 include/asm-x86/kdebug.h          | 12 +++----
 4 files changed, 52 insertions(+), 54 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 7c6496e2225..9491c0ae03a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -168,20 +168,18 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
-config PAGE_FAULT_HANDLERS
-	bool "Custom page fault handlers"
-	depends on DEBUG_KERNEL
-	help
-	  Allow the use of custom page fault handlers. A kernel module may
-	  register a function that is called on every page fault. Custom
-	  handlers are used by some debugging and reverse engineering tools.
+config MMIOTRACE_HOOKS
+	bool
+	default n
 
 config MMIOTRACE
 	tristate "Memory mapped IO tracing"
-	depends on DEBUG_KERNEL && PAGE_FAULT_HANDLERS && RELAY && DEBUG_FS
+	depends on DEBUG_KERNEL && RELAY && DEBUG_FS
+	select MMIOTRACE_HOOKS
 	default n
 	help
 	  This will build a kernel module called mmiotrace.
+	  Making this a built-in is heavily discouraged.
 
 	  Mmiotrace traces Memory Mapped I/O access and is meant for debugging
 	  and reverse engineering. The kernel module offers wrapped
diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c
index 28411dadb8b..e759f7c3878 100644
--- a/arch/x86/kernel/mmiotrace/kmmio.c
+++ b/arch/x86/kernel/mmiotrace/kmmio.c
@@ -51,10 +51,6 @@ static LIST_HEAD(kmmio_probes);
 
 static struct kmmio_context kmmio_ctx[NR_CPUS];
 
-static struct pf_handler kmmio_pf_hook = {
-	.handler = kmmio_page_fault
-};
-
 static struct notifier_block nb_die = {
 	.notifier_call = kmmio_die_notifier
 };
@@ -77,7 +73,8 @@ void cleanup_kmmio(void)
 	 * kmmio_page_table, kmmio_probes
 	 */
 	if (handler_registered) {
-		unregister_page_fault_handler(&kmmio_pf_hook);
+		if (mmiotrace_unregister_pf(&kmmio_page_fault))
+			BUG();
 		synchronize_rcu();
 	}
 	unregister_die_notifier(&nb_die);
@@ -343,8 +340,11 @@ int register_kmmio_probe(struct kmmio_probe *p)
 	}
 
 	if (!handler_registered) {
-		register_page_fault_handler(&kmmio_pf_hook);
-		handler_registered++;
+		if (mmiotrace_register_pf(&kmmio_page_fault))
+			printk(KERN_ERR "mmiotrace: Cannot register page "
+					"fault handler.\n");
+		else
+			handler_registered++;
 	}
 
 out:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 343f5c1aacc..e9a086a1a9f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -49,53 +49,55 @@
 #define PF_RSVD		(1<<3)
 #define PF_INSTR	(1<<4)
 
-#ifdef CONFIG_PAGE_FAULT_HANDLERS
-static HLIST_HEAD(pf_handlers); /* protected by RCU */
-static DEFINE_SPINLOCK(pf_handlers_writer);
+#ifdef CONFIG_MMIOTRACE_HOOKS
+static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */
+static DEFINE_SPINLOCK(mmiotrace_handler_lock);
 
-void register_page_fault_handler(struct pf_handler *new_pfh)
+int mmiotrace_register_pf(pf_handler_func new_pfh)
 {
+	int ret = 0;
 	unsigned long flags;
-	spin_lock_irqsave(&pf_handlers_writer, flags);
-	hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers);
-	spin_unlock_irqrestore(&pf_handlers_writer, flags);
+	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
+	if (mmiotrace_pf_handler)
+		ret = -EBUSY;
+	else
+		mmiotrace_pf_handler = new_pfh;
+	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(register_page_fault_handler);
+EXPORT_SYMBOL_GPL(mmiotrace_register_pf);
 
 /**
- * unregister_page_fault_handler:
+ * mmiotrace_unregister_pf:
  * The caller must ensure @old_pfh is not in use anymore before freeing it.
- * This function does not guarantee it. The list of handlers is protected by
- * RCU, so you can do this by e.g. calling synchronize_rcu().
+ * This function does not guarantee it. The handler function pointer is
+ * protected by RCU, so you can do this by e.g. calling synchronize_rcu().
  */
-void unregister_page_fault_handler(struct pf_handler *old_pfh)
+int mmiotrace_unregister_pf(pf_handler_func old_pfh)
 {
+	int ret = 0;
 	unsigned long flags;
-	spin_lock_irqsave(&pf_handlers_writer, flags);
-	hlist_del_rcu(&old_pfh->hlist);
-	spin_unlock_irqrestore(&pf_handlers_writer, flags);
+	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
+	if (mmiotrace_pf_handler != old_pfh)
+		ret = -EPERM;
+	else
+		mmiotrace_pf_handler = NULL;
+	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(unregister_page_fault_handler);
-#endif
+EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf);
+#endif /* CONFIG_MMIOTRACE_HOOKS */
 
 /* returns non-zero if do_page_fault() should return */
-static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code,
-							unsigned long address)
+static inline int call_mmiotrace(struct pt_regs *regs,
+					unsigned long error_code,
+					unsigned long address)
 {
-#ifdef CONFIG_PAGE_FAULT_HANDLERS
+#ifdef CONFIG_MMIOTRACE_HOOKS
 	int ret = 0;
-	struct pf_handler *cur;
-	struct hlist_node *ncur;
-
-	if (hlist_empty(&pf_handlers))
-		return 0;
-
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) {
-		ret = cur->handler(regs, error_code, address);
-		if (ret)
-			break;
-	}
+	if (mmiotrace_pf_handler)
+		ret = mmiotrace_pf_handler(regs, error_code, address);
 	rcu_read_unlock();
 	return ret;
 #else
@@ -655,7 +657,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (notify_page_fault(regs))
 		return;
-	if (handle_custom_pf(regs, error_code, address))
+	if (call_mmiotrace(regs, error_code, address))
 		return;
 
 	/*
diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h
index a80f2d6cc73..7063281040d 100644
--- a/include/asm-x86/kdebug.h
+++ b/include/asm-x86/kdebug.h
@@ -35,13 +35,11 @@ extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
-struct pf_handler {
-	struct hlist_node hlist;
-	int (*handler)(struct pt_regs *regs, unsigned long error_code,
-						unsigned long address);
-};
+typedef int (*pf_handler_func)(struct pt_regs *regs,
+				unsigned long error_code,
+				unsigned long address);
 
-extern void register_page_fault_handler(struct pf_handler *new_pfh);
-extern void unregister_page_fault_handler(struct pf_handler *old_pfh);
+extern int mmiotrace_register_pf(pf_handler_func new_pfh);
+extern int mmiotrace_unregister_pf(pf_handler_func old_pfh);
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 0fd0e3da4557c479b820b9a4a7afa25b4637ddf2 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:20:57 +0200
Subject: x86: mmiotrace full patch, preview 1

kmmio.c handles the list of mmio probes with callbacks, list of traced
pages, and attaching into the page fault handler and die notifier. It
arms, traps and disarms the given pages, this is the core of mmiotrace.

mmio-mod.c is a user interface, hooking into ioremap functions and
registering the mmio probes. It also decodes the required information
from trapped mmio accesses via the pre and post callbacks in each probe.
Currently, hooking into ioremap functions works by redefining the symbols
of the target (binary) kernel module, so that it calls the traced
versions of the functions.

The most notable changes done since the last discussion are:
- kmmio.c is a built-in, not part of the module
- direct call from fault.c to kmmio.c, removing all dynamic hooks
- prepare for unregistering probes at any time
- make kmmio re-initializable and accessible to more than one user
- rewrite kmmio locking to remove all spinlocks from page fault path

Can I abuse call_rcu() like I do in kmmio.c:unregister_kmmio_probe()
or is there a better way?

The function called via call_rcu() itself calls call_rcu() again,
will this work or break? There I need a second grace period for RCU
after the first grace period for page faults.

Mmiotrace itself (mmio-mod.c) is still a module, I am going to attack
that next. At some point I will start looking into how to make mmiotrace
a tracer component of ftrace (thanks for the hint, Ingo). Ftrace should
make the user space part of mmiotracing as simple as
'cat /debug/trace/mmio > dump.txt'.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/init_task.c               |   1 -
 arch/x86/kernel/mmiotrace/Makefile        |   8 +-
 arch/x86/kernel/mmiotrace/kmmio.c         | 349 ++++++++++++++++++++----------
 arch/x86/kernel/mmiotrace/kmmio.h         |  58 -----
 arch/x86/kernel/mmiotrace/mmio-mod.c      |  81 ++++---
 arch/x86/kernel/mmiotrace/pf_in.c         |   2 +-
 arch/x86/kernel/mmiotrace/testmmiotrace.c |  13 +-
 arch/x86/mm/fault.c                       |  59 +----
 include/asm-x86/kdebug.h                  |   7 -
 include/linux/mmiotrace.h                 |  38 ++++
 10 files changed, 335 insertions(+), 281 deletions(-)
 delete mode 100644 arch/x86/kernel/mmiotrace/kmmio.h

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 027a5b6a12b..a4f93b4120c 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,7 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
 EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
-EXPORT_SYMBOL_GPL(init_mm);
 
 /*
  * Initial thread structure.
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile
index d6905f7f981..cf1e747b463 100644
--- a/arch/x86/kernel/mmiotrace/Makefile
+++ b/arch/x86/kernel/mmiotrace/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
-mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o
-
-obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
+obj-$(CONFIG_MMIOTRACE_HOOKS)	+= kmmio.o
+obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
+mmiotrace-objs			:= pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c
index 5e239d0b846..539a9b19588 100644
--- a/arch/x86/kernel/mmiotrace/kmmio.c
+++ b/arch/x86/kernel/mmiotrace/kmmio.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/version.h>
+#include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/hash.h>
 #include <linux/init.h>
@@ -17,70 +18,119 @@
 #include <linux/ptrace.h>
 #include <linux/preempt.h>
 #include <linux/percpu.h>
+#include <linux/kdebug.h>
 #include <asm/io.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include "kmmio.h"
+#include <linux/mmiotrace.h>
 
-#define KMMIO_HASH_BITS 6
-#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS)
 #define KMMIO_PAGE_HASH_BITS 4
 #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
 
+struct kmmio_fault_page {
+	struct list_head list;
+	struct kmmio_fault_page *release_next;
+	unsigned long page; /* location of the fault page */
+
+	/*
+	 * Number of times this page has been registered as a part
+	 * of a probe. If zero, page is disarmed and this may be freed.
+	 * Used only by writers (RCU).
+	 */
+	int count;
+};
+
+struct kmmio_delayed_release {
+	struct rcu_head rcu;
+	struct kmmio_fault_page *release_list;
+};
+
 struct kmmio_context {
 	struct kmmio_fault_page *fpage;
 	struct kmmio_probe *probe;
 	unsigned long saved_flags;
+	unsigned long addr;
 	int active;
 };
 
-static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code,
-						unsigned long address);
 static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
 								void *args);
 
+static DECLARE_MUTEX(kmmio_init_mutex);
 static DEFINE_SPINLOCK(kmmio_lock);
 
 /* These are protected by kmmio_lock */
+static int kmmio_initialized;
 unsigned int kmmio_count;
-static unsigned int handler_registered;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
 static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
 static LIST_HEAD(kmmio_probes);
 
+static struct list_head *kmmio_page_list(unsigned long page)
+{
+	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+}
+
 /* Accessed per-cpu */
 static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
 
+/* protected by kmmio_init_mutex */
 static struct notifier_block nb_die = {
 	.notifier_call = kmmio_die_notifier
 };
 
-int init_kmmio(void)
+/**
+ * Makes sure kmmio is initialized and usable.
+ * This must be called before any other kmmio function defined here.
+ * May sleep.
+ */
+void reference_kmmio(void)
 {
-	int i;
-	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
-		INIT_LIST_HEAD(&kmmio_page_table[i]);
-
-	register_die_notifier(&nb_die);
-	return 0;
+	down(&kmmio_init_mutex);
+	spin_lock_irq(&kmmio_lock);
+	if (!kmmio_initialized) {
+		int i;
+		for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+			INIT_LIST_HEAD(&kmmio_page_table[i]);
+		if (register_die_notifier(&nb_die))
+			BUG();
+	}
+	kmmio_initialized++;
+	spin_unlock_irq(&kmmio_lock);
+	up(&kmmio_init_mutex);
 }
+EXPORT_SYMBOL_GPL(reference_kmmio);
 
-void cleanup_kmmio(void)
+/**
+ * Clean up kmmio after use. This must be called for every call to
+ * reference_kmmio(). All probes registered after the corresponding
+ * reference_kmmio() must have been unregistered when calling this.
+ * May sleep.
+ */
+void unreference_kmmio(void)
 {
-	/*
-	 * Assume the following have been already cleaned by calling
-	 * unregister_kmmio_probe() appropriately:
-	 * kmmio_page_table, kmmio_probes
-	 */
-	if (handler_registered) {
-		if (mmiotrace_unregister_pf(&kmmio_page_fault))
-			BUG();
-		synchronize_rcu();
+	bool unreg = false;
+
+	down(&kmmio_init_mutex);
+	spin_lock_irq(&kmmio_lock);
+
+	if (kmmio_initialized == 1) {
+		BUG_ON(is_kmmio_active());
+		unreg = true;
 	}
-	unregister_die_notifier(&nb_die);
+	kmmio_initialized--;
+	BUG_ON(kmmio_initialized < 0);
+	spin_unlock_irq(&kmmio_lock);
+
+	if (unreg)
+		unregister_die_notifier(&nb_die); /* calls sync_rcu() */
+	up(&kmmio_init_mutex);
 }
+EXPORT_SYMBOL(unreference_kmmio);
 
 /*
  * this is basically a dynamic stabbing problem:
@@ -90,33 +140,33 @@ void cleanup_kmmio(void)
  * Overlap a Point (might be simple)
  * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
  */
-/* Get the kmmio at this addr (if any). You must be holding kmmio_lock. */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
 static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 {
 	struct kmmio_probe *p;
-	list_for_each_entry(p, &kmmio_probes, list) {
+	list_for_each_entry_rcu(p, &kmmio_probes, list) {
 		if (addr >= p->addr && addr <= (p->addr + p->len))
 			return p;
 	}
 	return NULL;
 }
 
+/* You must be holding RCU read lock. */
 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 {
-	struct list_head *head, *tmp;
+	struct list_head *head;
+	struct kmmio_fault_page *p;
 
 	page &= PAGE_MASK;
-	head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
-	list_for_each(tmp, head) {
-		struct kmmio_fault_page *p
-			= list_entry(tmp, struct kmmio_fault_page, list);
+	head = kmmio_page_list(page);
+	list_for_each_entry_rcu(p, head, list) {
 		if (p->page == page)
 			return p;
 	}
-
 	return NULL;
 }
 
+/** Mark the given page as not present. Access to it will trigger a fault. */
 static void arm_kmmio_fault_page(unsigned long page, int *page_level)
 {
 	unsigned long address = page & PAGE_MASK;
@@ -124,8 +174,8 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level)
 	pte_t *pte = lookup_address(address, &level);
 
 	if (!pte) {
-		printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n",
-						__FUNCTION__, page);
+		pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n",
+							__func__, page);
 		return;
 	}
 
@@ -143,6 +193,7 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level)
 	__flush_tlb_one(page);
 }
 
+/** Mark the given page as present. */
 static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
 {
 	unsigned long address = page & PAGE_MASK;
@@ -150,8 +201,8 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
 	pte_t *pte = lookup_address(address, &level);
 
 	if (!pte) {
-		printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n",
-						__FUNCTION__, page);
+		pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n",
+							__func__, page);
 		return;
 	}
 
@@ -169,13 +220,25 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
 	__flush_tlb_one(page);
 }
 
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also, prefetching may
+ * trigger a page fault. We may be in the middle of a process switch.
+ * We cannot take any locks, because we could be executing
+ * inside a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
 /*
  * Interrupts are disabled on entry as trap3 is an interrupt gate
  * and they remain disabled thorough out this function.
  */
-static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 {
-	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+	struct kmmio_context *ctx;
+	struct kmmio_fault_page *faultpage;
 
 	/*
 	 * Preemption is now disabled to prevent process switch during
@@ -186,40 +249,40 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	 * XXX what if an interrupt occurs between returning from
 	 * do_page_fault() and entering the single-step exception handler?
 	 * And that interrupt triggers a kmmio trap?
+	 * XXX If we are tracing an interrupt service routine or whatever, is
+	 * this enough to keep it on the current cpu?
 	 */
 	preempt_disable();
 
-	/* interrupts disabled and CPU-local data => atomicity guaranteed. */
+	rcu_read_lock();
+	faultpage = get_kmmio_fault_page(addr);
+	if (!faultpage) {
+		/*
+		 * Either this page fault is not caused by kmmio, or
+		 * another CPU just pulled the kmmio probe from under
+		 * our feet. In the latter case all hell breaks loose.
+		 */
+		goto no_kmmio;
+	}
+
+	ctx = &get_cpu_var(kmmio_ctx);
 	if (ctx->active) {
 		/*
-		 * This avoids a deadlock with kmmio_lock.
+		 * Prevent overwriting already in-flight context.
 		 * If this page fault really was due to kmmio trap,
 		 * all hell breaks loose.
 		 */
-		printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, "
-					"for address %lu. Ignoring.\n",
+		pr_emerg("kmmio: recursive probe hit on CPU %d, "
+					"for address 0x%08lx. Ignoring.\n",
 					smp_processor_id(), addr);
-		goto no_kmmio;
+		goto no_kmmio_ctx;
 	}
 	ctx->active++;
 
-	/*
-	 * Acquire the kmmio lock to prevent changes affecting
-	 * get_kmmio_fault_page() and get_kmmio_probe(), since we save their
-	 * returned pointers.
-	 * The lock is released in post_kmmio_handler().
-	 * XXX: could/should get_kmmio_*() be using RCU instead of spinlock?
-	 */
-	spin_lock(&kmmio_lock);
-
-	ctx->fpage = get_kmmio_fault_page(addr);
-	if (!ctx->fpage) {
-		/* this page fault is not caused by kmmio */
-		goto no_kmmio_locked;
-	}
-
+	ctx->fpage = faultpage;
 	ctx->probe = get_kmmio_probe(addr);
 	ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK));
+	ctx->addr = addr;
 
 	if (ctx->probe && ctx->probe->pre_handler)
 		ctx->probe->pre_handler(ctx->probe, regs, addr);
@@ -227,46 +290,62 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	regs->flags |= TF_MASK;
 	regs->flags &= ~IF_MASK;
 
-	/* We hold lock, now we set present bit in PTE and single step. */
+	/* Now we set present bit in PTE and single step. */
 	disarm_kmmio_fault_page(ctx->fpage->page, NULL);
 
 	put_cpu_var(kmmio_ctx);
+	rcu_read_unlock();
 	return 1;
 
-no_kmmio_locked:
-	spin_unlock(&kmmio_lock);
-	ctx->active--;
+no_kmmio_ctx:
+	put_cpu_var(kmmio_ctx);
 no_kmmio:
+	rcu_read_unlock();
 	preempt_enable_no_resched();
-	put_cpu_var(kmmio_ctx);
-	/* page fault not handled by kmmio */
-	return 0;
+	return 0; /* page fault not handled by kmmio */
 }
 
 /*
  * Interrupts are disabled on entry as trap1 is an interrupt gate
  * and they remain disabled thorough out this function.
- * And we hold kmmio lock.
+ * This must always get called as the pair to kmmio_handler().
  */
 static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 {
 	int ret = 0;
+	struct kmmio_probe *probe;
+	struct kmmio_fault_page *faultpage;
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
 	if (!ctx->active)
 		goto out;
 
+	rcu_read_lock();
+
+	faultpage = get_kmmio_fault_page(ctx->addr);
+	probe = get_kmmio_probe(ctx->addr);
+	if (faultpage != ctx->fpage || probe != ctx->probe) {
+		/*
+		 * The trace setup changed after kmmio_handler() and before
+		 * running this respective post handler. User does not want
+		 * the result anymore.
+		 */
+		ctx->probe = NULL;
+		ctx->fpage = NULL;
+	}
+
 	if (ctx->probe && ctx->probe->post_handler)
 		ctx->probe->post_handler(ctx->probe, condition, regs);
 
-	arm_kmmio_fault_page(ctx->fpage->page, NULL);
+	if (ctx->fpage)
+		arm_kmmio_fault_page(ctx->fpage->page, NULL);
 
 	regs->flags &= ~TF_MASK;
 	regs->flags |= ctx->saved_flags;
 
 	/* These were acquired in kmmio_handler(). */
 	ctx->active--;
-	spin_unlock(&kmmio_lock);
+	BUG_ON(ctx->active);
 	preempt_enable_no_resched();
 
 	/*
@@ -277,11 +356,13 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	if (!(regs->flags & TF_MASK))
 		ret = 1;
 
+	rcu_read_unlock();
 out:
 	put_cpu_var(kmmio_ctx);
 	return ret;
 }
 
+/* You must be holding kmmio_lock. */
 static int add_kmmio_fault_page(unsigned long page)
 {
 	struct kmmio_fault_page *f;
@@ -289,6 +370,8 @@ static int add_kmmio_fault_page(unsigned long page)
 	page &= PAGE_MASK;
 	f = get_kmmio_fault_page(page);
 	if (f) {
+		if (!f->count)
+			arm_kmmio_fault_page(f->page, NULL);
 		f->count++;
 		return 0;
 	}
@@ -299,15 +382,16 @@ static int add_kmmio_fault_page(unsigned long page)
 
 	f->count = 1;
 	f->page = page;
-	list_add(&f->list,
-		 &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]);
+	list_add_rcu(&f->list, kmmio_page_list(f->page));
 
 	arm_kmmio_fault_page(f->page, NULL);
 
 	return 0;
 }
 
-static void release_kmmio_fault_page(unsigned long page)
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long page,
+				struct kmmio_fault_page **release_list)
 {
 	struct kmmio_fault_page *f;
 
@@ -317,9 +401,11 @@ static void release_kmmio_fault_page(unsigned long page)
 		return;
 
 	f->count--;
+	BUG_ON(f->count < 0);
 	if (!f->count) {
 		disarm_kmmio_fault_page(f->page, NULL);
-		list_del(&f->list);
+		f->release_next = *release_list;
+		*release_list = f;
 	}
 }
 
@@ -334,68 +420,113 @@ int register_kmmio_probe(struct kmmio_probe *p)
 		ret = -EEXIST;
 		goto out;
 	}
-	list_add(&p->list, &kmmio_probes);
-	/*printk("adding fault pages...\n");*/
+	list_add_rcu(&p->list, &kmmio_probes);
 	while (size < p->len) {
 		if (add_kmmio_fault_page(p->addr + size))
-			printk(KERN_ERR "mmio: Unable to set page fault.\n");
+			pr_err("kmmio: Unable to set page fault.\n");
 		size += PAGE_SIZE;
 	}
-
-	if (!handler_registered) {
-		if (mmiotrace_register_pf(&kmmio_page_fault))
-			printk(KERN_ERR "mmiotrace: Cannot register page "
-					"fault handler.\n");
-		else
-			handler_registered++;
-	}
-
 out:
 	spin_unlock_irq(&kmmio_lock);
 	/*
 	 * XXX: What should I do here?
 	 * Here was a call to global_flush_tlb(), but it does not exist
-	 * anymore.
+	 * anymore. It seems it's not needed after all.
 	 */
 	return ret;
 }
+EXPORT_SYMBOL(register_kmmio_probe);
 
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr = container_of(
+						head,
+						struct kmmio_delayed_release,
+						rcu);
+	struct kmmio_fault_page *p = dr->release_list;
+	while (p) {
+		struct kmmio_fault_page *next = p->release_next;
+		BUG_ON(p->count);
+		kfree(p);
+		p = next;
+	}
+	kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr = container_of(
+						head,
+						struct kmmio_delayed_release,
+						rcu);
+	struct kmmio_fault_page *p = dr->release_list;
+	struct kmmio_fault_page **prevp = &dr->release_list;
+	unsigned long flags;
+	spin_lock_irqsave(&kmmio_lock, flags);
+	while (p) {
+		if (!p->count)
+			list_del_rcu(&p->list);
+		else
+			*prevp = p->release_next;
+		prevp = &p->release_next;
+		p = p->release_next;
+	}
+	spin_unlock_irqrestore(&kmmio_lock, flags);
+	/* This is the real RCU destroy call. */
+	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ *    Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ *    Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ *    Actually free the kmmio_fault_page structs, as with RCU.
+ */
 void unregister_kmmio_probe(struct kmmio_probe *p)
 {
 	unsigned long size = 0;
+	struct kmmio_fault_page *release_list = NULL;
+	struct kmmio_delayed_release *drelease;
 
 	spin_lock_irq(&kmmio_lock);
 	while (size < p->len) {
-		release_kmmio_fault_page(p->addr + size);
+		release_kmmio_fault_page(p->addr + size, &release_list);
 		size += PAGE_SIZE;
 	}
-	list_del(&p->list);
+	list_del_rcu(&p->list);
 	kmmio_count--;
 	spin_unlock_irq(&kmmio_lock);
-}
 
-/*
- * According to 2.6.20, mainly x86_64 arch:
- * This is being called from do_page_fault(), via the page fault notifier
- * chain. The chain is called for both user space faults and kernel space
- * faults (address >= TASK_SIZE64), except not on faults serviced by
- * vmalloc_fault().
- *
- * We may be in an interrupt or a critical section. Also prefecthing may
- * trigger a page fault. We may be in the middle of process switch.
- * The page fault hook functionality has put us inside RCU read lock.
- *
- * Local interrupts are disabled, so preemption cannot happen.
- * Do not enable interrupts, do not sleep, and watch out for other CPUs.
- */
-static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code,
-						unsigned long address)
-{
-	if (is_kmmio_active())
-		if (kmmio_handler(regs, address) == 1)
-			return -1;
-	return 0;
+	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+	if (!drelease) {
+		pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
+		return;
+	}
+	drelease->release_list = release_list;
+
+	/*
+	 * This is not really RCU here. We have just disarmed a set of
+	 * pages so that they cannot trigger page faults anymore. However,
+	 * we cannot remove the pages from kmmio_page_table,
+	 * because a probe hit might be in flight on another CPU. The
+	 * pages are collected into a list, and they will be removed from
+	 * kmmio_page_table when it is certain that no probe hit related to
+	 * these pages can be in flight. RCU grace period sounds like a
+	 * good choice.
+	 *
+	 * If we removed the pages too early, kmmio page fault handler might
+	 * not find the respective kmmio_fault_page and determine it's not
+	 * a kmmio fault, when it actually is. This would lead to madness.
+	 */
+	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
 }
+EXPORT_SYMBOL(unregister_kmmio_probe);
 
 static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
 								void *args)
diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h
deleted file mode 100644
index 85b7f68a3b8..00000000000
--- a/arch/x86/kernel/mmiotrace/kmmio.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef _LINUX_KMMIO_H
-#define _LINUX_KMMIO_H
-
-#include <linux/list.h>
-#include <linux/notifier.h>
-#include <linux/smp.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/version.h>
-#include <linux/kdebug.h>
-
-struct kmmio_probe;
-struct kmmio_fault_page;
-struct pt_regs;
-
-typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *,
-				struct pt_regs *, unsigned long addr);
-typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
-				unsigned long condition, struct pt_regs *);
-
-struct kmmio_probe {
-	struct list_head list;
-
-	/* start location of the probe point */
-	unsigned long addr;
-
-	/* length of the probe region */
-	unsigned long len;
-
-	 /* Called before addr is executed. */
-	kmmio_pre_handler_t pre_handler;
-
-	/* Called after addr is executed, unless... */
-	kmmio_post_handler_t post_handler;
-};
-
-struct kmmio_fault_page {
-	struct list_head list;
-
-	/* location of the fault page */
-	unsigned long page;
-
-	int count;
-};
-
-/* kmmio is active by some kmmio_probes? */
-static inline int is_kmmio_active(void)
-{
-	extern unsigned int kmmio_count;
-	return kmmio_count;
-}
-
-int init_kmmio(void);
-void cleanup_kmmio(void);
-int register_kmmio_probe(struct kmmio_probe *p);
-void unregister_kmmio_probe(struct kmmio_probe *p);
-
-#endif /* _LINUX_KMMIO_H */
diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c
index f9c609266d8..e1a508588f0 100644
--- a/arch/x86/kernel/mmiotrace/mmio-mod.c
+++ b/arch/x86/kernel/mmiotrace/mmio-mod.c
@@ -32,7 +32,6 @@
 #include <asm/atomic.h>
 #include <linux/percpu.h>
 
-#include "kmmio.h"
 #include "pf_in.h"
 
 /* This app's relay channel files will appear in /debug/mmio-trace */
@@ -129,18 +128,17 @@ static void print_pte(unsigned long address)
 	pte_t *pte = lookup_address(address, &level);
 
 	if (!pte) {
-		printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n",
-						__FUNCTION__, address);
+		pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n",
+							__func__, address);
 		return;
 	}
 
 	if (level == PG_LEVEL_2M) {
-		printk(KERN_EMERG MODULE_NAME ": 4MB pages are not "
-						"currently supported: %lx\n",
-						address);
+		pr_emerg(MODULE_NAME ": 4MB pages are not currently "
+						"supported: %lx\n", address);
 		BUG();
 	}
-	printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n",
+	pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n",
 					address, pte_val(*pte),
 					pte_val(*pte) & _PAGE_PRESENT);
 }
@@ -152,7 +150,7 @@ static void print_pte(unsigned long address)
 static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
 {
 	const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
-	printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, "
+	pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, "
 					"last fault for address: %lx\n",
 					addr, my_reason->addr);
 	print_pte(addr);
@@ -160,20 +158,17 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
 	print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip);
 	print_symbol(KERN_EMERG "last faulting EIP was at %s\n",
 							my_reason->ip);
-	printk(KERN_EMERG
-			"eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
+	pr_emerg("eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
 			regs->ax, regs->bx, regs->cx, regs->dx);
-	printk(KERN_EMERG
-			"esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+	pr_emerg("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
 			regs->si, regs->di, regs->bp, regs->sp);
 #else
 	print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip);
 	print_symbol(KERN_EMERG "last faulting RIP was at %s\n",
 							my_reason->ip);
-	printk(KERN_EMERG "rax: %016lx   rcx: %016lx   rdx: %016lx\n",
+	pr_emerg("rax: %016lx   rcx: %016lx   rdx: %016lx\n",
 					regs->ax, regs->cx, regs->dx);
-	printk(KERN_EMERG "rsi: %016lx   rdi: %016lx   "
-				"rbp: %016lx   rsp: %016lx\n",
+	pr_emerg("rsi: %016lx   rdi: %016lx   rbp: %016lx   rsp: %016lx\n",
 				regs->si, regs->di, regs->bp, regs->sp);
 #endif
 	put_cpu_var(pf_reason);
@@ -251,10 +246,15 @@ static void post(struct kmmio_probe *p, unsigned long condition,
 	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
 	struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace);
 
+	/*
+	 * XXX: This might not get called, if the probe is removed while
+	 * trace hit is on flight.
+	 */
+
 	/* this should always return the active_trace count to 0 */
 	my_reason->active_traces--;
 	if (my_reason->active_traces) {
-		printk(KERN_EMERG MODULE_NAME ": unexpected post handler");
+		pr_emerg(MODULE_NAME ": unexpected post handler");
 		BUG();
 	}
 
@@ -283,16 +283,15 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf,
 	atomic_t *drop = &per_cpu(dropped, cpu);
 	int count;
 	if (relay_buf_full(buf)) {
-		if (atomic_inc_return(drop) == 1) {
-			printk(KERN_ERR MODULE_NAME ": cpu %d buffer full!\n",
-									cpu);
-		}
+		if (atomic_inc_return(drop) == 1)
+			pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu);
 		return 0;
-	} else if ((count = atomic_read(drop))) {
-		printk(KERN_ERR MODULE_NAME
-					": cpu %d buffer no longer full, "
-					"missed %d events.\n",
-					cpu, count);
+	}
+	count = atomic_read(drop);
+	if (count) {
+		pr_err(MODULE_NAME ": cpu %d buffer no longer full, "
+						"missed %d events.\n",
+						cpu, count);
 		atomic_sub(count, drop);
 	}
 
@@ -407,8 +406,8 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
 	/* Don't trace the low PCI/ISA area, it's always mapped.. */
 	if (!ISA_trace && (offset < ISA_END_ADDRESS) &&
 					(offset + size > ISA_START_ADDRESS)) {
-		printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low "
-						"PCI/ISA area (0x%lx-0x%lx)\n",
+		pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area "
+						"(0x%lx-0x%lx)\n",
 						offset, offset + size);
 		return;
 	}
@@ -418,7 +417,7 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
 void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size)
 {
 	void __iomem *p = ioremap_cache(offset, size);
-	printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n",
+	pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n",
 							offset, size, p);
 	ioremap_trace_core(offset, size, p);
 	return p;
@@ -428,7 +427,7 @@ EXPORT_SYMBOL(ioremap_cache_trace);
 void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size)
 {
 	void __iomem *p = ioremap_nocache(offset, size);
-	printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n",
+	pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n",
 							offset, size, p);
 	ioremap_trace_core(offset, size, p);
 	return p;
@@ -455,7 +454,7 @@ void iounmap_trace(volatile void __iomem *addr)
 	};
 	struct remap_trace *trace;
 	struct remap_trace *tmp;
-	printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr);
+	pr_debug(MODULE_NAME ": Unmapping %p.\n", addr);
 	record_timestamp(&event.header);
 
 	spin_lock(&trace_list_lock);
@@ -481,7 +480,7 @@ static void clear_trace_list(void)
 
 	spin_lock(&trace_list_lock);
 	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
-		printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped "
+		pr_warning(MODULE_NAME ": purging non-iounmapped "
 					"trace @0x%08lx, size 0x%lx.\n",
 					trace->probe.addr, trace->probe.len);
 		if (!nommiotrace)
@@ -500,39 +499,37 @@ static int __init init(void)
 
 	dir = debugfs_create_dir(APP_DIR, NULL);
 	if (!dir) {
-		printk(KERN_ERR MODULE_NAME
-				": Couldn't create relay app directory.\n");
+		pr_err(MODULE_NAME ": Couldn't create relay app directory.\n");
 		return -ENOMEM;
 	}
 
 	chan = create_channel(subbuf_size, n_subbufs);
 	if (!chan) {
 		debugfs_remove(dir);
-		printk(KERN_ERR MODULE_NAME
-				": relay app channel creation failed\n");
+		pr_err(MODULE_NAME ": relay app channel creation failed\n");
 		return -ENOMEM;
 	}
 
-	init_kmmio();
+	reference_kmmio();
 
 	proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL);
 	if (proc_marker_file)
 		proc_marker_file->write_proc = write_marker;
 
-	printk(KERN_DEBUG MODULE_NAME ": loaded.\n");
+	pr_debug(MODULE_NAME ": loaded.\n");
 	if (nommiotrace)
-		printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n");
+		pr_info(MODULE_NAME ": MMIO tracing disabled.\n");
 	if (ISA_trace)
-		printk(KERN_WARNING MODULE_NAME
-				": Warning! low ISA range will be traced.\n");
+		pr_warning(MODULE_NAME ": Warning! low ISA range will be "
+								"traced.\n");
 	return 0;
 }
 
 static void __exit cleanup(void)
 {
-	printk(KERN_DEBUG MODULE_NAME ": unload...\n");
+	pr_debug(MODULE_NAME ": unload...\n");
 	clear_trace_list();
-	cleanup_kmmio();
+	unreference_kmmio();
 	remove_proc_entry(MARKER_FILE, NULL);
 	destroy_channel();
 	if (dir)
diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c
index 67ea520dde6..efa1911e20c 100644
--- a/arch/x86/kernel/mmiotrace/pf_in.c
+++ b/arch/x86/kernel/mmiotrace/pf_in.c
@@ -19,7 +19,7 @@
  *
  */
 
-/*  $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $
+/*  Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
  *  Copyright by Intel Crop., 2002
  *  Louis Zhuang (louis.zhuang@intel.com)
  *
diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c
index 40e66b0e648..5ecff578672 100644
--- a/arch/x86/kernel/mmiotrace/testmmiotrace.c
+++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c
@@ -41,8 +41,7 @@ static void do_test(void)
 {
 	void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000);
 	if (!p) {
-		printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, "
-							"aborting.\n");
+		pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
 		return;
 	}
 	do_write_test(p);
@@ -53,14 +52,14 @@ static void do_test(void)
 static int __init init(void)
 {
 	if (mmio_address == 0) {
-		printk(KERN_ERR MODULE_NAME ": you have to use the module "
-						"argument mmio_address.\n");
-		printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
+		pr_err(MODULE_NAME ": you have to use the module argument "
+							"mmio_address.\n");
+		pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
 				" YOU REALLY KNOW WHAT YOU ARE DOING!\n");
 		return -ENXIO;
 	}
 
-	printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
+	pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
 					"in PCI address space, and writing "
 					"rubbish in there.\n", mmio_address);
 	do_test();
@@ -69,7 +68,7 @@ static int __init init(void)
 
 static void __exit cleanup(void)
 {
-	printk(KERN_DEBUG MODULE_NAME ": unloaded.\n");
+	pr_debug(MODULE_NAME ": unloaded.\n");
 }
 
 module_init(init);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e9a086a1a9f..8c828a68d3b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -49,60 +50,14 @@
 #define PF_RSVD		(1<<3)
 #define PF_INSTR	(1<<4)
 
-#ifdef CONFIG_MMIOTRACE_HOOKS
-static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */
-static DEFINE_SPINLOCK(mmiotrace_handler_lock);
-
-int mmiotrace_register_pf(pf_handler_func new_pfh)
-{
-	int ret = 0;
-	unsigned long flags;
-	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
-	if (mmiotrace_pf_handler)
-		ret = -EBUSY;
-	else
-		mmiotrace_pf_handler = new_pfh;
-	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(mmiotrace_register_pf);
-
-/**
- * mmiotrace_unregister_pf:
- * The caller must ensure @old_pfh is not in use anymore before freeing it.
- * This function does not guarantee it. The handler function pointer is
- * protected by RCU, so you can do this by e.g. calling synchronize_rcu().
- */
-int mmiotrace_unregister_pf(pf_handler_func old_pfh)
-{
-	int ret = 0;
-	unsigned long flags;
-	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
-	if (mmiotrace_pf_handler != old_pfh)
-		ret = -EPERM;
-	else
-		mmiotrace_pf_handler = NULL;
-	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf);
-#endif /* CONFIG_MMIOTRACE_HOOKS */
-
-/* returns non-zero if do_page_fault() should return */
-static inline int call_mmiotrace(struct pt_regs *regs,
-					unsigned long error_code,
-					unsigned long address)
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
 #ifdef CONFIG_MMIOTRACE_HOOKS
-	int ret = 0;
-	rcu_read_lock();
-	if (mmiotrace_pf_handler)
-		ret = mmiotrace_pf_handler(regs, error_code, address);
-	rcu_read_unlock();
-	return ret;
-#else
-	return 0;
+	if (unlikely(is_kmmio_active()))
+		if (kmmio_handler(regs, addr) == 1)
+			return -1;
 #endif
+	return 0;
 }
 
 static inline int notify_page_fault(struct pt_regs *regs)
@@ -657,7 +612,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (notify_page_fault(regs))
 		return;
-	if (call_mmiotrace(regs, error_code, address))
+	if (unlikely(kmmio_fault(regs, address)))
 		return;
 
 	/*
diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h
index 7063281040d..96651bb59ba 100644
--- a/include/asm-x86/kdebug.h
+++ b/include/asm-x86/kdebug.h
@@ -35,11 +35,4 @@ extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
-typedef int (*pf_handler_func)(struct pt_regs *regs,
-				unsigned long error_code,
-				unsigned long address);
-
-extern int mmiotrace_register_pf(pf_handler_func new_pfh);
-extern int mmiotrace_unregister_pf(pf_handler_func old_pfh);
-
 #endif
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 6ec288f1fe2..d87a6cd8b68 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -3,6 +3,44 @@
 
 #include <asm/types.h>
 
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+
+struct kmmio_probe;
+struct pt_regs;
+
+typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *,
+				struct pt_regs *, unsigned long addr);
+typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
+				unsigned long condition, struct pt_regs *);
+
+struct kmmio_probe {
+	struct list_head list;
+	unsigned long addr; /* start location of the probe point */
+	unsigned long len; /* length of the probe region */
+	kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */
+	kmmio_post_handler_t post_handler; /* Called after addr is executed */
+};
+
+/* kmmio is active by some kmmio_probes? */
+static inline int is_kmmio_active(void)
+{
+	extern unsigned int kmmio_count;
+	return kmmio_count;
+}
+
+extern void reference_kmmio(void);
+extern void unreference_kmmio(void);
+extern int register_kmmio_probe(struct kmmio_probe *p);
+extern void unregister_kmmio_probe(struct kmmio_probe *p);
+
+/* Called from page fault handler. */
+extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
+
+#endif /* __KERNEL__ */
+
+
 /*
  * If you change anything here, you must bump MMIO_VERSION.
  * This is the relay data format for user space.
-- 
cgit v1.2.3-70-g09d2


From b29c701deacd5d24453127c37ed77ef851c53b8b Mon Sep 17 00:00:00 2001
From: Henry Nestler <henry.nestler@gmail.com>
Date: Mon, 12 May 2008 15:44:39 +0200
Subject: x86: fix endless page faults in mount_block_root for Linux 2.6

Page faults in the kernel address space between PAGE_OFFSET and
VMALLOC_START should not be handled as vmalloc faults.

Fix rare endless page faults inside mount_block_root when mounting the
root filesystem at boot time.

All 32-bit kernels up to 2.6.25 can fall into this hole.
I cannot reproduce this under a native Linux kernel. The 64-bit code
already has a fix for the problem, so I copied the same lines into the
32-bit part.

The recorded debug output is from coLinux kernel 2.6.22.18 (virtualisation):
http://www.henrynestler.com/colinux/testing/pfn-check-0.7.3/20080410-antinx/bug16-recursive-page-fault-endless.txt
The physical memory was trimmed down to 192MB to make the bug easier to
catch. With more memory the bug occurs more rarely.

Details of how every 32-bit x86 system can fail:

Start from "mount_block_root",
http://lxr.linux.no/linux/init/do_mounts.c#L297
There the variable "fs_names" gets one memory page of 4096 bytes.
The variable "p" walks through the existing file system types. The first
string is no problem.
But in the second iteration of the loop in mount_block_root, "p" no
longer points to the beginning of the page; the offset is, for example,
+9 if "reiserfs" is the first in the list.
It then calls do_mount_root, and lands in sys_mount.
Remember: the variable "type_page" now contains "fs_type+9" and no
longer covers a full page.
sys_mount copies 4096 bytes with the function "exact_copy_from_user()":
http://lxr.linux.no/linux/fs/namespace.c#L1540

Usually mapped pages exist after the buffer "fs_names+4096+9", so the
page fault handler is not called. No problem.

If, however, the page after "fs_names+4096" is not mapped, the page
fault handler is called from http://lxr.linux.no/linux/fs/namespace.c#L1320

do_page_fault gets the address 0xc03b4000.
It is a kernel address (address >= TASK_SIZE), but not from vmalloc! It
comes from "__getname()", alias "kmem_cache_alloc".
The "error_code" is 0. "vmalloc_fault" will be called:
http://lxr.linux.no/linux/arch/i386/mm/fault.c#L332

"vmalloc_fault" tried to find the physical page for a non-existent
virtual memory area. The macro "pte_present" in vmalloc_fault()
triggered another page fault, for 0xc0000ed0, at:
http://lxr.linux.no/linux/arch/i386/mm/fault.c#L282

No PTE exists for such a virtual address. The page fault handler was
trying to sync the physical page for the PTE lookup.

This called vmalloc_fault() again for address 0xc000000, which also
did not exist. The endless loop began...

In the normal case the CPU would just keep looping with interrupts
disabled. Under coLinux this was caught by a stack overflow inside
printk debugs.

Signed-off-by: Henry Nestler <henry.nestler@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/fault.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75..8bcb6f40ccb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -497,6 +497,11 @@ static int vmalloc_fault(unsigned long address)
 	unsigned long pgd_paddr;
 	pmd_t *pmd_k;
 	pte_t *pte_k;
+
+	/* Make sure we are in vmalloc area */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
 	/*
 	 * Synchronize this task's top level page-table
 	 * with the 'reference' page table.
-- 
cgit v1.2.3-70-g09d2


From f294a8ce211bed7bfaca19bef21376a86200c421 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Tue, 1 Jul 2008 15:38:13 +0200
Subject: x86: small unifications of address printing

'man 3 printf' tells me that %p should be printed as if by %#x, but
this is not true for the kernel, which does not use the '0x' prefix
for the %p conversion specifier.

A small cast to (void *) is also prettier than #ifdef/#else/#endif.

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb..0eb70d1dd1f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -396,11 +396,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		printk(KERN_CONT "NULL pointer dereference");
 	else
 		printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
-	printk(KERN_CONT " at %08lx\n", address);
-#else
-	printk(KERN_CONT " at %016lx\n", address);
-#endif
+	printk(KERN_CONT " at %p\n", (void *) address);
 	printk(KERN_ALERT "IP:");
 	printk_address(regs->ip, 1);
 	dump_pagetable(address);
@@ -800,14 +796,10 @@ bad_area_nosemaphore:
 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 		    printk_ratelimit()) {
 			printk(
-#ifdef CONFIG_X86_32
-			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
-			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-			tsk->comm, task_pid_nr(tsk), address, regs->ip,
-			regs->sp, error_code);
+			tsk->comm, task_pid_nr(tsk), address,
+			(void *) regs->ip, (void *) regs->sp, error_code);
 			print_vma_addr(" in ", regs->ip);
 			printk("\n");
 		}
-- 
cgit v1.2.3-70-g09d2


From 95c60b08c6af6db2165837139da10f593462d51c Mon Sep 17 00:00:00 2001
From: Gustavo Fernando Padovan <gustavo@las.ic.unicamp.br>
Date: Wed, 25 Jun 2008 04:03:19 -0300
Subject: x86: remove unnecessary #ifdef CONFIG_X86_32...#else

Remove the #ifdef conditional because this comparison is already done in
user_mode_vm().

Signed-off-by: Gustavo F. Padovan <gustavo@las.ic.unicamp.br>
Cc: akpm@osdl.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb..1e64795714c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -55,11 +55,7 @@ static inline int notify_page_fault(struct pt_regs *regs)
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
 	if (!user_mode_vm(regs)) {
-#else
-	if (!user_mode(regs)) {
-#endif
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
 			ret = 1;
-- 
cgit v1.2.3-70-g09d2


From 67350a5c4514c280665cdb45439d32a008a264ba Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:11 -0400
Subject: x86: simplify vmalloc_sync_all

vmalloc_sync_all() is only called from register_die_notifier and
alloc_vm_area.  Neither is on any performance-critical paths, so
vmalloc_sync_all() itself is not on any hot paths.

Given that the optimisations in vmalloc_sync_all add a fair amount of
code and complexity, and are fairly hard to evaluate for correctness,
it's better to just remove them to simplify the code rather than worry
about its absolute performance.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 77 ++++++++++++++++++-----------------------------------
 1 file changed, 26 insertions(+), 51 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 578b7681955..d0f5fce77d9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -903,14 +903,7 @@ LIST_HEAD(pgd_list);
 void vmalloc_sync_all(void)
 {
 #ifdef CONFIG_X86_32
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = TASK_SIZE;
+	unsigned long start = VMALLOC_START & PGDIR_MASK;
 	unsigned long address;
 
 	if (SHARED_KERNEL_PMD)
@@ -918,56 +911,38 @@ void vmalloc_sync_all(void)
 
 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			unsigned long flags;
-			struct page *page;
-
-			spin_lock_irqsave(&pgd_lock, flags);
-			list_for_each_entry(page, &pgd_list, lru) {
-				if (!vmalloc_sync_one(page_address(page),
-						      address))
-					break;
-			}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			if (!page)
-				set_bit(pgd_index(address), insync);
+		unsigned long flags;
+		struct page *page;
+
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			if (!vmalloc_sync_one(page_address(page),
+					      address))
+				break;
 		}
-		if (address == start && test_bit(pgd_index(address), insync))
-			start = address + PGDIR_SIZE;
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 #else /* CONFIG_X86_64 */
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = VMALLOC_START & PGDIR_MASK;
+	unsigned long start = VMALLOC_START & PGDIR_MASK;
 	unsigned long address;
 
 	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			const pgd_t *pgd_ref = pgd_offset_k(address);
-			unsigned long flags;
-			struct page *page;
-
-			if (pgd_none(*pgd_ref))
-				continue;
-			spin_lock_irqsave(&pgd_lock, flags);
-			list_for_each_entry(page, &pgd_list, lru) {
-				pgd_t *pgd;
-				pgd = (pgd_t *)page_address(page) + pgd_index(address);
-				if (pgd_none(*pgd))
-					set_pgd(pgd, *pgd_ref);
-				else
-					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-			}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			set_bit(pgd_index(address), insync);
+		const pgd_t *pgd_ref = pgd_offset_k(address);
+		unsigned long flags;
+		struct page *page;
+
+		if (pgd_none(*pgd_ref))
+			continue;
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			if (pgd_none(*pgd))
+				set_pgd(pgd, *pgd_ref);
+			else
+				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 		}
-		if (address == start)
-			start = address + PGDIR_SIZE;
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 #endif
 }
-- 
cgit v1.2.3-70-g09d2