From b3ac891b67bd4b1fc728d1c784cad1212dea433d Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Wed, 24 Feb 2010 10:54:22 +0100
Subject: x86: Add support for lock prefix in alternatives

The current lock prefix UP/SMP alternative code doesn't allow
LOCK_PREFIX to be used in alternatives code.

This patch solves the problem by adding a new LOCK_PREFIX_ALTERNATIVE_PATCH
macro that only records the lock prefix location but does not emit
the prefix.

The user of this macro can then start any alternative sequence with
"lock" and have it UP/SMP patched.

To make this work, the UP/SMP alternative code is changed to do the
lock/DS prefix switching only if the byte actually contains a lock or
DS prefix.

Thus, if an alternative without the "lock" is selected, it will now do
nothing instead of clobbering the code.

Changes in v2:
- Naming change
- Change label to not conflict with alternatives

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/alternative.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2589ea4c60c..80b222ea4cf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -244,7 +244,8 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 		if (*ptr > text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		if (**ptr == 0x3e)
+			text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -263,7 +264,8 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 		if (*ptr > text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		if (**ptr == 0xf0)
+			text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
-- 
cgit v1.2.3-70-g09d2


From ced918eb748ce30b3aace549fd17540e40ffdca0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 17 Feb 2010 16:47:10 +0000
Subject: i8253: Convert i8253_lock to raw_spinlock

i8253_lock needs to be a real spinlock in preempt-rt, i.e. it can
not be converted to a sleeping lock.

Convert it to raw_spinlock and fix up all users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <20100217163751.030764372@linutronix.de>
---
 arch/mips/include/asm/i8253.h     |  2 +-
 arch/mips/kernel/i8253.c          | 14 +++++++-------
 arch/x86/include/asm/i8253.h      |  2 +-
 arch/x86/kernel/apm_32.c          |  4 ++--
 arch/x86/kernel/i8253.c           | 14 +++++++-------
 drivers/block/hd.c                |  4 ++--
 drivers/input/gameport/gameport.c |  4 ++--
 drivers/input/joystick/analog.c   |  4 ++--
 drivers/input/misc/pcspkr.c       |  6 +++---
 sound/drivers/pcsp/pcsp.h         |  2 +-
 sound/drivers/pcsp/pcsp_input.c   |  4 ++--
 sound/drivers/pcsp/pcsp_lib.c     | 12 ++++++------
 12 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h
index 032ca73f181..48bb8237299 100644
--- a/arch/mips/include/asm/i8253.h
+++ b/arch/mips/include/asm/i8253.h
@@ -12,7 +12,7 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 extern void setup_pit_timer(void);
 
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index ed5c441615e..94794062a17 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -15,7 +15,7 @@
 #include <asm/io.h>
 #include <asm/time.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 /*
@@ -26,7 +26,7 @@ EXPORT_SYMBOL(i8253_lock);
 static void init_pit_timer(enum clock_event_mode mode,
 			   struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 
 	switch(mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -55,7 +55,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 		/* Nothing to do here */
 		break;
 	}
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
@@ -65,10 +65,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	outb_p(delta & 0xff , PIT_CH0);	/* LSB */
 	outb(delta >> 8 , PIT_CH0);	/* MSB */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 
 	return 0;
 }
@@ -137,7 +137,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	static int old_count;
 	static u32 old_jifs;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
@@ -183,7 +183,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	old_count = count;
 	old_jifs = jifs;
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	count = (LATCH - 1) - count;
 
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 1edbf89680f..fc1f579fb96 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,7 +6,7 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 extern struct clock_event_device *global_clock_event;
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0e..c4f9182ca3a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
 #ifdef INIT_TIMER_AFTER_SUSPEND
 	unsigned long flags;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to HZ */
 	outb_pit(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
 	udelay(10);
 	outb_pit(LATCH >> 8, PIT_CH0);	/* MSB */
 	udelay(10);
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
 }
 
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5..2dfd3159744 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
 #include <asm/hpet.h>
 #include <asm/smp.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 /*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
 static void init_pit_timer(enum clock_event_mode mode,
 			   struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 		/* Nothing to do here */
 		break;
 	}
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	outb_pit(delta & 0xff , PIT_CH0);	/* LSB */
 	outb_pit(delta >> 8 , PIT_CH0);		/* MSB */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 
 	return 0;
 }
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	int count;
 	u32 jifs;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	old_count = count;
 	old_jifs = jifs;
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	count = (LATCH - 1) - count;
 
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 5116c65c07c..b9868ad0278 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -165,12 +165,12 @@ unsigned long read_timer(void)
 	unsigned long t, flags;
 	int i;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	t = jiffies * 11932;
 	outb_p(0, 0x43);
 	i = inb_p(0x40);
 	i |= inb(0x40) << 8;
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 	return(t - i);
 }
 #endif
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index 7e18bcf05a6..46239e47a26 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -59,11 +59,11 @@ static unsigned int get_time_pit(void)
 	unsigned long flags;
 	unsigned int count;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	outb_p(0x00, 0x43);
 	count = inb_p(0x40);
 	count |= inb_p(0x40) << 8;
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	return count;
 }
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 1c0b529c06a..4afe0a3b488 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -146,11 +146,11 @@ static unsigned int get_time_pit(void)
         unsigned long flags;
         unsigned int count;
 
-        spin_lock_irqsave(&i8253_lock, flags);
+        raw_spin_lock_irqsave(&i8253_lock, flags);
         outb_p(0x00, 0x43);
         count = inb_p(0x40);
         count |= inb_p(0x40) << 8;
-        spin_unlock_irqrestore(&i8253_lock, flags);
+        raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
         return count;
 }
diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c
index ea4e1fd1265..f080dd31499 100644
--- a/drivers/input/misc/pcspkr.c
+++ b/drivers/input/misc/pcspkr.c
@@ -30,7 +30,7 @@ MODULE_ALIAS("platform:pcspkr");
 #include <asm/i8253.h>
 #else
 #include <asm/8253pit.h>
-static DEFINE_SPINLOCK(i8253_lock);
+static DEFINE_RAW_SPINLOCK(i8253_lock);
 #endif
 
 static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value)
@@ -50,7 +50,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c
 	if (value > 20 && value < 32767)
 		count = PIT_TICK_RATE / value;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 
 	if (count) {
 		/* set command for counter 2, 2 byte write */
@@ -65,7 +65,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c
 		outb(inb_p(0x61) & 0xFC, 0x61);
 	}
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	return 0;
 }
diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h
index 1e123077923..4ff6c8cc507 100644
--- a/sound/drivers/pcsp/pcsp.h
+++ b/sound/drivers/pcsp/pcsp.h
@@ -16,7 +16,7 @@
 #include <asm/i8253.h>
 #else
 #include <asm/8253pit.h>
-static DEFINE_SPINLOCK(i8253_lock);
+static DEFINE_RAW_SPINLOCK(i8253_lock);
 #endif
 
 #define PCSP_SOUND_VERSION 0x400	/* read 4.00 */
diff --git a/sound/drivers/pcsp/pcsp_input.c b/sound/drivers/pcsp/pcsp_input.c
index 0444cdeb4be..b5e2b54c260 100644
--- a/sound/drivers/pcsp/pcsp_input.c
+++ b/sound/drivers/pcsp/pcsp_input.c
@@ -21,7 +21,7 @@ static void pcspkr_do_sound(unsigned int count)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 
 	if (count) {
 		/* set command for counter 2, 2 byte write */
@@ -36,7 +36,7 @@ static void pcspkr_do_sound(unsigned int count)
 		outb(inb_p(0x61) & 0xFC, 0x61);
 	}
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 }
 
 void pcspkr_stop_sound(void)
diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c
index e1145ac6e90..f6a2e72b8cd 100644
--- a/sound/drivers/pcsp/pcsp_lib.c
+++ b/sound/drivers/pcsp/pcsp_lib.c
@@ -65,7 +65,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip)
 	timer_cnt = val * CUR_DIV() / 256;
 
 	if (timer_cnt && chip->enable) {
-		spin_lock_irqsave(&i8253_lock, flags);
+		raw_spin_lock_irqsave(&i8253_lock, flags);
 		if (!nforce_wa) {
 			outb_p(chip->val61, 0x61);
 			outb_p(timer_cnt, 0x42);
@@ -74,7 +74,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip)
 			outb(chip->val61 ^ 2, 0x61);
 			chip->thalf = 1;
 		}
-		spin_unlock_irqrestore(&i8253_lock, flags);
+		raw_spin_unlock_irqrestore(&i8253_lock, flags);
 	}
 
 	chip->ns_rem = PCSP_PERIOD_NS();
@@ -158,10 +158,10 @@ static int pcsp_start_playing(struct snd_pcsp *chip)
 		return -EIO;
 	}
 
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	chip->val61 = inb(0x61) | 0x03;
 	outb_p(0x92, 0x43);	/* binary, mode 1, LSB only, ch 2 */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 	atomic_set(&chip->timer_active, 1);
 	chip->thalf = 0;
 
@@ -178,11 +178,11 @@ static void pcsp_stop_playing(struct snd_pcsp *chip)
 		return;
 
 	atomic_set(&chip->timer_active, 0);
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	/* restore the timer */
 	outb_p(0xb6, 0x43);	/* binary, mode 3, LSB/MSB, ch 2 */
 	outb(chip->val61 & 0xFC, 0x61);
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 4abc14a733f9002c05623db755aaafdd27fa7a91 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 Jan 2010 14:52:23 +0100
Subject: iommu-api: Rename ->{un}map function pointers to ->{un}map_range

The new function pointer names match better with the
top-level functions of the iommu-api which are using them.
Main intention of this change is to make the ->{un}map
pointer names free for two new mapping functions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 4 ++--
 drivers/base/iommu.c        | 4 ++--
 drivers/pci/intel-iommu.c   | 4 ++--
 include/linux/iommu.h       | 8 ++++----
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index adb0ba02570..59cae7c4df5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2515,8 +2515,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
-	.map = amd_iommu_map_range,
-	.unmap = amd_iommu_unmap_range,
+	.map_range = amd_iommu_map_range,
+	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index 8ad4ffea692..f4c86c42929 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -83,14 +83,14 @@ EXPORT_SYMBOL_GPL(iommu_detach_device);
 int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
 		    phys_addr_t paddr, size_t size, int prot)
 {
-	return iommu_ops->map(domain, iova, paddr, size, prot);
+	return iommu_ops->map_range(domain, iova, paddr, size, prot);
 }
 EXPORT_SYMBOL_GPL(iommu_map_range);
 
 void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
 		      size_t size)
 {
-	iommu_ops->unmap(domain, iova, size);
+	iommu_ops->unmap_range(domain, iova, size);
 }
 EXPORT_SYMBOL_GPL(iommu_unmap_range);
 
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 417312528dd..a714e3db13c 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3714,8 +3714,8 @@ static struct iommu_ops intel_iommu_ops = {
 	.domain_destroy = intel_iommu_domain_destroy,
 	.attach_dev	= intel_iommu_attach_device,
 	.detach_dev	= intel_iommu_detach_device,
-	.map		= intel_iommu_map_range,
-	.unmap		= intel_iommu_unmap_range,
+	.map_range	= intel_iommu_map_range,
+	.unmap_range	= intel_iommu_unmap_range,
 	.iova_to_phys	= intel_iommu_iova_to_phys,
 	.domain_has_cap = intel_iommu_domain_has_cap,
 };
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3af4ffd591b..0f18f37a650 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -36,10 +36,10 @@ struct iommu_ops {
 	void (*domain_destroy)(struct iommu_domain *domain);
 	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
 	void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
-	int (*map)(struct iommu_domain *domain, unsigned long iova,
-		   phys_addr_t paddr, size_t size, int prot);
-	void (*unmap)(struct iommu_domain *domain, unsigned long iova,
-		      size_t size);
+	int (*map_range)(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot);
+	void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,
+			    size_t size);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    unsigned long iova);
 	int (*domain_has_cap)(struct iommu_domain *domain,
-- 
cgit v1.2.3-70-g09d2


From cbb9d729f3433c9c2660b01dc52e6deb89488886 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 15 Jan 2010 14:41:15 +0100
Subject: x86/amd-iommu: Make iommu_map_page and alloc_pte aware of page sizes

This patch changes the old map_size parameter of alloc_pte
to a page_size parameter which can be used more easily to
alloc a pte for intermediate page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h | 28 ++++++++++++++++++
 arch/x86/kernel/amd_iommu.c            | 53 +++++++++++++++++++++-------------
 2 files changed, 61 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ba19ad4c47d..5e8da56755d 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -172,6 +172,34 @@
 				(~((1ULL << (12 + ((lvl) * 9))) - 1)))
 #define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr))
 
+/*
+ * Returns the page table level to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_LEVEL(pagesize) \
+		((__ffs(pagesize) - 12) / 9)
+/*
+ * Returns the number of ptes to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_PTE_COUNT(pagesize) \
+		(1ULL << ((__ffs(pagesize) - 12) % 9))
+
+/*
+ * Aligns a given io-virtual address to a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_ALIGN(address, pagesize) \
+		((address) & ~((pagesize) - 1))
+/*
+ * Creates an IOMMU PTE for an address an a given pagesize
+ * The PTE has no permission bits set
+ * Pagesize is expected to be a power-of-two larger than 4096
+ */
+#define PAGE_SIZE_PTE(address, pagesize)		\
+		(((address) | ((pagesize) - 1)) &	\
+		 (~(pagesize >> 1)) & PM_ADDR_MASK)
+
 #define IOMMU_PTE_P  (1ULL << 0)
 #define IOMMU_PTE_TV (1ULL << 1)
 #define IOMMU_PTE_U  (1ULL << 59)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 59cae7c4df5..41700314f3e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -730,18 +730,22 @@ static bool increase_address_space(struct protection_domain *domain,
 
 static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address,
-		      int end_lvl,
+		      unsigned long page_size,
 		      u64 **pte_page,
 		      gfp_t gfp)
 {
+	int level, end_lvl;
 	u64 *pte, *page;
-	int level;
+
+	BUG_ON(!is_power_of_2(page_size));
 
 	while (address > PM_LEVEL_SIZE(domain->mode))
 		increase_address_space(domain, gfp);
 
-	level =  domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	level   = domain->mode - 1;
+	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	address = PAGE_SIZE_ALIGN(address, page_size);
+	end_lvl = PAGE_SIZE_LEVEL(page_size);
 
 	while (level > end_lvl) {
 		if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -751,6 +755,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
 			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
 		}
 
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
 		pte = IOMMU_PTE_PAGE(*pte);
@@ -806,31 +814,36 @@ static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long bus_addr,
 			  unsigned long phys_addr,
 			  int prot,
-			  int map_size)
+			  unsigned long page_size)
 {
 	u64 __pte, *pte;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-
-	BUG_ON(!PM_ALIGNED(map_size, bus_addr));
-	BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+	int i, count;
 
 	if (!(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+	bus_addr  = PAGE_ALIGN(bus_addr);
+	phys_addr = PAGE_ALIGN(phys_addr);
+	count     = PAGE_SIZE_PTE_COUNT(page_size);
+	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+	for (i = 0; i < count; ++i)
+		if (IOMMU_PTE_PRESENT(pte[i]))
+			return -EBUSY;
 
-	if (IOMMU_PTE_PRESENT(*pte))
-		return -EBUSY;
+	if (page_size > PAGE_SIZE) {
+		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
+		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+	} else
+		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
-	__pte = phys_addr | IOMMU_PTE_P;
 	if (prot & IOMMU_PROT_IR)
 		__pte |= IOMMU_PTE_IR;
 	if (prot & IOMMU_PROT_IW)
 		__pte |= IOMMU_PTE_IW;
 
-	*pte = __pte;
+	for (i = 0; i < count; ++i)
+		pte[i] = __pte;
 
 	update_domain(dom);
 
@@ -877,7 +890,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	for (addr = e->address_start; addr < e->address_end;
 	     addr += PAGE_SIZE) {
 		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PM_MAP_4k);
+				     PAGE_SIZE);
 		if (ret)
 			return ret;
 		/*
@@ -1005,7 +1018,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 		u64 *pte, *pte_page;
 
 		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
 					&pte_page, gfp);
 			if (!pte)
 				goto out_free;
@@ -1711,7 +1724,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 
 	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
 				GFP_ATOMIC);
 		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
 	} else
@@ -2457,7 +2470,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 	paddr &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
+		ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
 		if (ret)
 			return ret;
 
-- 
cgit v1.2.3-70-g09d2


From 24cd772315c19e4d9409d0d21367ec1ebab3149f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 Jan 2010 17:27:39 +0100
Subject: x86/amd-iommu: Make iommu_unmap_page and fetch_pte aware of page
 sizes

This patch extends the functionality of iommu_unmap_page
and fetch_pte to support arbitrary page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h |  6 +++
 arch/x86/kernel/amd_iommu.c            | 90 +++++++++++++++++++++++++++-------
 2 files changed, 78 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 5e8da56755d..b150c74e0d4 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -200,6 +200,12 @@
 		(((address) | ((pagesize) - 1)) &	\
 		 (~(pagesize >> 1)) & PM_ADDR_MASK)
 
+/*
+ * Takes a PTE value with mode=0x07 and returns the page size it maps
+ */
+#define PTE_PAGE_SIZE(pte) \
+	(1ULL << (1 + ffz(((pte) | 0xfffULL))))
+
 #define IOMMU_PTE_P  (1ULL << 0)
 #define IOMMU_PTE_TV (1ULL << 1)
 #define IOMMU_PTE_U  (1ULL << 59)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 41700314f3e..503d312f9d6 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -776,28 +776,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
  * This function checks if there is a PTE for a given dma address. If
  * there is one, it returns the pointer to it.
  */
-static u64 *fetch_pte(struct protection_domain *domain,
-		      unsigned long address, int map_size)
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
 {
 	int level;
 	u64 *pte;
 
-	level =  domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	if (address > PM_LEVEL_SIZE(domain->mode))
+		return NULL;
+
+	level   =  domain->mode - 1;
+	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-	while (level > map_size) {
+	while (level > 0) {
+
+		/* Not Present */
 		if (!IOMMU_PTE_PRESENT(*pte))
 			return NULL;
 
+		/* Large PTE */
+		if (PM_PTE_LEVEL(*pte) == 0x07) {
+			unsigned long pte_mask, __pte;
+
+			/*
+			 * If we have a series of large PTEs, make
+			 * sure to return a pointer to the first one.
+			 */
+			pte_mask = PTE_PAGE_SIZE(*pte);
+			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+			__pte    = ((unsigned long)pte) & pte_mask;
+
+			return (u64 *)__pte;
+		}
+
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
+		/* Walk to the next level */
 		pte = IOMMU_PTE_PAGE(*pte);
 		pte = &pte[PM_LEVEL_INDEX(level, address)];
-
-		if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
-			pte = NULL;
-			break;
-		}
 	}
 
 	return pte;
@@ -850,13 +869,48 @@ static int iommu_map_page(struct protection_domain *dom,
 	return 0;
 }
 
-static void iommu_unmap_page(struct protection_domain *dom,
-			     unsigned long bus_addr, int map_size)
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+				      unsigned long bus_addr,
+				      unsigned long page_size)
 {
-	u64 *pte = fetch_pte(dom, bus_addr, map_size);
+	unsigned long long unmap_size, unmapped;
+	u64 *pte;
+
+	BUG_ON(!is_power_of_2(page_size));
+
+	unmapped = 0;
+
+	while (unmapped < page_size) {
+
+		pte = fetch_pte(dom, bus_addr);
+
+		if (!pte) {
+			/*
+			 * No PTE for this address
+			 * move forward in 4kb steps
+			 */
+			unmap_size = PAGE_SIZE;
+		} else if (PM_PTE_LEVEL(*pte) == 0) {
+			/* 4kb PTE found for this address */
+			unmap_size = PAGE_SIZE;
+			*pte       = 0ULL;
+		} else {
+			int count, i;
+
+			/* Large PTE found which maps this address */
+			unmap_size = PTE_PAGE_SIZE(*pte);
+			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
+			for (i = 0; i < count; i++)
+				pte[i] = 0ULL;
+		}
+
+		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+		unmapped += unmap_size;
+	}
+
+	BUG_ON(!is_power_of_2(unmapped));
 
-	if (pte)
-		*pte = 0;
+	return unmapped;
 }
 
 /*
@@ -1054,7 +1108,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 	for (i = dma_dom->aperture[index]->offset;
 	     i < dma_dom->aperture_size;
 	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+		u64 *pte = fetch_pte(&dma_dom->domain, i);
 		if (!pte || !IOMMU_PTE_PRESENT(*pte))
 			continue;
 
@@ -2491,7 +2545,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 	iova  &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PM_MAP_4k);
+		iommu_unmap_page(domain, iova, PAGE_SIZE);
 		iova  += PAGE_SIZE;
 	}
 
@@ -2506,7 +2560,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 	phys_addr_t paddr;
 	u64 *pte;
 
-	pte = fetch_pte(domain, iova, PM_MAP_4k);
+	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
-- 
cgit v1.2.3-70-g09d2


From f03152bb7d0a74f409ad63ed36916444a7493d72 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 16:15:24 +0100
Subject: x86/amd-iommu: Make amd_iommu_iova_to_phys aware of multiple page
 sizes

This patch extends the amd_iommu_iova_to_phys() function to
handle different page sizes correctly. It doesn't use
fetch_pte() anymore because we don't know (or care about)
the page_size used for mapping the given iova.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 503d312f9d6..52e44af1570 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2556,17 +2556,22 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 					  unsigned long iova)
 {
 	struct protection_domain *domain = dom->priv;
-	unsigned long offset = iova & ~PAGE_MASK;
+	unsigned long offset_mask;
 	phys_addr_t paddr;
-	u64 *pte;
+	u64 *pte, __pte;
 
 	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
 
-	paddr  = *pte & IOMMU_PAGE_MASK;
-	paddr |= offset;
+	if (PM_PTE_LEVEL(*pte) == 0)
+		offset_mask = PAGE_SIZE - 1;
+	else
+		offset_mask = PTE_PAGE_SIZE(*pte) - 1;
+
+	__pte = *pte & PM_ADDR_MASK;
+	paddr = (__pte & ~offset_mask) | (iova & offset_mask);
 
 	return paddr;
 }
-- 
cgit v1.2.3-70-g09d2


From 468e2366cdb80cf8a691b8bc212260cfbdbd518e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 16:37:36 +0100
Subject: x86/amd-iommu: Implement ->{un}map callbacks for iommu-api

This patch implements the new callbacks for the IOMMU-API
with functions that can handle different page sizes in the
IOMMU page table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 52e44af1570..0e068c9ca5f 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2552,6 +2552,33 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 	iommu_flush_tlb_pde(domain);
 }
 
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+			 phys_addr_t paddr, int gfp_order, int iommu_prot)
+{
+	unsigned long page_size = 0x1000UL << gfp_order;
+	struct protection_domain *domain = dom->priv;
+	int prot = 0;
+
+	if (iommu_prot & IOMMU_READ)
+		prot |= IOMMU_PROT_IR;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= IOMMU_PROT_IW;
+
+	return iommu_map_page(domain, iova, paddr, prot, page_size);
+}
+
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+			   int gfp_order)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long page_size, unmap_size;
+
+	page_size  = 0x1000UL << gfp_order;
+	unmap_size = iommu_unmap_page(domain, iova, page_size);
+
+	return get_order(unmap_size);
+}
+
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 					  unsigned long iova)
 {
@@ -2587,6 +2614,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
+	.map = amd_iommu_map,
+	.unmap = amd_iommu_unmap,
 	.map_range = amd_iommu_map_range,
 	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
-- 
cgit v1.2.3-70-g09d2


From 12c7389abe5786349d3ea6da1961cf78d0c1c7cd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 11:50:28 +0100
Subject: iommu-api: Remove iommu_{un}map_range functions

These functions are not longer used and can be removed
savely. There functionality is now provided by the
iommu_{un}map functions which are also capable of multiple
page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 48 ---------------------------------------------
 drivers/base/iommu.c        | 26 ++----------------------
 include/linux/iommu.h       | 20 -------------------
 3 files changed, 2 insertions(+), 92 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0e068c9ca5f..d8da9988edd 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2506,52 +2506,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	return ret;
 }
 
-static int amd_iommu_map_range(struct iommu_domain *dom,
-			       unsigned long iova, phys_addr_t paddr,
-			       size_t size, int iommu_prot)
-{
-	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(paddr, size, PAGE_SIZE);
-	int prot = 0;
-	int ret;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= IOMMU_PROT_IR;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= IOMMU_PROT_IW;
-
-	iova  &= PAGE_MASK;
-	paddr &= PAGE_MASK;
-
-	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
-		if (ret)
-			return ret;
-
-		iova  += PAGE_SIZE;
-		paddr += PAGE_SIZE;
-	}
-
-	return 0;
-}
-
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
-				  unsigned long iova, size_t size)
-{
-
-	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(iova, size, PAGE_SIZE);
-
-	iova  &= PAGE_MASK;
-
-	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PAGE_SIZE);
-		iova  += PAGE_SIZE;
-	}
-
-	iommu_flush_tlb_pde(domain);
-}
-
 static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
 			 phys_addr_t paddr, int gfp_order, int iommu_prot)
 {
@@ -2616,8 +2570,6 @@ static struct iommu_ops amd_iommu_ops = {
 	.detach_dev = amd_iommu_detach_device,
 	.map = amd_iommu_map,
 	.unmap = amd_iommu_unmap,
-	.map_range = amd_iommu_map_range,
-	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index 55d37e4609e..6e6b6a11b3c 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -80,20 +80,6 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_detach_device);
 
-int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
-		    phys_addr_t paddr, size_t size, int prot)
-{
-	return iommu_ops->map_range(domain, iova, paddr, size, prot);
-}
-EXPORT_SYMBOL_GPL(iommu_map_range);
-
-void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
-		      size_t size)
-{
-	iommu_ops->unmap_range(domain, iova, size);
-}
-EXPORT_SYMBOL_GPL(iommu_unmap_range);
-
 phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
 			       unsigned long iova)
 {
@@ -119,10 +105,7 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 
 	BUG_ON((iova | paddr) & invalid_mask);
 
-	if (iommu_ops->map)
-		return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
-
-	return iommu_ops->map_range(domain, iova, paddr, size, prot);
+	return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
 }
 EXPORT_SYMBOL_GPL(iommu_map);
 
@@ -136,11 +119,6 @@ int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
 
 	BUG_ON(iova & invalid_mask);
 
-	if (iommu_ops->unmap)
-		return iommu_ops->unmap(domain, iova, gfp_order);
-
-	iommu_ops->unmap_range(domain, iova, size);
-
-	return gfp_order;
+	return iommu_ops->unmap(domain, iova, gfp_order);
 }
 EXPORT_SYMBOL_GPL(iommu_unmap);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5a7a3d888da..be22ad83689 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -40,10 +40,6 @@ struct iommu_ops {
 		   phys_addr_t paddr, int gfp_order, int prot);
 	int (*unmap)(struct iommu_domain *domain, unsigned long iova,
 		     int gfp_order);
-	int (*map_range)(struct iommu_domain *domain, unsigned long iova,
-			 phys_addr_t paddr, size_t size, int prot);
-	void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,
-			    size_t size);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    unsigned long iova);
 	int (*domain_has_cap)(struct iommu_domain *domain,
@@ -60,10 +56,6 @@ extern int iommu_attach_device(struct iommu_domain *domain,
 			       struct device *dev);
 extern void iommu_detach_device(struct iommu_domain *domain,
 				struct device *dev);
-extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
-			   phys_addr_t paddr, size_t size, int prot);
-extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
-			      size_t size);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
 		     phys_addr_t paddr, int gfp_order, int prot);
 extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
@@ -104,18 +96,6 @@ static inline void iommu_detach_device(struct iommu_domain *domain,
 {
 }
 
-static inline int iommu_map_range(struct iommu_domain *domain,
-				  unsigned long iova, phys_addr_t paddr,
-				  size_t size, int prot)
-{
-	return -ENODEV;
-}
-
-static inline void iommu_unmap_range(struct iommu_domain *domain,
-				     unsigned long iova, size_t size)
-{
-}
-
 static inline int iommu_map(struct iommu_domain *domain, unsigned long iova,
 			    phys_addr_t paddr, int gfp_order, int prot)
 {
-- 
cgit v1.2.3-70-g09d2


From ca037701a025334e724e5c61b3b1082940c8b981 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Mar 2010 19:52:12 +0100
Subject: perf, x86: Add PEBS infrastructure

This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.

This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.

With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.

This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).

It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c          | 223 +++++-------
 arch/x86/kernel/cpu/perf_event_intel.c    | 150 ++------
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 557 ++++++++++++++++++++++++++++++
 include/linux/perf_event.h                |   3 +-
 4 files changed, 671 insertions(+), 262 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_ds.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1d665a0b202..0c03d5c1671 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,45 +31,6 @@
 
 static u64 perf_event_mask __read_mostly;
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS	4
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE		24
-
-/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)
-
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)
-
-
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR			(1 << 6)
-#define X86_DEBUGCTL_BTS		(1 << 7)
-#define X86_DEBUGCTL_BTINT		(1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 struct event_constraint {
 	union {
 		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -88,17 +49,29 @@ struct amd_nb {
 };
 
 struct cpu_hw_events {
+	/*
+	 * Generic x86 PMC bits
+	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	int			enabled;
-	struct debug_store	*ds;
 
 	int			n_events;
 	int			n_added;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	struct debug_store	*ds;
+	u64			pebs_enabled;
+
+	/*
+	 * AMD specific bits
+	 */
 	struct amd_nb		*amd_nb;
 };
 
@@ -112,12 +85,24 @@ struct cpu_hw_events {
 #define EVENT_CONSTRAINT(c, n, m)	\
 	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
 
+/*
+ * Constraint on the Event code.
+ */
 #define INTEL_EVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
 
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ */
 #define FIXED_EVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
 
+/*
+ * Constraint on the Event code + UMask
+ */
+#define PEBS_EVENT_CONSTRAINT(c, n)	\
+	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
 #define EVENT_CONSTRAINT_END		\
 	EVENT_CONSTRAINT(0, 0, 0)
 
@@ -128,6 +113,9 @@ struct cpu_hw_events {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	/*
+	 * Generic x86 PMC bits
+	 */
 	const char	*name;
 	int		version;
 	int		(*handle_irq)(struct pt_regs *);
@@ -146,10 +134,6 @@ struct x86_pmu {
 	u64		event_mask;
 	int		apic;
 	u64		max_period;
-	u64		intel_ctrl;
-	void		(*enable_bts)(u64 config);
-	void		(*disable_bts)(void);
-
 	struct event_constraint *
 			(*get_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
@@ -162,6 +146,19 @@ struct x86_pmu {
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
 	void		(*cpu_dead)(int cpu);
+
+	/*
+	 * Intel Arch Perfmon v2+
+	 */
+	u64		intel_ctrl;
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	int		bts, pebs;
+	int		pebs_record_size;
+	void		(*drain_pebs)(struct pt_regs *regs);
+	struct event_constraint *pebs_constraints;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -293,110 +290,14 @@ static void release_pmc_hardware(void)
 #endif
 }
 
-static inline bool bts_available(void)
-{
-	return x86_pmu.enable_bts != NULL;
-}
-
-static void init_debug_store_on_cpu(int cpu)
-{
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-		     (u32)((u64)(unsigned long)ds),
-		     (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static void fini_debug_store_on_cpu(int cpu)
-{
-	if (!per_cpu(cpu_hw_events, cpu).ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_bts_hardware(void)
-{
-	int cpu;
-
-	if (!bts_available())
-		return;
-
-	get_online_cpus();
-
-	for_each_online_cpu(cpu)
-		fini_debug_store_on_cpu(cpu);
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-		if (!ds)
-			continue;
-
-		per_cpu(cpu_hw_events, cpu).ds = NULL;
-
-		kfree((void *)(unsigned long)ds->bts_buffer_base);
-		kfree(ds);
-	}
-
-	put_online_cpus();
-}
-
-static int reserve_bts_hardware(void)
-{
-	int cpu, err = 0;
-
-	if (!bts_available())
-		return 0;
-
-	get_online_cpus();
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds;
-		void *buffer;
-
-		err = -ENOMEM;
-		buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
-		if (unlikely(!buffer))
-			break;
-
-		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-		if (unlikely(!ds)) {
-			kfree(buffer);
-			break;
-		}
-
-		ds->bts_buffer_base = (u64)(unsigned long)buffer;
-		ds->bts_index = ds->bts_buffer_base;
-		ds->bts_absolute_maximum =
-			ds->bts_buffer_base + BTS_BUFFER_SIZE;
-		ds->bts_interrupt_threshold =
-			ds->bts_absolute_maximum - BTS_OVFL_TH;
-
-		per_cpu(cpu_hw_events, cpu).ds = ds;
-		err = 0;
-	}
-
-	if (err)
-		release_bts_hardware();
-	else {
-		for_each_online_cpu(cpu)
-			init_debug_store_on_cpu(cpu);
-	}
-
-	put_online_cpus();
-
-	return err;
-}
+static int reserve_ds_buffers(void);
+static void release_ds_buffers(void);
 
 static void hw_perf_event_destroy(struct perf_event *event)
 {
 	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
-		release_bts_hardware();
+		release_ds_buffers();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 }
@@ -459,7 +360,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 			if (!reserve_pmc_hardware())
 				err = -EBUSY;
 			else
-				err = reserve_bts_hardware();
+				err = reserve_ds_buffers();
 		}
 		if (!err)
 			atomic_inc(&active_events);
@@ -537,7 +438,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
 	    (hwc->sample_period == 1)) {
 		/* BTS is not supported by this architecture. */
-		if (!bts_available())
+		if (!x86_pmu.bts)
 			return -EOPNOTSUPP;
 
 		/* BTS is currently only allowed for user-mode. */
@@ -995,6 +896,7 @@ static void x86_pmu_unthrottle(struct perf_event *event)
 void perf_event_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+	u64 pebs;
 	struct cpu_hw_events *cpuc;
 	unsigned long flags;
 	int cpu, idx;
@@ -1012,12 +914,14 @@ void perf_event_print_debug(void)
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
 
 		pr_info("\n");
 		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
 	}
 	pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
@@ -1333,6 +1237,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
 
 static int __cpuinit
@@ -1464,6 +1369,32 @@ static const struct pmu pmu = {
 	.unthrottle	= x86_pmu_unthrottle,
 };
 
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+	struct cpu_hw_events *fake_cpuc;
+	struct event_constraint *c;
+	int ret = 0;
+
+	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+	if (!fake_cpuc)
+		return -ENOMEM;
+
+	c = x86_pmu.get_event_constraints(fake_cpuc, event);
+
+	if (!c || !c->weight)
+		ret = -ENOSPC;
+
+	if (x86_pmu.put_event_constraints)
+		x86_pmu.put_event_constraints(fake_cpuc, event);
+
+	kfree(fake_cpuc);
+
+	return ret;
+}
+
 /*
  * validate a single event group
  *
@@ -1529,6 +1460,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 
 		if (event->group_leader != event)
 			err = validate_group(event);
+		else
+			err = validate_event(event);
 
 		event->pmu = tmp;
 	}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 84bfde64a33..11446412e4c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -470,42 +470,6 @@ static u64 intel_pmu_raw_event(u64 hw_event)
 	return hw_event & CORE_EVNTSEL_MASK;
 }
 
-static void intel_pmu_enable_bts(u64 config)
-{
-	unsigned long debugctlmsr;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr |= X86_DEBUGCTL_TR;
-	debugctlmsr |= X86_DEBUGCTL_BTS;
-	debugctlmsr |= X86_DEBUGCTL_BTINT;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	unsigned long debugctlmsr;
-
-	if (!cpuc->ds)
-		return;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr &=
-		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
-	update_debugctlmsr(debugctlmsr);
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,6 +478,8 @@ static void intel_pmu_disable_all(void)
 
 	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
+
+	intel_pmu_pebs_disable_all();
 }
 
 static void intel_pmu_enable_all(void)
@@ -531,6 +497,8 @@ static void intel_pmu_enable_all(void)
 
 		intel_pmu_enable_bts(event->hw.config);
 	}
+
+	intel_pmu_pebs_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
@@ -547,8 +515,7 @@ static inline void intel_pmu_ack_status(u64 ack)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -560,68 +527,7 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void intel_pmu_drain_bts_buffer(void)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct debug_store *ds = cpuc->ds;
-	struct bts_record {
-		u64	from;
-		u64	to;
-		u64	flags;
-	};
-	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
-	struct bts_record *at, *top;
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	struct perf_sample_data data;
-	struct pt_regs regs;
-
-	if (!event)
-		return;
-
-	if (!ds)
-		return;
-
-	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-	top = (struct bts_record *)(unsigned long)ds->bts_index;
-
-	if (top <= at)
-		return;
-
-	ds->bts_index = ds->bts_buffer_base;
-
-	perf_sample_data_init(&data, 0);
-
-	data.period	= event->hw.last_period;
-	regs.ip		= 0;
-
-	/*
-	 * Prepare a generic sample, i.e. fill in the invariant fields.
-	 * We will overwrite the from and to address before we output
-	 * the sample.
-	 */
-	perf_prepare_sample(&header, &data, event, &regs);
-
-	if (perf_output_begin(&handle, event,
-			      header.size * (top - at), 1, 1))
-		return;
-
-	for (; at < top; at++) {
-		data.ip		= at->from;
-		data.addr	= at->to;
-
-		perf_output_sample(&handle, &header, &data, event);
-	}
-
-	perf_output_end(&handle);
-
-	/* There's new data available. */
-	event->hw.interrupts++;
-	event->pending_kill = POLL_IN;
-}
-
-static inline void
-intel_pmu_disable_event(struct perf_event *event)
+static void intel_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
@@ -637,10 +543,12 @@ intel_pmu_disable_event(struct perf_event *event)
 	}
 
 	x86_pmu_disable_event(event);
+
+	if (unlikely(event->attr.precise))
+		intel_pmu_pebs_disable(hwc);
 }
 
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -689,6 +597,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		return;
 	}
 
+	if (unlikely(event->attr.precise))
+		intel_pmu_pebs_enable(hwc);
+
 	__x86_pmu_enable_event(hwc);
 }
 
@@ -762,6 +673,13 @@ again:
 
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
+
+	/*
+	 * PEBS overflow sets bit 62 in the global status register
+	 */
+	if (__test_and_clear_bit(62, (unsigned long *)&status))
+		x86_pmu.drain_pebs(regs);
+
 	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_event *event = cpuc->events[bit];
 
@@ -791,22 +709,18 @@ done:
 	return 1;
 }
 
-static struct event_constraint bts_constraint =
-	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
-
 static struct event_constraint *
-intel_special_constraints(struct perf_event *event)
+intel_bts_constraints(struct perf_event *event)
 {
-	unsigned int hw_event;
-
-	hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned int hw_event, bts_event;
 
-	if (unlikely((hw_event ==
-		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
-		     (event->hw.sample_period == 1))) {
+	hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+	bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
 
+	if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
 		return &bts_constraint;
-	}
+
 	return NULL;
 }
 
@@ -815,7 +729,11 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 {
 	struct event_constraint *c;
 
-	c = intel_special_constraints(event);
+	c = intel_bts_constraints(event);
+	if (c)
+		return c;
+
+	c = intel_pebs_constraints(event);
 	if (c)
 		return c;
 
@@ -864,8 +782,6 @@ static __initconst struct x86_pmu intel_pmu = {
 	 * the generic event period:
 	 */
 	.max_period		= (1ULL << 31) - 1,
-	.enable_bts		= intel_pmu_enable_bts,
-	.disable_bts		= intel_pmu_disable_bts,
 	.get_event_constraints	= intel_get_event_constraints,
 
 	.cpu_starting		= init_debug_store_on_cpu,
@@ -915,6 +831,8 @@ static __init int intel_pmu_init(void)
 	if (version > 1)
 		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
 
+	intel_ds_init();
+
 	/*
 	 * Install the hw-cache-events table:
 	 */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 00000000000..0d994ef213b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,557 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE		24
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+	u32 flags, ip;
+	u32 ax, bc, cx, dx;
+	u32 si, di, bp, sp;
+};
+
+ */
+
+struct pebs_record_core {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+	u64 status, dla, dse, lat;
+};
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR			(1 << 6)
+#define X86_DEBUGCTL_BTS		(1 << 7)
+#define X86_DEBUGCTL_BTINT		(1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+static void init_debug_store_on_cpu(int cpu)
+{
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+	if (!ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+		     (u32)((u64)(unsigned long)ds),
+		     (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static void fini_debug_store_on_cpu(int cpu)
+{
+	if (!per_cpu(cpu_hw_events, cpu).ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_ds_buffers(void)
+{
+	int cpu;
+
+	if (!x86_pmu.bts && !x86_pmu.pebs)
+		return;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu)
+		fini_debug_store_on_cpu(cpu);
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+		if (!ds)
+			continue;
+
+		per_cpu(cpu_hw_events, cpu).ds = NULL;
+
+		kfree((void *)(unsigned long)ds->pebs_buffer_base);
+		kfree((void *)(unsigned long)ds->bts_buffer_base);
+		kfree(ds);
+	}
+
+	put_online_cpus();
+}
+
+static int reserve_ds_buffers(void)
+{
+	int cpu, err = 0;
+
+	if (!x86_pmu.bts && !x86_pmu.pebs)
+		return 0;
+
+	get_online_cpus();
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds;
+		void *buffer;
+		int max, thresh;
+
+		err = -ENOMEM;
+		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+		if (unlikely(!ds)) {
+			kfree(buffer);
+			break;
+		}
+		per_cpu(cpu_hw_events, cpu).ds = ds;
+
+		if (x86_pmu.bts) {
+			buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+			if (unlikely(!buffer))
+				break;
+
+			max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+			thresh = max / 16;
+
+			ds->bts_buffer_base = (u64)(unsigned long)buffer;
+			ds->bts_index = ds->bts_buffer_base;
+			ds->bts_absolute_maximum = ds->bts_buffer_base +
+				max * BTS_RECORD_SIZE;
+			ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+				thresh * BTS_RECORD_SIZE;
+		}
+
+		if (x86_pmu.pebs) {
+			buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
+			if (unlikely(!buffer))
+				break;
+
+			max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+
+			ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+			ds->pebs_index = ds->pebs_buffer_base;
+			ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+				max * x86_pmu.pebs_record_size;
+			/*
+			 * Always use single record PEBS
+			 */
+			ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+				x86_pmu.pebs_record_size;
+		}
+
+		err = 0;
+	}
+
+	if (err)
+		release_ds_buffers();
+	else {
+		for_each_online_cpu(cpu)
+			init_debug_store_on_cpu(cpu);
+	}
+
+	put_online_cpus();
+
+	return err;
+}
+
+/*
+ * BTS
+ */
+
+static struct event_constraint bts_constraint =
+	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+
+static void intel_pmu_enable_bts(u64 config)
+{
+	unsigned long debugctlmsr;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr |= X86_DEBUGCTL_TR;
+	debugctlmsr |= X86_DEBUGCTL_BTS;
+	debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long debugctlmsr;
+
+	if (!cpuc->ds)
+		return;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr &=
+		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_drain_bts_buffer(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct bts_record {
+		u64	from;
+		u64	to;
+		u64	flags;
+	};
+	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+	struct bts_record *at, *top;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+
+	if (!event)
+		return;
+
+	if (!ds)
+		return;
+
+	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+	top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+	if (top <= at)
+		return;
+
+	ds->bts_index = ds->bts_buffer_base;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+	regs.ip     = 0;
+
+	/*
+	 * Prepare a generic sample, i.e. fill in the invariant fields.
+	 * We will overwrite the from and to address before we output
+	 * the sample.
+	 */
+	perf_prepare_sample(&header, &data, event, &regs);
+
+	if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+		return;
+
+	for (; at < top; at++) {
+		data.ip		= at->from;
+		data.addr	= at->to;
+
+		perf_output_sample(&handle, &header, &data, event);
+	}
+
+	perf_output_end(&handle);
+
+	/* There's new data available. */
+	event->hw.interrupts++;
+	event->pending_kill = POLL_IN;
+}
+
+/*
+ * PEBS
+ */
+
+static struct event_constraint intel_core_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
+	PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
+	PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
+	PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
+	PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
+	PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint *
+intel_pebs_constraints(struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (!event->attr.precise)
+		return NULL;
+
+	if (x86_pmu.pebs_constraints) {
+		for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &emptyconstraint;
+}
+
+static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val = cpuc->pebs_enabled;
+
+	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+	val |= 1ULL << hwc->idx;
+	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+}
+
+static void intel_pmu_pebs_disable(struct hw_perf_event *hwc)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val = cpuc->pebs_enabled;
+
+	val &= ~(1ULL << hwc->idx);
+	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+
+	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+static void intel_pmu_pebs_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+static void intel_pmu_pebs_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static int intel_pmu_save_and_restart(struct perf_event *event);
+static void intel_pmu_disable_event(struct perf_event *event);
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+	struct pebs_record_core *at, *top;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+	int n;
+
+	if (!event || !ds || !x86_pmu.pebs)
+		return;
+
+	intel_pmu_pebs_disable_all();
+
+	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+	if (top <= at)
+		goto out;
+
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	if (!intel_pmu_save_and_restart(event))
+		goto out;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+
+	n = top - at;
+
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > 1);
+
+	/*
+	 * We use the interrupt regs as a base because the PEBS record
+	 * does not contain a full regs set, specifically it seems to
+	 * lack segment descriptors, which get used by things like
+	 * user_mode().
+	 *
+	 * In the simple case fix up only the IP and BP,SP regs, for
+	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+	 */
+	regs = *iregs;
+	regs.ip = at->ip;
+	regs.bp = at->bp;
+	regs.sp = at->sp;
+
+	if (perf_event_overflow(event, 1, &data, &regs))
+		intel_pmu_disable_event(event);
+
+out:
+	intel_pmu_pebs_enable_all();
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct pebs_record_nhm *at, *top;
+	struct perf_sample_data data;
+	struct perf_event *event = NULL;
+	struct pt_regs regs;
+	int bit, n;
+
+	if (!ds || !x86_pmu.pebs)
+		return;
+
+	intel_pmu_pebs_disable_all();
+
+	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+	if (top <= at)
+		goto out;
+
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	n = top - at;
+
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+
+	for ( ; at < top; at++) {
+		for_each_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+			if (!cpuc->events[bit]->attr.precise)
+				continue;
+
+			event = cpuc->events[bit];
+		}
+
+		if (!event)
+			continue;
+
+		if (!intel_pmu_save_and_restart(event))
+			continue;
+
+		perf_sample_data_init(&data, 0);
+		data.period = event->hw.last_period;
+
+		/*
+		 * See the comment in intel_pmu_drain_pebs_core()
+		 */
+		regs = *iregs;
+		regs.ip = at->ip;
+		regs.bp = at->bp;
+		regs.sp = at->sp;
+
+		if (perf_event_overflow(event, 1, &data, &regs))
+			intel_pmu_disable_event(event);
+	}
+out:
+	intel_pmu_pebs_enable_all();
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+static void intel_ds_init(void)
+{
+	/*
+	 * No support for 32bit formats
+	 */
+	if (!boot_cpu_has(X86_FEATURE_DTES64))
+		return;
+
+	x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
+	x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+	if (x86_pmu.pebs) {
+		int format = 0;
+
+		if (x86_pmu.version > 1) {
+			u64 capabilities;
+			/*
+			 * v2+ has a PEBS format field
+			 */
+			rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+			format = (capabilities >> 8) & 0xf;
+		}
+
+		switch (format) {
+		case 0:
+			printk(KERN_CONT "PEBS v0, ");
+			x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+			x86_pmu.pebs_constraints = intel_core_pebs_events;
+			break;
+
+		case 1:
+			printk(KERN_CONT "PEBS v1, ");
+			x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+			x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
+			break;
+
+		default:
+			printk(KERN_CONT "PEBS unknown format: %d, ", format);
+			x86_pmu.pebs = 0;
+			break;
+		}
+	}
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static int reseve_ds_buffers(void)
+{
+	return 0;
+}
+
+static void release_ds_buffers(void)
+{
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 80acbf3d5de..42307b50c78 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,8 +203,9 @@ struct perf_event_attr {
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
 				watermark      :  1, /* wakeup_watermark      */
+				precise        :  1, /* OoO invariant counter */
 
-				__reserved_1   : 49;
+				__reserved_1   : 48;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
-- 
cgit v1.2.3-70-g09d2


From caff2befffe899e63df5cc760b7ed01cfd902685 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Mar 2010 12:02:30 +0100
Subject: perf, x86: Implement simple LBR support

Implement simple suport Intel Last-Branch-Record, it supports all
hardware that implements FREEZE_LBRS_ON_PMI, but does not (yet) implement
the LBR config register.

The Intel LBR is a FIFO of From,To addresses describing the last few
branches the hardware took.

This patch does not add perf interface to the LBR, but merely provides an
interface for internal use.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.544191154@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c           |  18 +++
 arch/x86/kernel/cpu/perf_event_intel.c     |  13 ++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 228 +++++++++++++++++++++++++++++
 include/linux/perf_event.h                 |  11 ++
 4 files changed, 270 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_lbr.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0c03d5c1671..1badff6b6b2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -48,6 +48,8 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
+#define MAX_LBR_ENTRIES		16
+
 struct cpu_hw_events {
 	/*
 	 * Generic x86 PMC bits
@@ -69,6 +71,14 @@ struct cpu_hw_events {
 	struct debug_store	*ds;
 	u64			pebs_enabled;
 
+	/*
+	 * Intel LBR bits
+	 */
+	int				lbr_users;
+	void				*lbr_context;
+	struct perf_branch_stack	lbr_stack;
+	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
+
 	/*
 	 * AMD specific bits
 	 */
@@ -159,6 +169,13 @@ struct x86_pmu {
 	int		pebs_record_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
 	struct event_constraint *pebs_constraints;
+
+	/*
+	 * Intel LBR
+	 */
+	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+	int		lbr_nr;			   /* hardware stack size */
+	int		lbr_format;		   /* hardware format     */
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -1237,6 +1254,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_intel_lbr.c"
 #include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 11446412e4c..44f6ed42a93 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -480,6 +480,7 @@ static void intel_pmu_disable_all(void)
 		intel_pmu_disable_bts();
 
 	intel_pmu_pebs_disable_all();
+	intel_pmu_lbr_disable_all();
 }
 
 static void intel_pmu_enable_all(void)
@@ -499,6 +500,7 @@ static void intel_pmu_enable_all(void)
 	}
 
 	intel_pmu_pebs_enable_all();
+	intel_pmu_lbr_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
@@ -674,6 +676,8 @@ again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 
+	intel_pmu_lbr_read();
+
 	/*
 	 * PEBS overflow sets bit 62 in the global status register
 	 */
@@ -848,6 +852,8 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_core();
+
 		x86_pmu.event_constraints = intel_core2_event_constraints;
 		pr_cont("Core2 events, ");
 		break;
@@ -857,13 +863,18 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_nhm();
+
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
+
 	case 28: /* Atom */
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_atom();
+
 		x86_pmu.event_constraints = intel_gen_event_constraints;
 		pr_cont("Atom events, ");
 		break;
@@ -873,6 +884,8 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_nhm();
+
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
 		pr_cont("Westmere events, ");
 		break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 00000000000..ea3e99ed82c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,228 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+enum {
+	LBR_FORMAT_32		= 0x00,
+	LBR_FORMAT_LIP		= 0x01,
+	LBR_FORMAT_EIP		= 0x02,
+	LBR_FORMAT_EIP_FLAGS	= 0x03,
+};
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+#define X86_DEBUGCTL_LBR               		(1 << 0)
+#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI		(1 << 11)
+
+static void __intel_pmu_lbr_enable(void)
+{
+	u64 debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	debugctl |= (X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+	u64 debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	debugctl &= ~(X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++)
+		wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		wrmsrl(x86_pmu.lbr_from + i, 0);
+		wrmsrl(x86_pmu.lbr_to   + i, 0);
+	}
+}
+
+static void intel_pmu_lbr_reset(void)
+{
+	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+		intel_pmu_lbr_reset_32();
+	else
+		intel_pmu_lbr_reset_64();
+}
+
+static void intel_pmu_lbr_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	WARN_ON(cpuc->enabled);
+
+	/*
+	 * Reset the LBR stack if this is the first LBR user or
+	 * we changed task context so as to avoid data leaks.
+	 */
+
+	if (!cpuc->lbr_users ||
+	    (event->ctx->task && cpuc->lbr_context != event->ctx)) {
+		intel_pmu_lbr_reset();
+		cpuc->lbr_context = event->ctx;
+	}
+
+	cpuc->lbr_users++;
+}
+
+static void intel_pmu_lbr_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	cpuc->lbr_users--;
+
+	BUG_ON(cpuc->lbr_users < 0);
+	WARN_ON(cpuc->enabled);
+}
+
+static void intel_pmu_lbr_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->lbr_users)
+		__intel_pmu_lbr_enable();
+}
+
+static void intel_pmu_lbr_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->lbr_users)
+		__intel_pmu_lbr_disable();
+}
+
+static inline u64 intel_pmu_lbr_tos(void)
+{
+	u64 tos;
+
+	rdmsrl(x86_pmu.lbr_tos, tos);
+
+	return tos;
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+	unsigned long mask = x86_pmu.lbr_nr - 1;
+	u64 tos = intel_pmu_lbr_tos();
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+		unsigned long lbr_idx = (tos - i) & mask;
+		union {
+			struct {
+				u32 from;
+				u32 to;
+			};
+			u64     lbr;
+		} msr_lastbranch;
+
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+		cpuc->lbr_entries[i].from  = msr_lastbranch.from;
+		cpuc->lbr_entries[i].to    = msr_lastbranch.to;
+		cpuc->lbr_entries[i].flags = 0;
+	}
+	cpuc->lbr_stack.nr = i;
+}
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+	unsigned long mask = x86_pmu.lbr_nr - 1;
+	u64 tos = intel_pmu_lbr_tos();
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+		unsigned long lbr_idx = (tos - i) & mask;
+		u64 from, to, flags = 0;
+
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
+
+		if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) {
+			flags = !!(from & LBR_FROM_FLAG_MISPRED);
+			from = (u64)((((s64)from) << 1) >> 1);
+		}
+
+		cpuc->lbr_entries[i].from  = from;
+		cpuc->lbr_entries[i].to    = to;
+		cpuc->lbr_entries[i].flags = flags;
+	}
+	cpuc->lbr_stack.nr = i;
+}
+
+static void intel_pmu_lbr_read(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!cpuc->lbr_users)
+		return;
+
+	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+		intel_pmu_lbr_read_32(cpuc);
+	else
+		intel_pmu_lbr_read_64(cpuc);
+}
+
+static int intel_pmu_lbr_format(void)
+{
+	u64 capabilities;
+
+	rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+	return capabilities & 0x1f;
+}
+
+static void intel_pmu_lbr_init_core(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr     = 4;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x40;
+	x86_pmu.lbr_to     = 0x60;
+}
+
+static void intel_pmu_lbr_init_nhm(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr     = 16;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x680;
+	x86_pmu.lbr_to     = 0x6c0;
+}
+
+static void intel_pmu_lbr_init_atom(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr	   = 8;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x40;
+	x86_pmu.lbr_to     = 0x60;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 42307b50c78..ab4fd9ede26 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,6 +467,17 @@ struct perf_raw_record {
 	void				*data;
 };
 
+struct perf_branch_entry {
+	__u64				from;
+	__u64				to;
+	__u64				flags;
+};
+
+struct perf_branch_stack {
+	__u64				nr;
+	struct perf_branch_entry	entries[0];
+};
+
 struct task_struct;
 
 /**
-- 
cgit v1.2.3-70-g09d2


From ef21f683a045a79b6aa86ad81e5fdfc0d5ddd250 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Mar 2010 13:12:23 +0100
Subject: perf, x86: use LBR for PEBS IP+1 fixup

Use the LBR to fix up the PEBS IP+1 issue.

As said, PEBS reports the next instruction, here we use the LBR to find
the last branch and from that construct the actual IP. If the IP matches
the LBR-TO, we use LBR-FROM, otherwise we use the LBR-TO address as the
beginning of the last basic block and decode forward.

Once we find a match to the current IP, we use the previous location.

This patch introduces a new ABI element: PERF_RECORD_MISC_EXACT, which
conveys that the reported IP (PERF_SAMPLE_IP) is the exact instruction
that caused the event (barring CPU errata).

The fixup can fail due to various reasons:

 1) LBR contains invalid data (quite possible)
 2) part of the basic block got paged out
 3) the reported IP isn't part of the basic block (see 1)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.619375431@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h         | 19 +++++++
 arch/x86/kernel/cpu/perf_event.c          | 70 +++++++++++++-------------
 arch/x86/kernel/cpu/perf_event_intel.c    |  4 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 84 ++++++++++++++++++++++++++++++-
 include/linux/perf_event.h                |  6 +++
 5 files changed, 144 insertions(+), 39 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index db6109a885a..a9038c95161 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -136,6 +136,25 @@ extern void perf_events_lapic_init(void);
 
 #define PERF_EVENT_INDEX_OFFSET			0
 
+/*
+ * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
+ * This flag is otherwise unused and ABI specified to be 0, so nobody should
+ * care what we do with it.
+ */
+#define PERF_EFLAGS_EXACT	(1UL << 3)
+
+#define perf_misc_flags(regs)				\
+({	int misc = 0;					\
+	if (user_mode(regs))				\
+		misc |= PERF_RECORD_MISC_USER;		\
+	else						\
+		misc |= PERF_RECORD_MISC_KERNEL;	\
+	if (regs->flags & PERF_EFLAGS_EXACT)		\
+		misc |= PERF_RECORD_MISC_EXACT;		\
+	misc; })
+
+#define perf_instruction_pointer(regs)	((regs)->ip)
+
 #else
 static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1badff6b6b2..5cb4e8dcee4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -29,6 +29,41 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	int type = in_nmi() ? KM_NMI : KM_IRQ0;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
+	int ret;
+
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
+
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		map = kmap_atomic(page, type);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map, type);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+	} while (len < n);
+
+	return len;
+}
+
 static u64 perf_event_mask __read_mostly;
 
 struct event_constraint {
@@ -1550,41 +1585,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
 }
 
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-	unsigned long offset, addr = (unsigned long)from;
-	int type = in_nmi() ? KM_NMI : KM_IRQ0;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
-
-		map = kmap_atomic(page, type);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map, type);
-		put_page(page);
-
-		len  += size;
-		to   += size;
-		addr += size;
-
-	} while (len < n);
-
-	return len;
-}
-
 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 {
 	unsigned long bytes;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 44f6ed42a93..7eb78be3b22 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -547,7 +547,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	x86_pmu_disable_event(event);
 
 	if (unlikely(event->attr.precise))
-		intel_pmu_pebs_disable(hwc);
+		intel_pmu_pebs_disable(event);
 }
 
 static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
@@ -600,7 +600,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	}
 
 	if (unlikely(event->attr.precise))
-		intel_pmu_pebs_enable(hwc);
+		intel_pmu_pebs_enable(event);
 
 	__x86_pmu_enable_event(hwc);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 0d994ef213b..50e6ff3281f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -331,26 +331,32 @@ intel_pebs_constraints(struct perf_event *event)
 	return &emptyconstraint;
 }
 
-static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
+static void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val = cpuc->pebs_enabled;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	val |= 1ULL << hwc->idx;
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+
+	intel_pmu_lbr_enable(event);
 }
 
-static void intel_pmu_pebs_disable(struct hw_perf_event *hwc)
+static void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val = cpuc->pebs_enabled;
 
 	val &= ~(1ULL << hwc->idx);
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+	intel_pmu_lbr_disable(event);
 }
 
 static void intel_pmu_pebs_enable_all(void)
@@ -369,6 +375,70 @@ static void intel_pmu_pebs_disable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
+#include <asm/insn.h>
+
+#define MAX_INSN_SIZE	16
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+	return ip > PAGE_OFFSET;
+#else
+	return (long)ip < 0;
+#endif
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long from = cpuc->lbr_entries[0].from;
+	unsigned long old_to, to = cpuc->lbr_entries[0].to;
+	unsigned long ip = regs->ip;
+
+	if (!cpuc->lbr_stack.nr || !from || !to)
+		return 0;
+
+	if (ip < to)
+		return 0;
+
+	/*
+	 * We sampled a branch insn, rewind using the LBR stack
+	 */
+	if (ip == to) {
+		regs->ip = from;
+		return 1;
+	}
+
+	do {
+		struct insn insn;
+		u8 buf[MAX_INSN_SIZE];
+		void *kaddr;
+
+		old_to = to;
+		if (!kernel_ip(ip)) {
+			int bytes, size = min_t(int, MAX_INSN_SIZE, ip - to);
+
+			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+			if (bytes != size)
+				return 0;
+
+			kaddr = buf;
+		} else
+			kaddr = (void *)to;
+
+		kernel_insn_init(&insn, kaddr);
+		insn_get_length(&insn);
+		to += insn.length;
+	} while (to < ip);
+
+	if (to == ip) {
+		regs->ip = old_to;
+		return 1;
+	}
+
+	return 0;
+}
+
 static int intel_pmu_save_and_restart(struct perf_event *event);
 static void intel_pmu_disable_event(struct perf_event *event);
 
@@ -424,6 +494,11 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	regs.bp = at->bp;
 	regs.sp = at->sp;
 
+	if (intel_pmu_pebs_fixup_ip(&regs))
+		regs.flags |= PERF_EFLAGS_EXACT;
+	else
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+
 	if (perf_event_overflow(event, 1, &data, &regs))
 		intel_pmu_disable_event(event);
 
@@ -487,6 +562,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		regs.bp = at->bp;
 		regs.sp = at->sp;
 
+		if (intel_pmu_pebs_fixup_ip(&regs))
+			regs.flags |= PERF_EFLAGS_EXACT;
+		else
+			regs.flags &= ~PERF_EFLAGS_EXACT;
+
 		if (perf_event_overflow(event, 1, &data, &regs))
 			intel_pmu_disable_event(event);
 	}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ab4fd9ede26..be85f7c4a94 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -294,6 +294,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
 
+#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Reserve the last bit to indicate some extended misc field
+ */
+#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)
+
 struct perf_event_header {
 	__u32	type;
 	__u16	misc;
-- 
cgit v1.2.3-70-g09d2


From 8db909a7e3c888b5d45aef7650d74ccebe3ce725 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Mar 2010 17:07:40 +0100
Subject: perf, x86: Clean up IA32_PERF_CAPABILITIES usage

Saner PERF_CAPABILITIES support, which also exposes pebs_trap. Use that
latter to make PEBS's use of LBR conditional since a fault-like pebs
should already report the correct IP.

( As of this writing there is no known hardware that implements
  !pebs_trap )

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.770650663@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c           | 15 +++++++++++++--
 arch/x86/kernel/cpu/perf_event_intel.c     | 10 ++++++++++
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 30 +++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 18 ++++--------------
 4 files changed, 42 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5cb4e8dcee4..7b5430b2efe 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -154,6 +154,17 @@ struct cpu_hw_events {
 #define for_each_event_constraint(e, c)	\
 	for ((e) = (c); (e)->cmask; (e)++)
 
+union perf_capabilities {
+	struct {
+		u64	lbr_format    : 6;
+		u64	pebs_trap     : 1;
+		u64	pebs_arch_reg : 1;
+		u64	pebs_format   : 4;
+		u64	smm_freeze    : 1;
+	};
+	u64	capabilities;
+};
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -195,7 +206,8 @@ struct x86_pmu {
 	/*
 	 * Intel Arch Perfmon v2+
 	 */
-	u64		intel_ctrl;
+	u64			intel_ctrl;
+	union perf_capabilities intel_cap;
 
 	/*
 	 * Intel DebugStore bits
@@ -210,7 +222,6 @@ struct x86_pmu {
 	 */
 	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
 	int		lbr_nr;			   /* hardware stack size */
-	int		lbr_format;		   /* hardware format     */
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7eb78be3b22..246c0723882 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -835,6 +835,16 @@ static __init int intel_pmu_init(void)
 	if (version > 1)
 		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
 
+	/*
+	 * v2 and above have a perf capabilities MSR
+	 */
+	if (version > 1) {
+		u64 capabilities;
+
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+		x86_pmu.intel_cap.capabilities = capabilities;
+	}
+
 	intel_ds_init();
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 50e6ff3281f..5e4029441b2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -342,7 +342,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	val |= 1ULL << hwc->idx;
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
-	intel_pmu_lbr_enable(event);
+	if (x86_pmu.intel_cap.pebs_trap)
+		intel_pmu_lbr_enable(event);
 }
 
 static void intel_pmu_pebs_disable(struct perf_event *event)
@@ -356,7 +357,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-	intel_pmu_lbr_disable(event);
+	if (x86_pmu.intel_cap.pebs_trap)
+		intel_pmu_lbr_disable(event);
 }
 
 static void intel_pmu_pebs_enable_all(void)
@@ -395,6 +397,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 
+	/*
+	 * We don't need to fixup if the PEBS assist is fault like
+	 */
+	if (!x86_pmu.intel_cap.pebs_trap)
+		return 1;
+
 	if (!cpuc->lbr_stack.nr || !from || !to)
 		return 0;
 
@@ -589,34 +597,26 @@ static void intel_ds_init(void)
 	x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
 	x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
 	if (x86_pmu.pebs) {
-		int format = 0;
-
-		if (x86_pmu.version > 1) {
-			u64 capabilities;
-			/*
-			 * v2+ has a PEBS format field
-			 */
-			rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-			format = (capabilities >> 8) & 0xf;
-		}
+		char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+		int format = x86_pmu.intel_cap.pebs_format;
 
 		switch (format) {
 		case 0:
-			printk(KERN_CONT "PEBS v0, ");
+			printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
 			x86_pmu.pebs_constraints = intel_core_pebs_events;
 			break;
 
 		case 1:
-			printk(KERN_CONT "PEBS v1, ");
+			printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
 			x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
 			break;
 
 		default:
-			printk(KERN_CONT "PEBS unknown format: %d, ", format);
+			printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
 			x86_pmu.pebs = 0;
 			break;
 		}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index ea3e99ed82c..4f3a124329c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -53,7 +53,7 @@ static void intel_pmu_lbr_reset_64(void)
 
 static void intel_pmu_lbr_reset(void)
 {
-	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_reset_32();
 	else
 		intel_pmu_lbr_reset_64();
@@ -155,6 +155,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
+	int lbr_format = x86_pmu.intel_cap.lbr_format;
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
@@ -165,7 +166,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
-		if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) {
+		if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
 			flags = !!(from & LBR_FROM_FLAG_MISPRED);
 			from = (u64)((((s64)from) << 1) >> 1);
 		}
@@ -184,23 +185,14 @@ static void intel_pmu_lbr_read(void)
 	if (!cpuc->lbr_users)
 		return;
 
-	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_read_32(cpuc);
 	else
 		intel_pmu_lbr_read_64(cpuc);
 }
 
-static int intel_pmu_lbr_format(void)
-{
-	u64 capabilities;
-
-	rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-	return capabilities & 0x1f;
-}
-
 static void intel_pmu_lbr_init_core(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr     = 4;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x40;
@@ -209,7 +201,6 @@ static void intel_pmu_lbr_init_core(void)
 
 static void intel_pmu_lbr_init_nhm(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr     = 16;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x680;
@@ -218,7 +209,6 @@ static void intel_pmu_lbr_init_nhm(void)
 
 static void intel_pmu_lbr_init_atom(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr	   = 8;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x40;
-- 
cgit v1.2.3-70-g09d2


From 7e1a40dda619b0483fbe0740494ed2c2a1f05289 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 12:38:03 +0100
Subject: perf, x86: Expose the full PEBS record using PERF_SAMPLE_RAW

Expose the full PEBS record using PERF_SAMPLE_RAW

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.847218224@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5e4029441b2..ef56f053ab3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -457,6 +457,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
 	struct perf_sample_data data;
+	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int n;
 
@@ -479,6 +480,12 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.size = x86_pmu.pebs_record_size;
+		raw.data = at;
+		data.raw = &raw;
+	}
+
 	n = top - at;
 
 	/*
@@ -521,6 +528,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct pebs_record_nhm *at, *top;
 	struct perf_sample_data data;
 	struct perf_event *event = NULL;
+	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int bit, n;
 
@@ -562,6 +570,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
+		if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+			raw.size = x86_pmu.pebs_record_size;
+			raw.data = at;
+			data.raw = &raw;
+		}
+
 		/*
 		 * See the comment in intel_pmu_drain_pebs_core()
 		 */
-- 
cgit v1.2.3-70-g09d2


From 30a813ae035d3e220a89609adce878e045c49547 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 13:49:21 +0100
Subject: x86: Move MAX_INSN_SIZE into asm/insn.h

Since there's now two users for this, place it in a common header.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.923774125@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/insn.h               | 2 ++
 arch/x86/include/asm/kprobes.h            | 2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 --
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 96c2e0ad04c..88c765e1641 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -68,6 +68,8 @@ struct insn {
 	const insn_byte_t *next_byte;
 };
 
+#define MAX_INSN_SIZE	16
+
 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
 #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
 #define X86_MODRM_RM(modrm) ((modrm) & 0x07)
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4ffa345a8cc..54788253915 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -24,6 +24,7 @@
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
+#include <asm/insn.h>
 
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
 
@@ -36,7 +37,6 @@ typedef u8 kprobe_opcode_t;
 #define RELATIVEJUMP_SIZE 5
 #define RELATIVECALL_OPCODE 0xe8
 #define RELATIVE_ADDR_SIZE 4
-#define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
 #define MIN_STACK_SIZE(ADDR)					       \
 	(((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ef56f053ab3..72453ac5fb7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -379,8 +379,6 @@ static void intel_pmu_pebs_disable_all(void)
 
 #include <asm/insn.h>
 
-#define MAX_INSN_SIZE	16
-
 static inline bool kernel_ip(unsigned long ip)
 {
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3-70-g09d2


From 3adaebd69557615c1bf0365ce5e32d93ac7d82af Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Mar 2010 12:09:29 +0100
Subject: perf, x86: Fix silly bug in data store buffer allocation

Fix up the ds allocation error path, where we could free @buffer before
we used it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.813452402@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 72453ac5fb7..a67fff14475 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -127,10 +127,8 @@ static int reserve_ds_buffers(void)
 
 		err = -ENOMEM;
 		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-		if (unlikely(!ds)) {
-			kfree(buffer);
+		if (unlikely(!ds))
 			break;
-		}
 		per_cpu(cpu_hw_events, cpu).ds = ds;
 
 		if (x86_pmu.bts) {
-- 
cgit v1.2.3-70-g09d2


From 3c44780b220e876b01e39d4028cd6f4205fbf5d6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 21:49:01 +0100
Subject: perf, x86: Disable PEBS on clovertown chips

This CPU has just too many handycaps to be really useful.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.890278662@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  4 ++++
 arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7b5430b2efe..335ee1d38b7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -197,6 +197,7 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
+	void		(*quirks)(void);
 
 	void		(*cpu_prepare)(int cpu);
 	void		(*cpu_starting)(int cpu);
@@ -1373,6 +1374,9 @@ void __init init_hw_perf_events(void)
 
 	pr_cont("%s PMU driver.\n", x86_pmu.name);
 
+	if (x86_pmu.quirks)
+		x86_pmu.quirks();
+
 	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
 		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 246c0723882..224c952071f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -792,6 +792,32 @@ static __initconst struct x86_pmu intel_pmu = {
 	.cpu_dying		= fini_debug_store_on_cpu,
 };
 
+static void intel_clovertown_quirks(void)
+{
+	/*
+	 * PEBS is unreliable due to:
+	 *
+	 *   AJ67  - PEBS may experience CPL leaks
+	 *   AJ68  - PEBS PMI may be delayed by one event
+	 *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+	 *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+	 *
+	 * AJ67 could be worked around by restricting the OS/USR flags.
+	 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+	 *
+	 * AJ106 could possibly be worked around by not allowing LBR
+	 *       usage from PEBS, including the fixup.
+	 * AJ68  could possibly be worked around by always programming
+	 * 	 a pebs_event_reset[0] value and coping with the lost events.
+	 *
+	 * But taken together it might just make sense to not enable PEBS on
+	 * these chips.
+	 */
+	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+	x86_pmu.pebs = 0;
+	x86_pmu.pebs_constraints = NULL;
+}
+
 static __init int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
@@ -856,6 +882,7 @@ static __init int intel_pmu_init(void)
 		break;
 
 	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+		x86_pmu.quirks = intel_clovertown_quirks;
 	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
 	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
 	case 29: /* six-core 45 nm xeon "Dunnington" */
-- 
cgit v1.2.3-70-g09d2


From 74846d35b24b6efd61bb88a0a750b6bb257e6e78 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Mar 2010 13:49:35 +0100
Subject: perf, x86: Clear the LBRs on init

Some CPUs have errata where the LBR is not cleared on Power-On. So always
clear the LBRs before use.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.966563424@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c     | 18 ++++++++++++++++--
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  3 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 224c952071f..c135ed735b2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -767,6 +767,20 @@ static __initconst struct x86_pmu core_pmu = {
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static void intel_pmu_cpu_starting(int cpu)
+{
+	init_debug_store_on_cpu(cpu);
+	/*
+	 * Deal with CPUs that don't clear their LBRs on power-up.
+	 */
+	intel_pmu_lbr_reset();
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+	fini_debug_store_on_cpu(cpu);
+}
+
 static __initconst struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -788,8 +802,8 @@ static __initconst struct x86_pmu intel_pmu = {
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
 
-	.cpu_starting		= init_debug_store_on_cpu,
-	.cpu_dying		= fini_debug_store_on_cpu,
+	.cpu_starting		= intel_pmu_cpu_starting,
+	.cpu_dying		= intel_pmu_cpu_dying,
 };
 
 static void intel_clovertown_quirks(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 4f3a124329c..dcec765f818 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -53,6 +53,9 @@ static void intel_pmu_lbr_reset_64(void)
 
 static void intel_pmu_lbr_reset(void)
 {
+	if (!x86_pmu.lbr_nr)
+		return;
+
 	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_reset_32();
 	else
-- 
cgit v1.2.3-70-g09d2


From a562b1871f7f7d2f3a835c3c1e07fa58d473cfb7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Mar 2010 16:29:14 +0100
Subject: perf, x86: Robustify PEBS fixup

It turns out the LBR is massively unreliable on certain CPUs, so code the
fixup a little more defensive to avoid crashing the kernel.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154129.042271287@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a67fff14475..e7ac51770d4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -399,10 +399,23 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	if (!x86_pmu.intel_cap.pebs_trap)
 		return 1;
 
+	/*
+	 * No LBR entry, no basic block, no rewinding
+	 */
 	if (!cpuc->lbr_stack.nr || !from || !to)
 		return 0;
 
-	if (ip < to)
+	/*
+	 * Basic blocks should never cross user/kernel boundaries
+	 */
+	if (kernel_ip(ip) != kernel_ip(to))
+		return 0;
+
+	/*
+	 * unsigned math, either ip is before the start (impossible) or
+	 * the basic block is larger than 1 page (sanity)
+	 */
+	if ((ip - to) > PAGE_SIZE)
 		return 0;
 
 	/*
@@ -420,7 +433,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
 		old_to = to;
 		if (!kernel_ip(ip)) {
-			int bytes, size = min_t(int, MAX_INSN_SIZE, ip - to);
+			int bytes, size = MAX_INSN_SIZE;
 
 			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
 			if (bytes != size)
@@ -440,6 +453,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		return 1;
 	}
 
+	/*
+	 * Even though we decoded the basic block, the instruction stream
+	 * never matched the given IP, either the TO or the IP got corrupted.
+	 */
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From cc7f00820b2f3be656569c41158d9323e425bcfe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 17:51:33 +0100
Subject: perf, x86: Avoid double disable on throttle vs
 ioctl(PERF_IOC_DISABLE)

Calling ioctl(PERF_EVENT_IOC_DISABLE) on a thottled counter would result
in a double disable, cure this by using x86_pmu_{start,stop} for
throttle/unthrottle and teach x86_pmu_stop() to check ->active_mask.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index e7ac51770d4..a7401e4167d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,6 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 }
 
 static int intel_pmu_save_and_restart(struct perf_event *event);
-static void intel_pmu_disable_event(struct perf_event *event);
 
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 {
@@ -528,7 +527,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
 	if (perf_event_overflow(event, 1, &data, &regs))
-		intel_pmu_disable_event(event);
+		x86_pmu_stop(event);
 
 out:
 	intel_pmu_pebs_enable_all();
@@ -603,7 +602,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			regs.flags &= ~PERF_EFLAGS_EXACT;
 
 		if (perf_event_overflow(event, 1, &data, &regs))
-			intel_pmu_disable_event(event);
+			x86_pmu_stop(event);
 	}
 out:
 	intel_pmu_pebs_enable_all();
-- 
cgit v1.2.3-70-g09d2


From 8f4aebd2be9892bf8fb79a2d8576d3f3ee7f00f6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 13:26:11 +0100
Subject: perf, x86: Fix pebs drains

I overlooked the perf_disable()/perf_enable() calls in
intel_pmu_handle_irq(), (pointed out by Markus) so we should not
explicitly disable_all/enable_all pebs counters in the drain functions,
these are already disabled and enabling them early is confusing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a7401e4167d..66c6962f15f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -476,18 +476,16 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (!event || !ds || !x86_pmu.pebs)
 		return;
 
-	intel_pmu_pebs_disable_all();
-
 	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
 
 	if (top <= at)
-		goto out;
+		return;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
 	if (!intel_pmu_save_and_restart(event))
-		goto out;
+		return;
 
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
@@ -528,9 +526,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 
 	if (perf_event_overflow(event, 1, &data, &regs))
 		x86_pmu_stop(event);
-
-out:
-	intel_pmu_pebs_enable_all();
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@@ -547,13 +542,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	if (!ds || !x86_pmu.pebs)
 		return;
 
-	intel_pmu_pebs_disable_all();
-
 	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
 	if (top <= at)
-		goto out;
+		return;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
@@ -604,8 +597,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		if (perf_event_overflow(event, 1, &data, &regs))
 			x86_pmu_stop(event);
 	}
-out:
-	intel_pmu_pebs_enable_all();
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 4807e3d5dc7bb7057dd6ca3abb09f3da2eb8c323 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 13:47:07 +0100
Subject: perf, x86: Fix PEBS enable/disable vs cpuc->enabled

We should never call ->enable with the pmu enabled, and we _can_ have
->disable called with the pmu enabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 66c6962f15f..9ad0e67b9c8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -338,7 +338,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	val |= 1ULL << hwc->idx;
-	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap)
 		intel_pmu_lbr_enable(event);
@@ -351,7 +351,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 	u64 val = cpuc->pebs_enabled;
 
 	val &= ~(1ULL << hwc->idx);
-	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+	if (cpuc->enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-- 
cgit v1.2.3-70-g09d2


From 2df202bf7520eaffcbfb07e45dfa3cfb0aeee2c0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 13:48:54 +0100
Subject: perf, x86: Fix LBR enable/disable vs cpuc->enabled

We should never call ->enable with the pmu enabled, and we _can_ have
->disable called with the pmu enabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index dcec765f818..0145f99f7a3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -69,7 +69,7 @@ static void intel_pmu_lbr_enable(struct perf_event *event)
 	if (!x86_pmu.lbr_nr)
 		return;
 
-	WARN_ON(cpuc->enabled);
+	WARN_ON_ONCE(cpuc->enabled);
 
 	/*
 	 * Reset the LBR stack if this is the first LBR user or
@@ -93,9 +93,10 @@ static void intel_pmu_lbr_disable(struct perf_event *event)
 		return;
 
 	cpuc->lbr_users--;
-
 	BUG_ON(cpuc->lbr_users < 0);
-	WARN_ON(cpuc->enabled);
+
+	if (cpuc->enabled && !cpuc->lbr_users)
+		__intel_pmu_lbr_disable();
 }
 
 static void intel_pmu_lbr_enable_all(void)
-- 
cgit v1.2.3-70-g09d2


From d329527e47851f84b1e7944ed9601205f35f1b93 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 13:57:14 +0100
Subject: perf, x86: Reorder intel_pmu_enable_all()

The documentation says we have to enable PEBS before we enable the PMU
proper.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c135ed735b2..d3e2424069a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -487,6 +487,8 @@ static void intel_pmu_enable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+	intel_pmu_pebs_enable_all();
+	intel_pmu_lbr_enable_all();
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
 	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -498,9 +500,6 @@ static void intel_pmu_enable_all(void)
 
 		intel_pmu_enable_bts(event->hw.config);
 	}
-
-	intel_pmu_pebs_enable_all();
-	intel_pmu_lbr_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
-- 
cgit v1.2.3-70-g09d2


From 12ab854d744f04bfc5c6c4db723b7e31fc03eb29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 18:57:38 +0100
Subject: perf, x86: Deal with multiple state bits for pebs-fmt1

Its unclear if the PEBS state record will have only a single bit set, in
case it does not and accumulates bits, deal with that by only processing
each event once.

Also, robustify some of the code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 9ad0e67b9c8..b4680daecf1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -538,6 +538,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct perf_event *event = NULL;
 	struct perf_raw_record raw;
 	struct pt_regs regs;
+	u64 status = 0;
 	int bit, n;
 
 	if (!ds || !x86_pmu.pebs)
@@ -561,13 +562,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
 	for ( ; at < top; at++) {
 		for_each_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
-			if (!cpuc->events[bit]->attr.precise)
+			event = cpuc->events[bit];
+			if (!test_bit(bit, cpuc->active_mask))
 				continue;
 
-			event = cpuc->events[bit];
+			WARN_ON_ONCE(!event);
+
+			if (!event->attr.precise)
+				continue;
+
+			if (__test_and_set_bit(bit, (unsigned long *)&status))
+				continue;
+
+			break;
 		}
 
-		if (!event)
+		if (!event || bit >= MAX_PEBS_EVENTS)
 			continue;
 
 		if (!intel_pmu_save_and_restart(event))
-- 
cgit v1.2.3-70-g09d2


From ad0e6cfe2a2a61d7b5530188e571d508146cb43b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 19:49:06 +0100
Subject: perf, x86: Fix silly bug in intel_pmu_pebs_{enable,disable}

We need to use the actual cpuc->pebs_enabled value, not a local copy for
the changes to take effect.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b4680daecf1..242369488e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -333,11 +333,10 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 val = cpuc->pebs_enabled;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
-	val |= 1ULL << hwc->idx;
+	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap)
@@ -348,11 +347,10 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 val = cpuc->pebs_enabled;
 
-	val &= ~(1ULL << hwc->idx);
+	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
 	if (cpuc->enabled)
-		wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-- 
cgit v1.2.3-70-g09d2


From b83a46e7da4a948cc852ba7805dfb1a392dec861 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 13:51:12 +0100
Subject: perf, x86: Don't reset the LBR as frequently

If we reset the LBR on each first counter, simple counter rotation which
first deschedules all counters and then reschedules the new ones will
lead to LBR reset, even though we're still in the same task context.

Reduce this by not flushing on the first counter but only flushing on
different task contexts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 0145f99f7a3..f278136bf91 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -72,12 +72,11 @@ static void intel_pmu_lbr_enable(struct perf_event *event)
 	WARN_ON_ONCE(cpuc->enabled);
 
 	/*
-	 * Reset the LBR stack if this is the first LBR user or
-	 * we changed task context so as to avoid data leaks.
+	 * Reset the LBR stack if we changed task context to
+	 * avoid data leaks.
 	 */
 
-	if (!cpuc->lbr_users ||
-	    (event->ctx->task && cpuc->lbr_context != event->ctx)) {
+	if (event->ctx->task && cpuc->lbr_context != event->ctx) {
 		intel_pmu_lbr_reset();
 		cpuc->lbr_context = event->ctx;
 	}
@@ -93,7 +92,7 @@ static void intel_pmu_lbr_disable(struct perf_event *event)
 		return;
 
 	cpuc->lbr_users--;
-	BUG_ON(cpuc->lbr_users < 0);
+	WARN_ON_ONCE(cpuc->lbr_users < 0);
 
 	if (cpuc->enabled && !cpuc->lbr_users)
 		__intel_pmu_lbr_disable();
-- 
cgit v1.2.3-70-g09d2


From 7645a24cbd01cbf4865d1273d5ddaa8d8c2ccb3a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 13:51:31 +0100
Subject: perf, x86: Remove checking_{wr,rd}msr() usage

We don't need checking_{wr,rd}msr() calls, since we should know what cpu
we're running on and not use blindly poke at msrs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 24 ++++++++++++++++++------
 arch/x86/kernel/cpu/perf_event_intel.c |  5 ++---
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 335ee1d38b7..e24f6374f9f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -29,6 +29,17 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val) 					\
+do {								\
+	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
+			(unsigned long)(val));			\
+	native_write_msr((msr), (u32)((u64)(val)), 		\
+			(u32)((u64)(val) >> 32));		\
+} while (0)
+#endif
+
 /*
  * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
  */
@@ -821,14 +832,15 @@ void hw_perf_enable(void)
 
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
 {
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+	wrmsrl(hwc->config_base + hwc->idx,
 			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 static inline void x86_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+
+	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
 }
 
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -843,7 +855,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	s64 left = atomic64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
-	int err, ret = 0, idx = hwc->idx;
+	int ret = 0, idx = hwc->idx;
 
 	if (idx == X86_PMC_IDX_FIXED_BTS)
 		return 0;
@@ -881,8 +893,8 @@ x86_perf_event_set_period(struct perf_event *event)
 	 */
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
-	err = checking_wrmsrl(hwc->event_base + idx,
-			     (u64)(-left) & x86_pmu.event_mask);
+	wrmsrl(hwc->event_base + idx,
+			(u64)(-left) & x86_pmu.event_mask);
 
 	perf_event_update_userpage(event);
 
@@ -987,7 +999,7 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
 	}
-	pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
 	for (idx = 0; idx < x86_pmu.num_events; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index d3e2424069a..971dc6e7d54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -525,7 +525,7 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
-	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
+	wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void intel_pmu_disable_event(struct perf_event *event)
@@ -553,7 +553,6 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
-	int err;
 
 	/*
 	 * Enable IRQ generation (0x8),
@@ -578,7 +577,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
 	ctrl_val |= bits;
-	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+	wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void intel_pmu_enable_event(struct perf_event *event)
-- 
cgit v1.2.3-70-g09d2


From d80c7502ff63aa0d99d8c0c5803d28bbef67a74e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 9 Mar 2010 11:41:02 +0100
Subject: perf, x86: Fixup the PEBS handler for Core2 cpus

Pull the core handler in line with the nhm one, also make sure we always
drain the buffer.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 38 +++++++++++++++++++------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 242369488e7..1bfd59beb65 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -472,20 +472,39 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct pt_regs regs;
 	int n;
 
-	if (!event || !ds || !x86_pmu.pebs)
+	if (!ds || !x86_pmu.pebs)
 		return;
 
 	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
 
-	if (top <= at)
+	/*
+	 * Whatever else happens, drain the thing
+	 */
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	if (!test_bit(0, cpuc->active_mask))
 		return;
 
-	ds->pebs_index = ds->pebs_buffer_base;
+	WARN_ON_ONCE(!event);
+
+	if (!event->attr.precise)
+		return;
+
+	n = top - at;
+	if (n <= 0)
+		return;
 
 	if (!intel_pmu_save_and_restart(event))
 		return;
 
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > 1);
+	at += n - 1;
+
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
@@ -495,14 +514,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		data.raw = &raw;
 	}
 
-	n = top - at;
-
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ON_ONCE(n > 1);
-
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
@@ -545,12 +556,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
-	if (top <= at)
-		return;
-
 	ds->pebs_index = ds->pebs_buffer_base;
 
 	n = top - at;
+	if (n <= 0)
+		return;
 
 	/*
 	 * Should not happen, we program the threshold at 1 and do not
-- 
cgit v1.2.3-70-g09d2


From 63fb3f9b2312e131be5a0a2dddb63f2fb123db9b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 9 Mar 2010 11:51:02 +0100
Subject: perf, x86: Fix LBR read-out

Don't decrement the TOS twice...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index f278136bf91..df4c98e26c5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -129,7 +129,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		union {
 			struct {
@@ -162,7 +162,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		u64 from, to, flags = 0;
 
-- 
cgit v1.2.3-70-g09d2


From caa0142d84ceb0fc83e28f0475d0a7316cb6df77 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 13:58:12 +0200
Subject: perf, x86: Fix the !CONFIG_CPU_SUP_INTEL build

Fix typo. But the modularization here is ugly and should be improved.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 1bfd59beb65..c59678a14a2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -661,7 +661,7 @@ static void intel_ds_init(void)
 
 #else /* CONFIG_CPU_SUP_INTEL */
 
-static int reseve_ds_buffers(void)
+static int reserve_ds_buffers(void)
 {
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 938179b4f8cf8a4f11234ebf2dff2eb48400acfe Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 5 Mar 2010 11:42:03 -0600
Subject: x86: Improve Intel microcode loader performance

We've noticed that on large SGI UV system configurations,
running microcode.ctl can take very long periods of time.  This
is due to the large number of vmalloc/vfree calls made by the
Intel generic_load_microcode() logic.

By reusing allocated space, the following patch reduces the time
to run microcode.ctl on a 1024 cpu system from approximately 80
seconds down to 1 or 2 seconds.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Bill Davidsen <davidsen@tmr.com>
LKML-Reference: <20100305174203.GA19638@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_intel.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e2893..356170262a9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				int (*get_ucode_data)(void *, const void *, size_t))
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
 	enum ucode_state state = UCODE_OK;
+	unsigned int curr_mc_size = 0;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 			break;
 		}
 
-		mc = vmalloc(mc_size);
-		if (!mc)
-			break;
+		/* For performance reasons, reuse mc area when possible */
+		if (!mc || mc_size > curr_mc_size) {
+			if (mc)
+				vfree(mc);
+			mc = vmalloc(mc_size);
+			if (!mc)
+				break;
+			curr_mc_size = mc_size;
+		}
 
 		if (get_ucode_data(mc, ucode_ptr, mc_size) ||
 		    microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
-		} else
-			vfree(mc);
+			mc = NULL;	/* trigger new vmalloc */
+		}
 
 		ucode_ptr += mc_size;
 		leftover  -= mc_size;
 	}
 
+	if (mc)
+		vfree(mc);
+
 	if (leftover) {
 		if (new_mc)
 			vfree(new_mc);
-- 
cgit v1.2.3-70-g09d2


From 6f4edd69e40aba4f45bf9558c1e9a950d79ab4e4 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 10 Mar 2010 14:44:58 -0600
Subject: x86, UV: Clean up UV headers for MMR definitions

Update UV mmr definitions header file. Eliminate definitions no
longer needed. Move 2 definitions from tlb_uv.c into the header
file where they belong.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100310204458.GA28835@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_mmrs.h | 528 ++++++--------------------------------
 arch/x86/kernel/tlb_uv.c          |  10 +-
 2 files changed, 85 insertions(+), 453 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 2cae46c7c8a..b2f2d2e05ce 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -1,4 +1,3 @@
-
 /*
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -14,14 +13,26 @@
 
 #define UV_MMR_ENABLE		(1UL << 63)
 
+/* ========================================================================= */
+/*                          UVH_BAU_DATA_BROADCAST                           */
+/* ========================================================================= */
+#define UVH_BAU_DATA_BROADCAST 0x61688UL
+#define UVH_BAU_DATA_BROADCAST_32 0x0440
+
+#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
+#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+
+union uvh_bau_data_broadcast_u {
+    unsigned long	v;
+    struct uvh_bau_data_broadcast_s {
+	unsigned long	enable :  1;  /* RW */
+	unsigned long	rsvd_1_63: 63;  /*    */
+    } s;
+};
+
 /* ========================================================================= */
 /*                           UVH_BAU_DATA_CONFIG                             */
 /* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
 #define UVH_BAU_DATA_CONFIG 0x61680UL
 #define UVH_BAU_DATA_CONFIG_32 0x0438
 
@@ -603,6 +614,68 @@ union uvh_lb_bau_intd_software_acknowledge_u {
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
 
+/* ========================================================================= */
+/*                         UVH_LB_BAU_MISC_CONTROL                           */
+/* ========================================================================= */
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10
+
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+union uvh_lb_bau_misc_control_u {
+    unsigned long	v;
+    struct uvh_lb_bau_misc_control_s {
+	unsigned long	rejection_delay                    :  8;  /* RW */
+	unsigned long	apic_mode                          :  1;  /* RW */
+	unsigned long	force_broadcast                    :  1;  /* RW */
+	unsigned long	force_lock_nop                     :  1;  /* RW */
+	unsigned long	csi_agent_presence_vector          :  3;  /* RW */
+	unsigned long	descriptor_fetch_mode              :  1;  /* RW */
+	unsigned long	enable_intd_soft_ack_mode          :  1;  /* RW */
+	unsigned long	intd_soft_ack_timeout_period       :  4;  /* RW */
+	unsigned long	enable_dual_mapping_mode           :  1;  /* RW */
+	unsigned long	vga_io_port_decode_enable          :  1;  /* RW */
+	unsigned long	vga_io_port_16_bit_decode          :  1;  /* RW */
+	unsigned long	suppress_dest_registration         :  1;  /* RW */
+	unsigned long	programmed_initial_priority        :  3;  /* RW */
+	unsigned long	use_incoming_priority              :  1;  /* RW */
+	unsigned long	enable_programmed_initial_priority :  1;  /* RW */
+	unsigned long	rsvd_29_47                         : 19;  /*    */
+	unsigned long	fun                                : 16;  /* RW */
+    } s;
+};
+
 /* ========================================================================= */
 /*                     UVH_LB_BAU_SB_ACTIVATION_CONTROL                      */
 /* ========================================================================= */
@@ -680,334 +753,6 @@ union uvh_lb_bau_sb_descriptor_base_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                      UVH_LB_MCAST_AOERR0_RPT_ENABLE                       */
-/* ========================================================================= */
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
-
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
-
-union uvh_lb_mcast_aoerr0_rpt_enable_u {
-    unsigned long	v;
-    struct uvh_lb_mcast_aoerr0_rpt_enable_s {
-	unsigned long	mcast_obese_msg                         :  1;  /* RW */
-	unsigned long	mcast_data_sb_err                       :  1;  /* RW */
-	unsigned long	mcast_nack_buff_parity                  :  1;  /* RW */
-	unsigned long	mcast_timeout                           :  1;  /* RW */
-	unsigned long	mcast_inactive_reply                    :  1;  /* RW */
-	unsigned long	mcast_upgrade_error                     :  1;  /* RW */
-	unsigned long	mcast_reg_count_underflow               :  1;  /* RW */
-	unsigned long	mcast_rep_obese_msg                     :  1;  /* RW */
-	unsigned long	ucache_req_runt_msg                     :  1;  /* RW */
-	unsigned long	ucache_req_obese_msg                    :  1;  /* RW */
-	unsigned long	ucache_req_data_sb_err                  :  1;  /* RW */
-	unsigned long	ucache_rep_runt_msg                     :  1;  /* RW */
-	unsigned long	ucache_rep_obese_msg                    :  1;  /* RW */
-	unsigned long	ucache_rep_data_sb_err                  :  1;  /* RW */
-	unsigned long	ucache_rep_command_err                  :  1;  /* RW */
-	unsigned long	ucache_pend_timeout                     :  1;  /* RW */
-	unsigned long	macc_req_runt_msg                       :  1;  /* RW */
-	unsigned long	macc_req_obese_msg                      :  1;  /* RW */
-	unsigned long	macc_req_data_sb_err                    :  1;  /* RW */
-	unsigned long	macc_rep_runt_msg                       :  1;  /* RW */
-	unsigned long	macc_rep_obese_msg                      :  1;  /* RW */
-	unsigned long	macc_rep_data_sb_err                    :  1;  /* RW */
-	unsigned long	macc_amo_timeout                        :  1;  /* RW */
-	unsigned long	macc_put_timeout                        :  1;  /* RW */
-	unsigned long	macc_spurious_event                     :  1;  /* RW */
-	unsigned long	ioh_destination_table_parity            :  1;  /* RW */
-	unsigned long	get_had_error_reply                     :  1;  /* RW */
-	unsigned long	get_timeout                             :  1;  /* RW */
-	unsigned long	lock_manager_had_error_reply            :  1;  /* RW */
-	unsigned long	put_had_error_reply                     :  1;  /* RW */
-	unsigned long	put_timeout                             :  1;  /* RW */
-	unsigned long	sb_activation_overrun                   :  1;  /* RW */
-	unsigned long	completed_gb_activation_had_error_reply :  1;  /* RW */
-	unsigned long	completed_gb_activation_timeout         :  1;  /* RW */
-	unsigned long	descriptor_buffer_0_parity              :  1;  /* RW */
-	unsigned long	descriptor_buffer_1_parity              :  1;  /* RW */
-	unsigned long	socket_destination_table_parity         :  1;  /* RW */
-	unsigned long	bau_reply_payload_corruption            :  1;  /* RW */
-	unsigned long	io_port_destination_table_parity        :  1;  /* RW */
-	unsigned long	intd_soft_ack_timeout                   :  1;  /* RW */
-	unsigned long	int_rep_obese_msg                       :  1;  /* RW */
-	unsigned long	int_rep_command_err                     :  1;  /* RW */
-	unsigned long	int_timeout                             :  1;  /* RW */
-	unsigned long	rsvd_43_63                              : 21;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_LOCAL_INT0_CONFIG                            */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_CONFIG 0x61000UL
-
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
-#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
-#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
-#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
-#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_local_int0_config_u {
-    unsigned long	v;
-    struct uvh_local_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_LOCAL_INT0_ENABLE                            */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_ENABLE 0x65000UL
-
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-
-union uvh_local_int0_enable_u {
-    unsigned long	v;
-    struct uvh_local_int0_enable_s {
-	unsigned long	lb_hcerr            :  1;  /* RW */
-	unsigned long	gr0_hcerr           :  1;  /* RW */
-	unsigned long	gr1_hcerr           :  1;  /* RW */
-	unsigned long	lh_hcerr            :  1;  /* RW */
-	unsigned long	rh_hcerr            :  1;  /* RW */
-	unsigned long	xn_hcerr            :  1;  /* RW */
-	unsigned long	si_hcerr            :  1;  /* RW */
-	unsigned long	lb_aoerr0           :  1;  /* RW */
-	unsigned long	gr0_aoerr0          :  1;  /* RW */
-	unsigned long	gr1_aoerr0          :  1;  /* RW */
-	unsigned long	lh_aoerr0           :  1;  /* RW */
-	unsigned long	rh_aoerr0           :  1;  /* RW */
-	unsigned long	xn_aoerr0           :  1;  /* RW */
-	unsigned long	si_aoerr0           :  1;  /* RW */
-	unsigned long	lb_aoerr1           :  1;  /* RW */
-	unsigned long	gr0_aoerr1          :  1;  /* RW */
-	unsigned long	gr1_aoerr1          :  1;  /* RW */
-	unsigned long	lh_aoerr1           :  1;  /* RW */
-	unsigned long	rh_aoerr1           :  1;  /* RW */
-	unsigned long	xn_aoerr1           :  1;  /* RW */
-	unsigned long	si_aoerr1           :  1;  /* RW */
-	unsigned long	rh_vpi_int          :  1;  /* RW */
-	unsigned long	system_shutdown_int :  1;  /* RW */
-	unsigned long	lb_irq_int_0        :  1;  /* RW */
-	unsigned long	lb_irq_int_1        :  1;  /* RW */
-	unsigned long	lb_irq_int_2        :  1;  /* RW */
-	unsigned long	lb_irq_int_3        :  1;  /* RW */
-	unsigned long	lb_irq_int_4        :  1;  /* RW */
-	unsigned long	lb_irq_int_5        :  1;  /* RW */
-	unsigned long	lb_irq_int_6        :  1;  /* RW */
-	unsigned long	lb_irq_int_7        :  1;  /* RW */
-	unsigned long	lb_irq_int_8        :  1;  /* RW */
-	unsigned long	lb_irq_int_9        :  1;  /* RW */
-	unsigned long	lb_irq_int_10       :  1;  /* RW */
-	unsigned long	lb_irq_int_11       :  1;  /* RW */
-	unsigned long	lb_irq_int_12       :  1;  /* RW */
-	unsigned long	lb_irq_int_13       :  1;  /* RW */
-	unsigned long	lb_irq_int_14       :  1;  /* RW */
-	unsigned long	lb_irq_int_15       :  1;  /* RW */
-	unsigned long	l1_nmi_int          :  1;  /* RW */
-	unsigned long	stop_clock          :  1;  /* RW */
-	unsigned long	asic_to_l1          :  1;  /* RW */
-	unsigned long	l1_to_asic          :  1;  /* RW */
-	unsigned long	ltc_int             :  1;  /* RW */
-	unsigned long	la_seq_trigger      :  1;  /* RW */
-	unsigned long	rsvd_45_63          : 19;  /*    */
-    } s;
-};
-
 /* ========================================================================= */
 /*                               UVH_NODE_ID                                 */
 /* ========================================================================= */
@@ -1111,26 +856,6 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                    UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR                      */
-/* ========================================================================= */
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
-
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_cfg_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_cfg_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base   : 20;  /* RW */
-	unsigned long	rsvd_46_62: 17;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s;
-};
-
 /* ========================================================================= */
 /*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
@@ -1262,101 +987,6 @@ union uvh_rtc1_int_config_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                           UVH_RTC2_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC2_INT_CONFIG 0x61600UL
-
-#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC2_INT_CONFIG_P_SHFT 13
-#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC2_INT_CONFIG_T_SHFT 15
-#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC2_INT_CONFIG_M_SHFT 16
-#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc2_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc2_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC3_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC3_INT_CONFIG 0x61640UL
-
-#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC3_INT_CONFIG_P_SHFT 13
-#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC3_INT_CONFIG_T_SHFT 15
-#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC3_INT_CONFIG_M_SHFT 16
-#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc3_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc3_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                            UVH_RTC_INC_RATIO                              */
-/* ========================================================================= */
-#define UVH_RTC_INC_RATIO 0x350000UL
-
-#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
-#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
-#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
-#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
-
-union uvh_rtc_inc_ratio_u {
-    unsigned long	v;
-    struct uvh_rtc_inc_ratio_s {
-	unsigned long	fraction : 20;  /* RW */
-	unsigned long	ratio    :  3;  /* RW */
-	unsigned long	rsvd_23_63: 41;  /*    */
-    } s;
-};
-
 /* ========================================================================= */
 /*                          UVH_SI_ADDR_MAP_CONFIG                           */
 /* ========================================================================= */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015efeb..ef68ba48564 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -20,6 +20,8 @@
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
 
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
+
 static struct bau_control	**uv_bau_table_bases __read_mostly;
 static int			uv_bau_retry_limit __read_mostly;
 
@@ -478,16 +480,16 @@ static void uv_enable_timeouts(void)
 		 * To program the period, the SOFT_ACK_MODE must be off.
 		 */
 		mmr_image &= ~((unsigned long)1 <<
-			       UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
 		 * Set the 4-bit period.
 		 */
 		mmr_image &= ~((unsigned long)0xf <<
-			UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
-			     UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
@@ -496,7 +498,7 @@ static void uv_enable_timeouts(void)
 		 * indicated in bits 2:0 (7 causes all of them to timeout).
 		 */
 		mmr_image |= ((unsigned long)1 <<
-			      UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 	}
-- 
cgit v1.2.3-70-g09d2


From a072738e04f0eb26370e39ec679e9a0d65e49aea Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 11 Mar 2010 19:54:39 +0300
Subject: perf, x86: Implement initial P4 PMU driver

The netburst PMU is way different from the "architectural
perfomance monitoring" specification that current CPUs use.
P4 uses a tuple of ESCR+CCCR+COUNTER MSR registers to handle
perfomance monitoring events.

A few implementational details:

1) We need a separate x86_pmu::hw_config helper in struct
   x86_pmu since register bit-fields are quite different from P6,
   Core and later cpu series.

2) For the same reason is a x86_pmu::schedule_events helper
   introduced.

3) hw_perf_event::config consists of packed ESCR+CCCR values.
   It's allowed since in reality both registers only use a half
   of their size. Of course before making a real write into a
   particular MSR we need to unpack the value and extend it to
   a proper size.

4) The tuple of packed ESCR+CCCR in hw_perf_event::config
   doesn't describe the memory address of ESCR MSR register
   so that we need to keep a mapping between these tuples
   used and available ESCR (various P4 events may use same
   ESCRs but not simultaneously), for this sake every active
   event has a per-cpu map of hw_perf_event::idx <--> ESCR
   addresses.

5) Since hw_perf_event::idx is an offset to counter/control register
   we need to lift X86_PMC_MAX_GENERIC up, otherwise kernel
   strips it down to 8 registers and event armed may never be turned
   off (ie the bit in active_mask is set but the loop never reaches
   this index to check), thanks to Peter Zijlstra

Restrictions:

 - No cascaded counters support (do we ever need them?)
 - No dependent events support (so PERF_COUNT_HW_INSTRUCTIONS
   doesn't work for now)
 - There are events with same counters which can't work simultaneously
   (need to use intersected ones due to broken counter 1)
 - No PERF_COUNT_HW_CACHE_ events yet

Todo:

 - Implement dependent events
 - Need proper hashing for event opcodes (no linear search, good for
   debugging stage but not in real loads)
 - Some events counted during a clock cycle -- need to set threshold
   for them and count every clock cycle just to get summary statistics
   (ie to behave the same way as other PMUs do)
 - Need to swicth to use event_constraints
 - To support RAW events we need to encode a global list of P4 events
   into p4_templates
 - Cache events need to be added

Event support status matrix:

 Event			status
 -----------------------------
 cycles			works
 cache-references	works
 cache-misses		works
 branch-misses		works
 bus-cycles		partially (does not work on 64bit cpu with HT enabled)
 instruction		doesnt work (needs dependent event [mop tagging])
 branches		doesnt work

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100311165439.GB5129@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      |   2 +-
 arch/x86/include/asm/perf_event_p4.h   | 707 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event.c       |  46 ++-
 arch/x86/kernel/cpu/perf_event_amd.c   |   2 +
 arch/x86/kernel/cpu/perf_event_intel.c |  15 +-
 arch/x86/kernel/cpu/perf_event_p4.c    | 607 ++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_p6.c    |   2 +
 7 files changed, 1358 insertions(+), 23 deletions(-)
 create mode 100644 arch/x86/include/asm/perf_event_p4.h
 create mode 100644 arch/x86/kernel/cpu/perf_event_p4.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index a9038c95161..124dddd598f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,7 +5,7 @@
  * Performance event hw details:
  */
 
-#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_GENERIC				       32
 #define X86_PMC_MAX_FIXED					3
 
 #define X86_PMC_IDX_GENERIC				        0
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 00000000000..829f4711645
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,707 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ */
+
+#ifndef PERF_EVENT_P4_H
+#define PERF_EVENT_P4_H
+
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+
+/*
+ * NetBurst has perfomance MSRs shared between
+ * threads if HT is turned on, ie for both logical
+ * processors (mem: in turn in Atom with HT support
+ * perf-MSRs are not shared and every thread has its
+ * own perf-MSRs set)
+ */
+#define ARCH_P4_TOTAL_ESCR		(46)
+#define ARCH_P4_RESERVED_ESCR		(2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR		(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR		(18)
+#define ARCH_P4_MAX_COUNTER		(ARCH_P4_MAX_CCCR / 2)
+
+#define P4_EVNTSEL_EVENT_MASK		0x7e000000U
+#define P4_EVNTSEL_EVENT_SHIFT		25
+#define P4_EVNTSEL_EVENTMASK_MASK	0x01fffe00U
+#define P4_EVNTSEL_EVENTMASK_SHIFT	9
+#define P4_EVNTSEL_TAG_MASK		0x000001e0U
+#define P4_EVNTSEL_TAG_SHIFT		5
+#define P4_EVNTSEL_TAG_ENABLE		0x00000010U
+#define P4_EVNTSEL_T0_OS		0x00000008U
+#define P4_EVNTSEL_T0_USR		0x00000004U
+#define P4_EVNTSEL_T1_OS		0x00000002U
+#define P4_EVNTSEL_T1_USR		0x00000001U
+
+/* Non HT mask */
+#define P4_EVNTSEL_MASK				\
+	(P4_EVNTSEL_EVENT_MASK		|	\
+	P4_EVNTSEL_EVENTMASK_MASK	|	\
+	P4_EVNTSEL_TAG_MASK		|	\
+	P4_EVNTSEL_TAG_ENABLE		|	\
+	P4_EVNTSEL_T0_OS		|	\
+	P4_EVNTSEL_T0_USR)
+
+/* HT mask */
+#define P4_EVNTSEL_MASK_HT			\
+	(P4_EVNTSEL_MASK		|	\
+	P4_EVNTSEL_T1_OS		|	\
+	P4_EVNTSEL_T1_USR)
+
+#define P4_CCCR_OVF			0x80000000U
+#define P4_CCCR_CASCADE			0x40000000U
+#define P4_CCCR_OVF_PMI_T0		0x04000000U
+#define P4_CCCR_OVF_PMI_T1		0x08000000U
+#define P4_CCCR_FORCE_OVF		0x02000000U
+#define P4_CCCR_EDGE			0x01000000U
+#define P4_CCCR_THRESHOLD_MASK		0x00f00000U
+#define P4_CCCR_THRESHOLD_SHIFT		20
+#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_COMPLEMENT		0x00080000U
+#define P4_CCCR_COMPARE			0x00040000U
+#define P4_CCCR_ESCR_SELECT_MASK	0x0000e000U
+#define P4_CCCR_ESCR_SELECT_SHIFT	13
+#define P4_CCCR_ENABLE			0x00001000U
+#define P4_CCCR_THREAD_SINGLE		0x00010000U
+#define P4_CCCR_THREAD_BOTH		0x00020000U
+#define P4_CCCR_THREAD_ANY		0x00030000U
+
+/* Non HT mask */
+#define P4_CCCR_MASK				\
+	(P4_CCCR_OVF			|	\
+	P4_CCCR_CASCADE			|	\
+	P4_CCCR_OVF_PMI_T0		|	\
+	P4_CCCR_FORCE_OVF		|	\
+	P4_CCCR_EDGE			|	\
+	P4_CCCR_THRESHOLD_MASK		|	\
+	P4_CCCR_COMPLEMENT		|	\
+	P4_CCCR_COMPARE			|	\
+	P4_CCCR_ESCR_SELECT_MASK	|	\
+	P4_CCCR_ENABLE)
+
+/* HT mask */
+#define P4_CCCR_MASK_HT				\
+	(P4_CCCR_MASK			|	\
+	P4_CCCR_THREAD_ANY)
+
+/*
+ * format is 32 bit: ee ss aa aa
+ * where
+ *	ee - 8 bit event
+ *	ss - 8 bit selector
+ *	aa aa - 16 bits reserved for tags/attributes
+ */
+#define P4_EVENT_PACK(event, selector)		(((event) << 24) | ((selector) << 16))
+#define P4_EVENT_UNPACK_EVENT(packed)		(((packed) >> 24) & 0xff)
+#define P4_EVENT_UNPACK_SELECTOR(packed)	(((packed) >> 16) & 0xff)
+#define P4_EVENT_PACK_ATTR(attr)		((attr))
+#define P4_EVENT_UNPACK_ATTR(packed)		((packed) & 0xffff)
+#define P4_MAKE_EVENT_ATTR(class, name, bit)	class##_##name = (1 << bit)
+#define P4_EVENT_ATTR(class, name)		class##_##name
+#define P4_EVENT_ATTR_STR(class, name)		__stringify(class##_##name)
+
+/*
+ * config field is 64bit width and consists of
+ * HT << 63 | ESCR << 32 | CCCR
+ * where HT is HyperThreading bit (since ESCR
+ * has it reserved we may use it for own purpose)
+ *
+ * note that this is NOT the addresses of respective
+ * ESCR and CCCR but rather an only packed value should
+ * be unpacked and written to a proper addresses
+ *
+ * the base idea is to pack as much info as
+ * possible
+ */
+#define p4_config_pack_escr(v)		(((u64)(v)) << 32)
+#define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
+
+#define p4_config_unpack_emask(v)			\
+	({						\
+		u32 t = p4_config_unpack_escr((v));	\
+		t  &= P4_EVNTSEL_EVENTMASK_MASK;	\
+		t >>= P4_EVNTSEL_EVENTMASK_SHIFT;	\
+		t;					\
+	})
+
+#define P4_CONFIG_HT_SHIFT		63
+#define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
+
+static inline u32 p4_config_unpack_opcode(u64 config)
+{
+	u32 e, s;
+
+	/*
+	 * we don't care about HT presence here since
+	 * event opcode doesn't depend on it
+	 */
+	e = (p4_config_unpack_escr(config) & P4_EVNTSEL_EVENT_MASK) >> P4_EVNTSEL_EVENT_SHIFT;
+	s = (p4_config_unpack_cccr(config) & P4_CCCR_ESCR_SELECT_MASK) >> P4_CCCR_ESCR_SELECT_SHIFT;
+
+	return P4_EVENT_PACK(e, s);
+}
+
+static inline bool p4_is_event_cascaded(u64 config)
+{
+	u32 cccr = p4_config_unpack_cccr(config);
+	return !!(cccr & P4_CCCR_CASCADE);
+}
+
+static inline int p4_ht_config_thread(u64 config)
+{
+	return !!(config & P4_CONFIG_HT);
+}
+
+static inline u64 p4_set_ht_bit(u64 config)
+{
+	return config | P4_CONFIG_HT;
+}
+
+static inline u64 p4_clear_ht_bit(u64 config)
+{
+	return config & ~P4_CONFIG_HT;
+}
+
+static inline int p4_ht_active(void)
+{
+#ifdef CONFIG_SMP
+	return smp_num_siblings > 1;
+#endif
+	return 0;
+}
+
+static inline int p4_ht_thread(int cpu)
+{
+#ifdef CONFIG_SMP
+	if (smp_num_siblings == 2)
+		return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
+#endif
+	return 0;
+}
+
+static inline int p4_should_swap_ts(u64 config, int cpu)
+{
+	return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
+}
+
+static inline u32 p4_default_cccr_conf(int cpu)
+{
+	/*
+	 * Note that P4_CCCR_THREAD_ANY is "required" on
+	 * non-HT machines (on HT machines we count TS events
+	 * regardless the state of second logical processor
+	 */
+	u32 cccr = P4_CCCR_THREAD_ANY;
+
+	if (!p4_ht_thread(cpu))
+		cccr |= P4_CCCR_OVF_PMI_T0;
+	else
+		cccr |= P4_CCCR_OVF_PMI_T1;
+
+	return cccr;
+}
+
+static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
+{
+	u32 escr = 0;
+
+	if (!p4_ht_thread(cpu)) {
+		if (!exclude_os)
+			escr |= P4_EVNTSEL_T0_OS;
+		if (!exclude_usr)
+			escr |= P4_EVNTSEL_T0_USR;
+	} else {
+		if (!exclude_os)
+			escr |= P4_EVNTSEL_T1_OS;
+		if (!exclude_usr)
+			escr |= P4_EVNTSEL_T1_USR;
+	}
+
+	return escr;
+}
+
+/*
+ * Comments below the event represent ESCR restriction
+ * for this event and counter index per ESCR
+ *
+ * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
+ * processor builds (family 0FH, models 01H-02H). These MSRs
+ * are not available on later versions, so that we don't use
+ * them completely
+ *
+ * Also note that CCCR1 do not have P4_CCCR_ENABLE bit properly
+ * working so that we should not use this CCCR and respective
+ * counter as result
+ */
+#define P4_TC_DELIVER_MODE		P4_EVENT_PACK(0x01, 0x01)
+	/*
+	 * MSR_P4_TC_ESCR0:	4, 5
+	 * MSR_P4_TC_ESCR1:	6, 7
+	 */
+
+#define P4_BPU_FETCH_REQUEST		P4_EVENT_PACK(0x03, 0x00)
+	/*
+	 * MSR_P4_BPU_ESCR0:	0, 1
+	 * MSR_P4_BPU_ESCR1:	2, 3
+	 */
+
+#define P4_ITLB_REFERENCE		P4_EVENT_PACK(0x18, 0x03)
+	/*
+	 * MSR_P4_ITLB_ESCR0:	0, 1
+	 * MSR_P4_ITLB_ESCR1:	2, 3
+	 */
+
+#define P4_MEMORY_CANCEL		P4_EVENT_PACK(0x02, 0x05)
+	/*
+	 * MSR_P4_DAC_ESCR0:	8, 9
+	 * MSR_P4_DAC_ESCR1:	10, 11
+	 */
+
+#define P4_MEMORY_COMPLETE		P4_EVENT_PACK(0x08, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_LOAD_PORT_REPLAY		P4_EVENT_PACK(0x04, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_STORE_PORT_REPLAY		P4_EVENT_PACK(0x05, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_MOB_LOAD_REPLAY		P4_EVENT_PACK(0x03, 0x02)
+	/*
+	 * MSR_P4_MOB_ESCR0:	0, 1
+	 * MSR_P4_MOB_ESCR1:	2, 3
+	 */
+
+#define P4_PAGE_WALK_TYPE		P4_EVENT_PACK(0x01, 0x04)
+	/*
+	 * MSR_P4_PMH_ESCR0:	0, 1
+	 * MSR_P4_PMH_ESCR1:	2, 3
+	 */
+
+#define P4_BSQ_CACHE_REFERENCE		P4_EVENT_PACK(0x0c, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR0:	0, 1
+	 * MSR_P4_BSU_ESCR1:	2, 3
+	 */
+
+#define P4_IOQ_ALLOCATION		P4_EVENT_PACK(0x03, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_IOQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x1a, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_FSB_DATA_ACTIVITY		P4_EVENT_PACK(0x17, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_BSQ_ALLOCATION		P4_EVENT_PACK(0x05, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR0:	0, 1
+	 */
+
+#define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR1:	2, 3
+	 */
+
+#define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR:	8, 9
+	 * MSR_P4_FIRM_ESCR:	10, 11
+	 */
+
+#define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_PACKED_DP_UOP		P4_EVENT_PACK(0x0c, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_SCALAR_SP_UOP		P4_EVENT_PACK(0x0a, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_SCALAR_DP_UOP		P4_EVENT_PACK(0x0e, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_64BIT_MMX_UOP		P4_EVENT_PACK(0x02, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_128BIT_MMX_UOP		P4_EVENT_PACK(0x1a, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_X87_FP_UOP			P4_EVENT_PACK(0x04, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_TC_MISC			P4_EVENT_PACK(0x06, 0x01)
+	/*
+	 * MSR_P4_TC_ESCR0:	4, 5
+	 * MSR_P4_TC_ESCR1:	6, 7
+	 */
+
+#define P4_GLOBAL_POWER_EVENTS		P4_EVENT_PACK(0x13, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_TC_MS_XFER			P4_EVENT_PACK(0x05, 0x00)
+	/*
+	 * MSR_P4_MS_ESCR0:	4, 5
+	 * MSR_P4_MS_ESCR1:	6, 7
+	 */
+
+#define P4_UOP_QUEUE_WRITES		P4_EVENT_PACK(0x09, 0x00)
+	/*
+	 * MSR_P4_MS_ESCR0:	4, 5
+	 * MSR_P4_MS_ESCR1:	6, 7
+	 */
+
+#define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
+	/*
+	 * MSR_P4_TBPU_ESCR0:	4, 5
+	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 */
+
+#define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
+	/*
+	 * MSR_P4_TBPU_ESCR0:	4, 5
+	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 */
+
+#define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
+	/*
+	 * MSR_P4_ALF_ESCR0:	12, 13, 16
+	 * MSR_P4_ALF_ESCR1:	14, 15, 17
+	 */
+
+#define P4_WC_BUFFER			P4_EVENT_PACK(0x05, 0x05)
+	/*
+	 * MSR_P4_DAC_ESCR0:	8, 9
+	 * MSR_P4_DAC_ESCR1:	10, 11
+	 */
+
+#define P4_B2B_CYCLES			P4_EVENT_PACK(0x16, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_BNR				P4_EVENT_PACK(0x08, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_SNOOP			P4_EVENT_PACK(0x06, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_RESPONSE			P4_EVENT_PACK(0x04, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_FRONT_END_EVENT		P4_EVENT_PACK(0x08, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_EXECUTION_EVENT		P4_EVENT_PACK(0x0c, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_REPLAY_EVENT			P4_EVENT_PACK(0x09, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
+	/*
+	 * MSR_P4_RAT_ESCR0:	12, 13, 16
+	 * MSR_P4_RAT_ESCR1:	14, 15, 17
+	 */
+
+#define P4_BRANCH_RETIRED		P4_EVENT_PACK(0x06, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_MISPRED_BRANCH_RETIRED	P4_EVENT_PACK(0x03, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
+	 */
+
+#define P4_X87_ASSIST			P4_EVENT_PACK(0x03, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_MACHINE_CLEAR		P4_EVENT_PACK(0x02, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_INSTR_COMPLETED		P4_EVENT_PACK(0x07, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
+	 */
+
+/*
+ * a caller should use P4_EVENT_ATTR helper to
+ * pick the attribute needed, for example
+ *
+ *	P4_EVENT_ATTR(P4_TC_DELIVER_MODE, DD)
+ */
+enum P4_EVENTS_ATTR {
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DD, 0),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DB, 1),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DI, 2),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BD, 3),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BB, 4),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BI, 5),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, ID, 6),
+
+	P4_MAKE_EVENT_ATTR(P4_BPU_FETCH_REQUEST, TCMISS, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT, 0),
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, MISS, 1),
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT_UK, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, ST_RB_FULL, 2),
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, 64K_CONF, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, LSC, 0),
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, SSC, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STA, 1),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STD, 3),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, DTMISS, 0),
+	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, ITMISS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, DEFAULT, 0),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_READ, 5),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_WRITE, 6),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_UC, 7),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WC, 8),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WT, 9),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WP, 10),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WB, 11),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OWN, 13),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OTHER, 14),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, PREFETCH, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OWN, 13),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE0, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE1, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN0, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN1, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE0, 11),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE1, 12),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+	P4_MAKE_EVENT_ATTR(P4_SSE_INPUT_ASSIST, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_PACKED_SP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_PACKED_DP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_SCALAR_SP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_SCALAR_DP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_64BIT_MMX_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_128BIT_MMX_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_X87_FP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_TC_MISC, FLUSH, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_TC_MS_XFER, CISC, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL, 2),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN, 3),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_RESOURCE_STALL, SBFULL, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_EVICTS, 0),
+	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS0, 0),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS1, 1),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS2, 2),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS3, 3),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS0, 4),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS1, 5),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS2, 6),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS3, 7),
+
+	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG, 0),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG, 1),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG, 2),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS, 1),
+	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNP, 0),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNM, 1),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTP, 2),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTM, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSU, 0),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSO, 1),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAO, 2),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAU, 3),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, PREA, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, CLEAR, 0),
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, MOCLEAR, 1),
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, SMCLEAR, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
+};
+
+#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e24f6374f9f..e6a3f5f81c9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -190,6 +190,8 @@ struct x86_pmu {
 	void		(*enable_all)(void);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
+	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
+	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
@@ -415,6 +417,25 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
+static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+{
+	/*
+	 * Generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * Count user and OS events unless requested not to
+	 */
+	if (!attr->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!attr->exclude_kernel)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+	return 0;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -446,23 +467,13 @@ static int __hw_perf_event_init(struct perf_event *event)
 
 	event->destroy = hw_perf_event_destroy;
 
-	/*
-	 * Generate PMC IRQs:
-	 * (keep 'enabled' bit clear for now)
-	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
 	hwc->idx = -1;
 	hwc->last_cpu = -1;
 	hwc->last_tag = ~0ULL;
 
-	/*
-	 * Count user and OS events unless requested not to.
-	 */
-	if (!attr->exclude_user)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!attr->exclude_kernel)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+	/* Processor specifics */
+	if (x86_pmu.hw_config(attr, hwc))
+		return -EOPNOTSUPP;
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
@@ -517,7 +528,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 			return -EOPNOTSUPP;
 
 		/* BTS is currently only allowed for user-mode. */
-		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		if (!attr->exclude_kernel)
 			return -EOPNOTSUPP;
 	}
 
@@ -931,7 +942,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	if (n < 0)
 		return n;
 
-	ret = x86_schedule_events(cpuc, n, assign);
+	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
 		return ret;
 	/*
@@ -1263,7 +1274,7 @@ int hw_perf_group_sched_in(struct perf_event *leader,
 	if (n0 < 0)
 		return n0;
 
-	ret = x86_schedule_events(cpuc, n0, assign);
+	ret = x86_pmu.schedule_events(cpuc, n0, assign);
 	if (ret)
 		return ret;
 
@@ -1313,6 +1324,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_p4.c"
 #include "perf_event_intel_lbr.c"
 #include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
@@ -1515,7 +1527,7 @@ static int validate_group(struct perf_event *event)
 
 	fake_cpuc->n_events = n;
 
-	ret = x86_schedule_events(fake_cpuc, n, NULL);
+	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
 
 out_free:
 	kfree(fake_cpuc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 573458f1caf..358a8e3d05f 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -363,6 +363,8 @@ static __initconst struct x86_pmu amd_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 971dc6e7d54..044b8436b19 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -749,6 +749,8 @@ static __initconst struct x86_pmu core_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
@@ -786,6 +788,8 @@ static __initconst struct x86_pmu intel_pmu = {
 	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_event,
 	.disable		= intel_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
@@ -839,12 +843,13 @@ static __init int intel_pmu_init(void)
 	int version;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-		/* check for P6 processor family */
-	   if (boot_cpu_data.x86 == 6) {
-		return p6_pmu_init();
-	   } else {
+		switch (boot_cpu_data.x86) {
+		case 0x6:
+			return p6_pmu_init();
+		case 0xf:
+			return p4_pmu_init();
+		}
 		return -ENODEV;
-	   }
 	}
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 00000000000..381f593e829
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,607 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ *
+ *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+#include <asm/perf_event_p4.h>
+
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_template {
+	u32 opcode;			/* ESCR event + CCCR selector */
+	u64 config;			/* packed predefined bits */
+	int dep;			/* upstream dependency event index */
+	unsigned int emask;		/* ESCR EventMask */
+	unsigned int escr_msr[2];	/* ESCR MSR for this event */
+	unsigned int cntr[2];		/* counter index (offset) */
+};
+
+struct p4_pmu_res {
+	/* maps hw_conf::idx into template for ESCR sake */
+	struct p4_event_template *tpl[ARCH_P4_MAX_CCCR];
+};
+
+static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+
+/*
+ * WARN: CCCR1 doesn't have a working enable bit so try to not
+ * use it if possible
+ *
+ * Also as only we start to support raw events we will need to
+ * append _all_ P4_EVENT_PACK'ed events here
+ */
+struct p4_event_template p4_templates[] = {
+	[0] = {
+		.opcode	= P4_UOP_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { 16, 17 },
+	},
+	[1] = {
+		.opcode	= P4_GLOBAL_POWER_EVENTS,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[2] = {
+		.opcode	= P4_INSTR_RETIRED,
+		.config	= 0,
+		.dep	= 0, /* needs front-end tagging */
+		.emask	=
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { 12, 14 },
+	},
+	[3] = {
+		.opcode	= P4_BSQ_CACHE_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[4] = {
+		.opcode	= P4_BSQ_CACHE_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { 0, 3 },
+	},
+	[5] = {
+		.opcode	= P4_RETIRED_BRANCH_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN)		|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
+		.cntr		= { 4, 6 },
+	},
+	[6] = {
+		.opcode	= P4_MISPRED_BRANCH_RETIRED,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { 12, 14 },
+	},
+	[7] = {
+		.opcode	= P4_FSB_DATA_ACTIVITY,
+		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
+			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+};
+
+static struct p4_event_template *p4_event_map[PERF_COUNT_HW_MAX] = {
+	/* non-halted CPU clocks */
+	[PERF_COUNT_HW_CPU_CYCLES]		= &p4_templates[1],
+
+	/* retired instructions: dep on tagging the FSB */
+	[PERF_COUNT_HW_INSTRUCTIONS]		= &p4_templates[2],
+
+	/* cache hits */
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= &p4_templates[3],
+
+	/* cache misses */
+	[PERF_COUNT_HW_CACHE_MISSES]		= &p4_templates[4],
+
+	/* branch instructions retired */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= &p4_templates[5],
+
+	/* mispredicted branches retired */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= &p4_templates[6],
+
+	/* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+	[PERF_COUNT_HW_BUS_CYCLES]		= &p4_templates[7],
+};
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+	struct p4_event_template *tpl;
+	u64 config;
+
+	if (hw_event > ARRAY_SIZE(p4_event_map)) {
+		printk_once(KERN_ERR "PMU: Incorrect event index\n");
+		return 0;
+	}
+	tpl = p4_event_map[hw_event];
+
+	/*
+	 * fill config up according to
+	 * a predefined event template
+	 */
+	config  = tpl->config;
+	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
+	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
+	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
+
+	/* on HT machine we need a special bit */
+	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
+		config = p4_set_ht_bit(config);
+
+	return config;
+}
+
+/*
+ * Note that we still have 5 events (from global events SDM list)
+ * intersected in opcode+emask bits so we will need another
+ * scheme there do distinguish templates.
+ */
+static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
+{
+	return dst & src;
+}
+
+static struct p4_event_template *p4_pmu_template_lookup(u64 config)
+{
+	u32 opcode = p4_config_unpack_opcode(config);
+	unsigned int emask = p4_config_unpack_emask(config);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(p4_templates); i++) {
+		if (opcode == p4_templates[i].opcode &&
+			p4_pmu_emask_match(emask, p4_templates[i].emask))
+			return &p4_templates[i];
+	}
+
+	return NULL;
+}
+
+/*
+ * We don't control raw events so it's up to the caller
+ * to pass sane values (and we don't count the thread number
+ * on HT machine but allow HT-compatible specifics to be
+ * passed on)
+ */
+static u64 p4_pmu_raw_event(u64 hw_event)
+{
+	return hw_event &
+		(p4_config_pack_escr(P4_EVNTSEL_MASK_HT) |
+		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+}
+
+static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+{
+	int cpu = raw_smp_processor_id();
+
+	/*
+	 * the reason we use cpu that early is that: if we get scheduled
+	 * first time on the same cpu -- we will not need swap thread
+	 * specific flags in config (and will save some cpu cycles)
+	 */
+
+	/* CCCR by default */
+	hwc->config = p4_config_pack_cccr(p4_default_cccr_conf(cpu));
+
+	/* Count user and OS events unless not requested to */
+	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
+								attr->exclude_user));
+	return 0;
+}
+
+static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+	unsigned long dummy;
+
+	rdmsrl(hwc->config_base + hwc->idx, dummy);
+	if (dummy & P4_CCCR_OVF) {
+		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+			((u64)dummy) & ~P4_CCCR_OVF);
+	}
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	/*
+	 * If event gets disabled while counter is in overflowed
+	 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+	 * asserted again and again
+	 */
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+		(u64)(p4_config_unpack_cccr(hwc->config)) &
+			~P4_CCCR_ENABLE & ~P4_CCCR_OVF);
+}
+
+static void p4_pmu_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		p4_pmu_disable_event(event);
+	}
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int thread = p4_ht_config_thread(hwc->config);
+	u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+	u64 escr_base;
+	struct p4_event_template *tpl;
+	struct p4_pmu_res *c;
+
+	/*
+	 * some preparation work from per-cpu private fields
+	 * since we need to find out which ESCR to use
+	 */
+	c = &__get_cpu_var(p4_pmu_config);
+	tpl = c->tpl[hwc->idx];
+	if (!tpl) {
+		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
+		return;
+	}
+	escr_base = (u64)tpl->escr_msr[thread];
+
+	/*
+	 * - we dont support cascaded counters yet
+	 * - and counter 1 is broken (erratum)
+	 */
+	WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+	WARN_ON_ONCE(hwc->idx == 1);
+
+	(void)checking_wrmsrl(escr_base, escr_conf);
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+		(u64)(p4_config_unpack_cccr(hwc->config)) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		p4_pmu_enable_event(event);
+	}
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	int idx, handled = 0;
+	u64 val;
+
+	data.addr = 0;
+	data.raw = NULL;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		event = cpuc->events[idx];
+		hwc = &event->hw;
+
+		WARN_ON_ONCE(hwc->idx != idx);
+
+		/*
+		 * FIXME: Redundant call, actually not needed
+		 * but just to check if we're screwed
+		 */
+		p4_pmu_clear_cccr_ovf(hwc);
+
+		val = x86_perf_event_update(event);
+		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+			continue;
+
+		/*
+		 * event overflow
+		 */
+		handled		= 1;
+		data.period	= event->hw.last_period;
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+		if (perf_event_overflow(event, 1, &data, regs))
+			p4_pmu_disable_event(event);
+	}
+
+	if (handled) {
+		/* p4 quirk: unmask it again */
+		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+		inc_irq_stat(apic_perf_irqs);
+	}
+
+	return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+	u32 escr, cccr;
+
+	/*
+	 * we either lucky and continue on same cpu or no HT support
+	 */
+	if (!p4_should_swap_ts(hwc->config, cpu))
+		return;
+
+	/*
+	 * the event is migrated from an another logical
+	 * cpu, so we need to swap thread specific flags
+	 */
+
+	escr = p4_config_unpack_escr(hwc->config);
+	cccr = p4_config_unpack_cccr(hwc->config);
+
+	if (p4_ht_thread(cpu)) {
+		cccr &= ~P4_CCCR_OVF_PMI_T0;
+		cccr |= P4_CCCR_OVF_PMI_T1;
+		if (escr & P4_EVNTSEL_T0_OS) {
+			escr &= ~P4_EVNTSEL_T0_OS;
+			escr |= P4_EVNTSEL_T1_OS;
+		}
+		if (escr & P4_EVNTSEL_T0_USR) {
+			escr &= ~P4_EVNTSEL_T0_USR;
+			escr |= P4_EVNTSEL_T1_USR;
+		}
+		hwc->config  = p4_config_pack_escr(escr);
+		hwc->config |= p4_config_pack_cccr(cccr);
+		hwc->config |= P4_CONFIG_HT;
+	} else {
+		cccr &= ~P4_CCCR_OVF_PMI_T1;
+		cccr |= P4_CCCR_OVF_PMI_T0;
+		if (escr & P4_EVNTSEL_T1_OS) {
+			escr &= ~P4_EVNTSEL_T1_OS;
+			escr |= P4_EVNTSEL_T0_OS;
+		}
+		if (escr & P4_EVNTSEL_T1_USR) {
+			escr &= ~P4_EVNTSEL_T1_USR;
+			escr |= P4_EVNTSEL_T0_USR;
+		}
+		hwc->config  = p4_config_pack_escr(escr);
+		hwc->config |= p4_config_pack_cccr(cccr);
+		hwc->config &= ~P4_CONFIG_HT;
+	}
+}
+
+/* ESCRs are not sequential in memory so we need a map */
+static unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
+	MSR_P4_ALF_ESCR0,	/*  0 */
+	MSR_P4_ALF_ESCR1,	/*  1 */
+	MSR_P4_BPU_ESCR0,	/*  2 */
+	MSR_P4_BPU_ESCR1,	/*  3 */
+	MSR_P4_BSU_ESCR0,	/*  4 */
+	MSR_P4_BSU_ESCR1,	/*  5 */
+	MSR_P4_CRU_ESCR0,	/*  6 */
+	MSR_P4_CRU_ESCR1,	/*  7 */
+	MSR_P4_CRU_ESCR2,	/*  8 */
+	MSR_P4_CRU_ESCR3,	/*  9 */
+	MSR_P4_CRU_ESCR4,	/* 10 */
+	MSR_P4_CRU_ESCR5,	/* 11 */
+	MSR_P4_DAC_ESCR0,	/* 12 */
+	MSR_P4_DAC_ESCR1,	/* 13 */
+	MSR_P4_FIRM_ESCR0,	/* 14 */
+	MSR_P4_FIRM_ESCR1,	/* 15 */
+	MSR_P4_FLAME_ESCR0,	/* 16 */
+	MSR_P4_FLAME_ESCR1,	/* 17 */
+	MSR_P4_FSB_ESCR0,	/* 18 */
+	MSR_P4_FSB_ESCR1,	/* 19 */
+	MSR_P4_IQ_ESCR0,	/* 20 */
+	MSR_P4_IQ_ESCR1,	/* 21 */
+	MSR_P4_IS_ESCR0,	/* 22 */
+	MSR_P4_IS_ESCR1,	/* 23 */
+	MSR_P4_ITLB_ESCR0,	/* 24 */
+	MSR_P4_ITLB_ESCR1,	/* 25 */
+	MSR_P4_IX_ESCR0,	/* 26 */
+	MSR_P4_IX_ESCR1,	/* 27 */
+	MSR_P4_MOB_ESCR0,	/* 28 */
+	MSR_P4_MOB_ESCR1,	/* 29 */
+	MSR_P4_MS_ESCR0,	/* 30 */
+	MSR_P4_MS_ESCR1,	/* 31 */
+	MSR_P4_PMH_ESCR0,	/* 32 */
+	MSR_P4_PMH_ESCR1,	/* 33 */
+	MSR_P4_RAT_ESCR0,	/* 34 */
+	MSR_P4_RAT_ESCR1,	/* 35 */
+	MSR_P4_SAAT_ESCR0,	/* 36 */
+	MSR_P4_SAAT_ESCR1,	/* 37 */
+	MSR_P4_SSU_ESCR0,	/* 38 */
+	MSR_P4_SSU_ESCR1,	/* 39 */
+	MSR_P4_TBPU_ESCR0,	/* 40 */
+	MSR_P4_TBPU_ESCR1,	/* 41 */
+	MSR_P4_TC_ESCR0,	/* 42 */
+	MSR_P4_TC_ESCR1,	/* 43 */
+	MSR_P4_U2L_ESCR0,	/* 44 */
+	MSR_P4_U2L_ESCR1,	/* 45 */
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(p4_escr_map); i++) {
+		if (addr == p4_escr_map[i])
+			return i;
+	}
+
+	return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
+
+	struct hw_perf_event *hwc;
+	struct p4_event_template *tpl;
+	struct p4_pmu_res *c;
+	int cpu = raw_smp_processor_id();
+	int escr_idx, thread, i, num;
+
+	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
+
+	c = &__get_cpu_var(p4_pmu_config);
+	/*
+	 * Firstly find out which resource events are going
+	 * to use, if ESCR+CCCR tuple is already borrowed
+	 * then get out of here
+	 */
+	for (i = 0, num = n; i < n; i++, num--) {
+		hwc = &cpuc->event_list[i]->hw;
+		tpl = p4_pmu_template_lookup(hwc->config);
+		if (!tpl)
+			goto done;
+		thread = p4_ht_thread(cpu);
+		escr_idx = p4_get_escr_idx(tpl->escr_msr[thread]);
+		if (escr_idx == -1)
+			goto done;
+
+		/* already allocated and remains on the same cpu */
+		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+			if (assign)
+				assign[i] = hwc->idx;
+			/* upstream dependent event */
+			if (unlikely(tpl->dep != -1))
+				printk_once(KERN_WARNING "PMU: Dep events are "
+					"not implemented yet\n");
+			goto reserve;
+		}
+
+		/* it may be already borrowed */
+		if (test_bit(tpl->cntr[thread], used_mask) ||
+			test_bit(escr_idx, escr_mask))
+			goto done;
+
+		/*
+		 * ESCR+CCCR+COUNTERs are available to use lets swap
+		 * thread specific bits, push assigned bits
+		 * back and save template into per-cpu
+		 * area (which will allow us to find out the ESCR
+		 * to be used at moment of "enable event via real MSR")
+		 */
+		p4_pmu_swap_config_ts(hwc, cpu);
+		if (assign) {
+			assign[i] = tpl->cntr[thread];
+			c->tpl[assign[i]] = tpl;
+		}
+reserve:
+		set_bit(tpl->cntr[thread], used_mask);
+		set_bit(escr_idx, escr_mask);
+	}
+
+done:
+	return num ? -ENOSPC : 0;
+}
+
+static __initconst struct x86_pmu p4_pmu = {
+	.name			= "Netburst P4/Xeon",
+	.handle_irq		= p4_pmu_handle_irq,
+	.disable_all		= p4_pmu_disable_all,
+	.enable_all		= p4_pmu_enable_all,
+	.enable			= p4_pmu_enable_event,
+	.disable		= p4_pmu_disable_event,
+	.eventsel		= MSR_P4_BPU_CCCR0,
+	.perfctr		= MSR_P4_BPU_PERFCTR0,
+	.event_map		= p4_pmu_event_map,
+	.raw_event		= p4_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(p4_event_map),
+	.get_event_constraints	= x86_get_event_constraints,
+	/*
+	 * IF HT disabled we may need to use all
+	 * ARCH_P4_MAX_CCCR counters simulaneously
+	 * though leave it restricted at moment assuming
+	 * HT is on
+	 */
+	.num_events		= ARCH_P4_MAX_CCCR,
+	.apic			= 1,
+	.event_bits		= 40,
+	.event_mask		= (1ULL << 40) - 1,
+	.max_period		= (1ULL << 39) - 1,
+	.hw_config		= p4_hw_config,
+	.schedule_events	= p4_pmu_schedule_events,
+};
+
+static __init int p4_pmu_init(void)
+{
+	unsigned int low, high;
+
+	/* If we get stripped -- indexig fails */
+	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+
+	rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+	if (!(low & (1 << 7))) {
+		pr_cont("unsupported Netburst CPU model %d ",
+			boot_cpu_data.x86_model);
+		return -ENODEV;
+	}
+
+	pr_cont("Netburst events, ");
+
+	x86_pmu = p4_pmu;
+
+	return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14d..6ff4d01d880 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -109,6 +109,8 @@ static __initconst struct x86_pmu p6_pmu = {
 	.enable_all		= p6_pmu_enable_all,
 	.enable			= p6_pmu_enable_event,
 	.disable		= p6_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-- 
cgit v1.2.3-70-g09d2


From 0b861225a5890f22445f08ca9cc7a87cff276ff7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 12 Mar 2010 00:50:16 +0300
Subject: x86, perf: Fix NULL deref on not assigned x86_pmu

In case of not assigned x86_pmu and software events NULL dereference may
being hit via x86_pmu::schedule_events method.

Fix it by checking if x86_pmu is initialized at all.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20100311215016.GG25162@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e6a3f5f81c9..5586a02067d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1269,6 +1269,9 @@ int hw_perf_group_sched_in(struct perf_event *leader,
 	int assign[X86_PMC_IDX_MAX];
 	int n0, n1, ret;
 
+	if (!x86_pmu_initialized())
+		return 0;
+
 	/* n0 = total number of events */
 	n0 = collect_events(cpuc, leader, true);
 	if (n0 < 0)
-- 
cgit v1.2.3-70-g09d2


From 8576e1971663ffdb6139041de97cdd2e1d4791cc Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 13 Mar 2010 11:11:16 +0300
Subject: x86, perf: Unmask LVTPC only if we have APIC supported

Ingo reported:

 |
 | There's a build failure on -tip with the P4 driver, on UP 32-bit, if
 | PERF_EVENTS is enabled but UP_APIC is disabled:
 |
 | arch/x86/built-in.o: In function `p4_pmu_handle_irq':
 | perf_event.c:(.text+0xa756): undefined reference to `apic'
 | perf_event.c:(.text+0xa76e): undefined reference to `apic'
 |

So we have to unmask LVTPC only if we're configured to have one.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20100313081116.GA5179@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 381f593e829..ef861da1c8a 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -365,8 +365,10 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	}
 
 	if (handled) {
+#ifdef CONFIG_X86_LOCAL_APIC
 		/* p4 quirk: unmask it again */
 		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+#endif
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From e4495262826d1eabca3529fa6ac22394eb348132 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 15 Mar 2010 12:58:22 +0800
Subject: perf, x86: Enable not tagged retired instruction counting on P4s

This should turn on instruction counting on P4s, which was missing in
the first version of the new PMU driver.

It's inaccurate for now, we still need dependant event to tag mops
before we can count them precisely. The result is that the number of
instruction may be lifted up.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268629102.3355.11.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 8 ++++----
 arch/x86/kernel/cpu/perf_event_p4.c  | 8 +++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 829f4711645..b47b9e9ac13 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -324,8 +324,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
 	/*
-	 * MSR_P4_FIRM_ESCR:	8, 9
-	 * MSR_P4_FIRM_ESCR:	10, 11
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
 #define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
@@ -462,8 +462,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
 	/*
-	 * MSR_P4_CRU_ESCR2:	12, 13, 16
-	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
 #define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ef861da1c8a..a11ce73a93c 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -60,13 +60,11 @@ struct p4_event_template p4_templates[] = {
 	[2] = {
 		.opcode	= P4_INSTR_RETIRED,
 		.config	= 0,
-		.dep	= 0, /* needs front-end tagging */
+		.dep	= -1, /* needs front-end tagging */
 		.emask	=
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
 	[3] = {
-- 
cgit v1.2.3-70-g09d2


From 1d199b1ad606ae8b88acebd295b101c4e1cf2a57 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 16 Mar 2010 01:05:02 +0100
Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs

perf_arch_fetch_caller_regs() is exported for the overriden x86
version, but not for the generic weak version.

As a general rule, weak functions should not have their symbol
exported in the same file they are defined.

So let's export it on trace_event_perf.c as it is used by trace
events only.

This fixes:

	ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined!
	ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined!

-v2: And also only build it if trace events are enabled.
-v3: Fix changelog mistake

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 ++-
 kernel/perf_event.c              | 2 ++
 kernel/trace/trace_event_perf.c  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 978d297170a..0d3466cf7f5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1695,6 +1695,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return entry;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 	regs->ip = ip;
@@ -1706,4 +1707,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	regs->cs = __KERNEL_CS;
 	local_save_flags(regs->flags);
 }
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+#endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8bf61273c58..455393e71ca 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2790,10 +2790,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 __weak
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 }
+#endif
 
 /*
  * Output
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0709e4f7511..7d79a10c3cd 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -12,6 +12,8 @@
 DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
 EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
 
+EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
 
-- 
cgit v1.2.3-70-g09d2


From 984763cb90d4b5444baa0c3e43feff7926bf1834 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 16 Mar 2010 17:07:33 +0100
Subject: perf, x86: Report error code that returned from x86_pmu.hw_config()

If x86_pmu.hw_config() fails a fixed error code (-EOPNOTSUPP) is
returned even if a different error was reported. This patch fixes
this.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Lin Ming <ming.m.lin@intel.com>
Cc: acme@redhat.com
Cc: eranian@google.com
Cc: gorcunov@openvz.org
Cc: peterz@infradead.org
Cc: fweisbec@gmail.com
LKML-Reference: <20100316160733.GR1585@erda.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0d3466cf7f5..5dacf63f913 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -472,8 +472,9 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->last_tag = ~0ULL;
 
 	/* Processor specifics */
-	if (x86_pmu.hw_config(attr, hwc))
-		return -EOPNOTSUPP;
+	err = x86_pmu.hw_config(attr, hwc);
+	if (err)
+		return err;
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
-- 
cgit v1.2.3-70-g09d2


From b27ea29c6267889be255f2217fa7a6106e6a8b04 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 17 Mar 2010 12:49:10 +0100
Subject: perf/core, x86: Reduce number of CONFIG_X86_LOCAL_APIC macros

The function reserve_pmc_hardware() and release_pmc_hardware()
were hard to read. This patch improves readability of the code by
removing most of the CONFIG_X86_LOCAL_APIC macros.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5dacf63f913..793e63f6c42 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -313,9 +313,10 @@ again:
 static atomic_t active_events;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+
 static bool reserve_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -330,11 +331,9 @@ static bool reserve_pmc_hardware(void)
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
-#endif
 
 	return true;
 
-#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -349,12 +348,10 @@ perfctr_fail:
 		enable_lapic_nmi_watchdog();
 
 	return false;
-#endif
 }
 
 static void release_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	for (i = 0; i < x86_pmu.num_events; i++) {
@@ -364,9 +361,15 @@ static void release_pmc_hardware(void)
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
-#endif
 }
 
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
 static int reserve_ds_buffers(void);
 static void release_ds_buffers(void);
 
-- 
cgit v1.2.3-70-g09d2


From 10f1014d86fd4fe5087080d609b51183396c5e4c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 17 Mar 2010 12:49:12 +0100
Subject: perf/core, x86: Remove cpu_hw_events.interrupts

This member in the struct is not used anymore and can be
removed.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 793e63f6c42..104292a58c2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -102,7 +102,6 @@ struct cpu_hw_events {
 	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		interrupts;
 	int			enabled;
 
 	int			n_events;
-- 
cgit v1.2.3-70-g09d2


From d6dc0b4ead6e8720096ecfa3d9e899b47ddbc8ed Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 17 Mar 2010 12:49:13 +0100
Subject: perf/core, x86: Remove duplicate perf_event_mask variable

The same information is stored also in x86_pmu.intel_ctrl. This
patch removes perf_event_mask and instead uses
x86_pmu.intel_ctrl directly.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-5-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 104292a58c2..c97d5b52d12 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -75,8 +75,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 
-static u64 perf_event_mask __read_mostly;
-
 struct event_constraint {
 	union {
 		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -1406,7 +1404,7 @@ void __init init_hw_perf_events(void)
 		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
 		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
 	}
-	perf_event_mask = (1 << x86_pmu.num_events) - 1;
+	x86_pmu.intel_ctrl = (1 << x86_pmu.num_events) - 1;
 	perf_max_events = x86_pmu.num_events;
 
 	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
@@ -1415,9 +1413,8 @@ void __init init_hw_perf_events(void)
 		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
 	}
 
-	perf_event_mask |=
+	x86_pmu.intel_ctrl |=
 		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
-	x86_pmu.intel_ctrl = perf_event_mask;
 
 	perf_events_lapic_init();
 	register_die_notifier(&perf_event_nmi_notifier);
@@ -1442,7 +1439,7 @@ void __init init_hw_perf_events(void)
 	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
 	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
 	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
-	pr_info("... event mask:             %016Lx\n", perf_event_mask);
+	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
 	perf_cpu_notifier(x86_pmu_notifier);
 }
-- 
cgit v1.2.3-70-g09d2


From 2acebe9ecb2b77876e87a1480729cfb2db4570dd Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 17 Mar 2010 10:40:38 -0500
Subject: x86, UV: Delete unneeded boot messages

SGI:UV: Delete extra boot messages that describe the system
topology. These messages are no longer useful.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100317154038.GA29346@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 3740c8a4eae..9a93b07fa14 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -736,9 +736,6 @@ void __init uv_system_init(void)
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
 		max_pnode = max(pnode, max_pnode);
-
-		printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
-			cpu, apicid, pnode, nid, lcpu, blade);
 	}
 
 	/* Add blade/pnode info for nodes without cpus */
-- 
cgit v1.2.3-70-g09d2


From 7335f75e9ca166044e38a96abad422d8e6e364b5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 17 Mar 2010 13:37:01 +0300
Subject: x86, perf: Use apic_write unconditionally

Since apic_write() maps to a plain noop in the !CONFIG_X86_LOCAL_APIC
case we're safe to remove this conditional compilation and clean up
the code a bit.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: fweisbec@gmail.com
Cc: acme@redhat.com
Cc: eranian@google.com
Cc: peterz@infradead.org
LKML-Reference: <20100317104356.232371479@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c    | 4 ----
 arch/x86/kernel/cpu/perf_event_p4.c | 2 --
 2 files changed, 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c97d5b52d12..14eca80918d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1136,7 +1136,6 @@ void set_perf_event_pending(void)
 
 void perf_events_lapic_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	if (!x86_pmu.apic || !x86_pmu_initialized())
 		return;
 
@@ -1144,7 +1143,6 @@ void perf_events_lapic_init(void)
 	 * Always use NMI for PMU
 	 */
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
 }
 
 static int __kprobes
@@ -1168,9 +1166,7 @@ perf_event_nmi_handler(struct notifier_block *self,
 
 	regs = args->regs;
 
-#ifdef CONFIG_X86_LOCAL_APIC
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
 	/*
 	 * Can't rely on the handled return value to say it was our NMI, two
 	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a11ce73a93c..0367889b4ae 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -363,10 +363,8 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	}
 
 	if (handled) {
-#ifdef CONFIG_X86_LOCAL_APIC
 		/* p4 quirk: unmask it again */
 		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-#endif
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From f34edbc1cdb0f8f83d94e1d668dd6e41abf0defb Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Thu, 18 Mar 2010 18:33:07 +0800
Subject: perf, x86: Add a key to simplify template lookup in Pentium-4 PMU

Currently, we use opcode(Event and Event-Selector) + emask to
look up template in p4_templates.

But cache events (L1-dcache-load-misses, LLC-load-misses, etc)
use the same event(P4_REPLAY_EVENT) to do the counting, ie, they
have the same opcode and emask. So we can not use current lookup
mechanism to find the template for cache events.

This patch introduces a "key", which is the index into
p4_templates. The low 12 bits of CCCR are reserved, so we can
hide the "key" in the low 12 bits of hwc->config.

We extract the key from hwc->config and then quickly find the
template.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268908387.13901.127.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h |  5 ++-
 arch/x86/kernel/cpu/perf_event_p4.c  | 86 ++++++++++++++----------------------
 2 files changed, 38 insertions(+), 53 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b842b3238e4..7d3406a2773 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -65,6 +65,7 @@
 #define P4_CCCR_THREAD_SINGLE		0x00010000U
 #define P4_CCCR_THREAD_BOTH		0x00020000U
 #define P4_CCCR_THREAD_ANY		0x00030000U
+#define P4_CCCR_RESERVED		0x00000fffU
 
 /* Non HT mask */
 #define P4_CCCR_MASK				\
@@ -116,7 +117,7 @@
 #define p4_config_pack_escr(v)		(((u64)(v)) << 32)
 #define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
 #define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
-#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xfffff000ULL)
 
 #define p4_config_unpack_emask(v)			\
 	({						\
@@ -126,6 +127,8 @@
 		t;					\
 	})
 
+#define p4_config_unpack_key(v)		(((u64)(v)) & P4_CCCR_RESERVED)
+
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 0367889b4ae..3e97ed3904c 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,7 @@ struct p4_event_template {
 	u32 opcode;			/* ESCR event + CCCR selector */
 	u64 config;			/* packed predefined bits */
 	int dep;			/* upstream dependency event index */
+	int key;			/* index into p4_templates */
 	unsigned int emask;		/* ESCR EventMask */
 	unsigned int escr_msr[2];	/* ESCR MSR for this event */
 	unsigned int cntr[2];		/* counter index (offset) */
@@ -39,38 +40,31 @@ static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
  */
 struct p4_event_template p4_templates[] = {
 	[0] = {
-		.opcode	= P4_UOP_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.emask	=
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
-		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-		.cntr		= { 16, 17 },
-	},
-	[1] = {
 		.opcode	= P4_GLOBAL_POWER_EVENTS,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 0,
 		.emask	=
 			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[2] = {
+	[1] = {
 		.opcode	= P4_INSTR_RETIRED,
 		.config	= 0,
 		.dep	= -1, /* needs front-end tagging */
+		.key	= 1,
 		.emask	=
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
-	[3] = {
+	[2] = {
 		.opcode	= P4_BSQ_CACHE_REFERENCE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 2,
 		.emask	=
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
@@ -81,10 +75,11 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[4] = {
+	[3] = {
 		.opcode	= P4_BSQ_CACHE_REFERENCE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 3,
 		.emask	=
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
@@ -92,10 +87,11 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
 		.cntr		= { 0, 3 },
 	},
-	[5] = {
+	[4] = {
 		.opcode	= P4_RETIRED_BRANCH_TYPE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 4,
 		.emask	=
 			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
 			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
@@ -104,48 +100,38 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
 		.cntr		= { 4, 6 },
 	},
-	[6] = {
+	[5] = {
 		.opcode	= P4_MISPRED_BRANCH_RETIRED,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 5,
 		.emask	=
 			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
-	[7] = {
+	[6] = {
 		.opcode	= P4_FSB_DATA_ACTIVITY,
 		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
 		.dep	= -1,
+		.key	= 6,
 		.emask	=
 			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
 			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-};
-
-static struct p4_event_template *p4_event_map[PERF_COUNT_HW_MAX] = {
-	/* non-halted CPU clocks */
-	[PERF_COUNT_HW_CPU_CYCLES]		= &p4_templates[1],
-
-	/* retired instructions: dep on tagging the FSB */
-	[PERF_COUNT_HW_INSTRUCTIONS]		= &p4_templates[2],
-
-	/* cache hits */
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= &p4_templates[3],
-
-	/* cache misses */
-	[PERF_COUNT_HW_CACHE_MISSES]		= &p4_templates[4],
-
-	/* branch instructions retired */
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= &p4_templates[5],
-
-	/* mispredicted branches retired */
-	[PERF_COUNT_HW_BRANCH_MISSES]		= &p4_templates[6],
-
-	/* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
-	[PERF_COUNT_HW_BUS_CYCLES]		= &p4_templates[7],
+	[7] = {
+		.opcode	= P4_UOP_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.key	= 7,
+		.emask	=
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { 16, 17 },
+	},
 };
 
 static u64 p4_pmu_event_map(int hw_event)
@@ -153,11 +139,11 @@ static u64 p4_pmu_event_map(int hw_event)
 	struct p4_event_template *tpl;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_event_map)) {
+	if (hw_event > ARRAY_SIZE(p4_templates)) {
 		printk_once(KERN_ERR "PMU: Incorrect event index\n");
 		return 0;
 	}
-	tpl = p4_event_map[hw_event];
+	tpl = &p4_templates[hw_event];
 
 	/*
 	 * fill config up according to
@@ -167,6 +153,7 @@ static u64 p4_pmu_event_map(int hw_event)
 	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
 	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
 	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
+	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
 
 	/* on HT machine we need a special bit */
 	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
@@ -187,17 +174,12 @@ static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
 
 static struct p4_event_template *p4_pmu_template_lookup(u64 config)
 {
-	u32 opcode = p4_config_unpack_opcode(config);
-	unsigned int emask = p4_config_unpack_emask(config);
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(p4_templates); i++) {
-		if (opcode == p4_templates[i].opcode &&
-			p4_pmu_emask_match(emask, p4_templates[i].emask))
-			return &p4_templates[i];
-	}
+	int key = p4_config_unpack_key(config);
 
-	return NULL;
+	if (key < ARRAY_SIZE(p4_templates))
+		return &p4_templates[key];
+	else
+		return NULL;
 }
 
 /*
@@ -564,7 +546,7 @@ static __initconst struct x86_pmu p4_pmu = {
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
 	.raw_event		= p4_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(p4_event_map),
+	.max_events		= ARRAY_SIZE(p4_templates),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
 	 * IF HT disabled we may need to use all
-- 
cgit v1.2.3-70-g09d2


From cb7d6b5053e86598735d9af19930f5929f007b7f Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Thu, 18 Mar 2010 18:33:12 +0800
Subject: perf, x86: Add cache events for the Pentium-4 PMU

Move the HT bit setting code from p4_pmu_event_map to
p4_hw_config. So the cache events can get HT bit set correctly.

Tested on my P4 desktop, below 6 cache events work:

 L1-dcache-load-misses
 LLC-load-misses
 dTLB-load-misses
 dTLB-store-misses
 iTLB-loads
 iTLB-load-misses

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268908392.13901.128.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h     |   2 +
 arch/x86/include/asm/perf_event_p4.h |  10 +++
 arch/x86/kernel/cpu/perf_event_p4.c  | 153 +++++++++++++++++++++++++++++++++--
 3 files changed, 159 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1cd58cdbc03..aef562c0a64 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -357,6 +357,8 @@
 #define MSR_P4_U2L_ESCR0		0x000003b0
 #define MSR_P4_U2L_ESCR1		0x000003b1
 
+#define MSR_P4_PEBS_MATRIX_VERT		0x000003f2
+
 /* Intel Core-based CPU performance counters */
 #define MSR_CORE_PERF_FIXED_CTR0	0x00000309
 #define MSR_CORE_PERF_FIXED_CTR1	0x0000030a
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 7d3406a2773..871249cf4d2 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -708,4 +708,14 @@ enum P4_EVENTS_ATTR {
 	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
 };
 
+enum {
+	KEY_P4_L1D_OP_READ_RESULT_MISS,
+	KEY_P4_LL_OP_READ_RESULT_MISS,
+	KEY_P4_DTLB_OP_READ_RESULT_MISS,
+	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+	KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+	KEY_P4_ITLB_OP_READ_RESULT_MISS,
+	KEY_P4_UOP_TYPE,
+};
+
 #endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3e97ed3904c..b7bf9911198 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -19,6 +19,11 @@ struct p4_event_template {
 	u64 config;			/* packed predefined bits */
 	int dep;			/* upstream dependency event index */
 	int key;			/* index into p4_templates */
+	u64 msr;			/*
+					 * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
+					 * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
+					 * for cache events
+					 */
 	unsigned int emask;		/* ESCR EventMask */
 	unsigned int escr_msr[2];	/* ESCR MSR for this event */
 	unsigned int cntr[2];		/* counter index (offset) */
@@ -31,6 +36,67 @@ struct p4_pmu_res {
 
 static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
 
+#define P4_CACHE_EVENT_CONFIG(event, bit) \
+	p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
+	p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
+	p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+static __initconst u64 p4_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* 1stL_cache_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_L1D_OP_READ_RESULT_MISS,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* 2ndL_cache_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_LL_OP_READ_RESULT_MISS,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* DTLB_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_DTLB_OP_READ_RESULT_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* DTLB_store_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+					/* ITLB_reference.HIT */
+		[ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
+					| KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+
+					/* ITLB_reference.MISS */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
+					| KEY_P4_ITLB_OP_READ_RESULT_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
 /*
  * WARN: CCCR1 doesn't have a working enable bit so try to not
  * use it if possible
@@ -121,11 +187,77 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[7] = {
+	[KEY_P4_L1D_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_L1D_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_LL_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_LL_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_DTLB_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
+		.key	= KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
+		.opcode	= P4_ITLB_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= 0,
+		.key	= KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+		.emask	=
+			P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_ITLB_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= 0,
+		.key	= KEY_P4_ITLB_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[KEY_P4_UOP_TYPE] = {
 		.opcode	= P4_UOP_TYPE,
 		.config	= 0,
 		.dep	= -1,
-		.key	= 7,
+		.key	= KEY_P4_UOP_TYPE,
 		.emask	=
 			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
 			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
@@ -155,10 +287,6 @@ static u64 p4_pmu_event_map(int hw_event)
 	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
 	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
 
-	/* on HT machine we need a special bit */
-	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
-		config = p4_set_ht_bit(config);
-
 	return config;
 }
 
@@ -211,6 +339,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	/* Count user and OS events unless not requested to */
 	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
 								attr->exclude_user));
+	/* on HT machine we need a special bit */
+	if (p4_ht_active() && p4_ht_thread(cpu))
+		hwc->config = p4_set_ht_bit(hwc->config);
+
 	return 0;
 }
 
@@ -271,6 +403,12 @@ static void p4_pmu_enable_event(struct perf_event *event)
 		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
 		return;
 	}
+
+	if (tpl->msr) {
+		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
+		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
+	}
+
 	escr_base = (u64)tpl->escr_msr[thread];
 
 	/*
@@ -577,6 +715,9 @@ static __init int p4_pmu_init(void)
 		return -ENODEV;
 	}
 
+	memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
+
 	pr_cont("Netburst events, ");
 
 	x86_pmu = p4_pmu;
-- 
cgit v1.2.3-70-g09d2


From 4b24a88b35e15e04bd8f2c5dda65b5dc8ebca05f Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 17 Mar 2010 23:21:01 +0200
Subject: perf_events: Fix resource leak in x86 __hw_perf_event_init()

If reserve_pmc_hardware() succeeds but reserve_ds_buffers()
fails, then we need to release_pmc_hardware. It won't be done
by the destroy() callback because we return before setting it
in case of error.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Cc: peterz@infradead.org
Cc: paulus@samba.org
Cc: davem@davemloft.net
Cc: fweisbec@gmail.com
Cc: robert.richter@amd.com
Cc: perfmon2-devel@lists.sf.net
LKML-Reference: <4ba1568b.15185e0a.182a.7802@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--
 arch/x86/kernel/cpu/perf_event.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
---
 arch/x86/kernel/cpu/perf_event.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 14eca80918d..f571f514de2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -455,8 +455,11 @@ static int __hw_perf_event_init(struct perf_event *event)
 		if (atomic_read(&active_events) == 0) {
 			if (!reserve_pmc_hardware())
 				err = -EBUSY;
-			else
+			else {
 				err = reserve_ds_buffers();
+				if (err)
+					release_pmc_hardware();
+			}
 		}
 		if (!err)
 			atomic_inc(&active_events);
-- 
cgit v1.2.3-70-g09d2


From 9c8c6bad3137112d2c7bf3d215b736ee4215fa74 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 19 Mar 2010 00:12:56 +0300
Subject: x86, perf: Fix few cosmetic dabs for P4 pmu (comments and
 constantify)

- A few ESCR have escaped fixing at previous attempt.
- p4_escr_map is read only, make it const.

Nothing serious.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100318211256.GH5062@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 4 ++--
 arch/x86/kernel/cpu/perf_event_p4.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 871249cf4d2..2a1a57f7153 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -401,13 +401,13 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 #define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
-	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
 #define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
-	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
 #define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b7bf9911198..b8a811ab760 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -545,7 +545,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 }
 
 /* ESCRs are not sequential in memory so we need a map */
-static unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
+static const unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
 	MSR_P4_ALF_ESCR0,	/*  0 */
 	MSR_P4_ALF_ESCR1,	/*  1 */
 	MSR_P4_BPU_ESCR0,	/*  2 */
-- 
cgit v1.2.3-70-g09d2


From 4bd96a7a8185755b091233b16034c7436cbf57af Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Wed, 10 Mar 2010 14:36:10 +0800
Subject: x86, tboot: Add support for S3 memory integrity protection

This patch adds support for S3 memory integrity protection within an Intel(R)
TXT launched kernel, for all kernel and userspace memory.  All RAM used by the
kernel and userspace, as indicated by memory ranges of type E820_RAM and
E820_RESERVED_KERN in the e820 table, will be integrity protected.

The MAINTAINERS file is also updated to reflect the maintainers of the
TXT-related code.

All MACing is done in tboot, based on a complexity analysis and tradeoff.

v3: Compared with v2, this patch adds a check of array size in
tboot.c, and a note to specify which c/s of tboot supports this kind
of MACing in intel_txt.txt.

Signed-off-by: Shane Wang <shane.wang@intel.com>
LKML-Reference: <4B973DDA.6050902@intel.com>
Signed-off-by: Joseph Cihula <joseph.cihula@intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/intel_txt.txt | 16 +++++++++-------
 MAINTAINERS                 | 11 +++++++++++
 arch/x86/include/asm/e820.h |  7 ++++++-
 arch/x86/kernel/tboot.c     | 20 +++++++++++---------
 4 files changed, 37 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt
index f40a1f03001..87c8990dbbd 100644
--- a/Documentation/intel_txt.txt
+++ b/Documentation/intel_txt.txt
@@ -161,13 +161,15 @@ o  In order to put a system into any of the sleep states after a TXT
       has been restored, it will restore the TPM PCRs and then
       transfer control back to the kernel's S3 resume vector.
       In order to preserve system integrity across S3, the kernel
-      provides tboot with a set of memory ranges (kernel
-      code/data/bss, S3 resume code, and AP trampoline) that tboot
-      will calculate a MAC (message authentication code) over and then
-      seal with the TPM.  On resume and once the measured environment
-      has been re-established, tboot will re-calculate the MAC and
-      verify it against the sealed value.  Tboot's policy determines
-      what happens if the verification fails.
+      provides tboot with a set of memory ranges (RAM and RESERVED_KERN
+      in the e820 table, but not any memory that BIOS might alter over
+      the S3 transition) that tboot will calculate a MAC (message
+      authentication code) over and then seal with the TPM. On resume
+      and once the measured environment has been re-established, tboot
+      will re-calculate the MAC and verify it against the sealed value.
+      Tboot's policy determines what happens if the verification fails.
+      Note that the c/s 194 of tboot which has the new MAC code supports
+      this.
 
 That's pretty much it for TXT support.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 47cc449d89d..d3072cb8805 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2940,6 +2940,17 @@ S:	Odd Fixes
 F:	Documentation/networking/README.ipw2200
 F:	drivers/net/wireless/ipw2x00/ipw2200.*
 
+INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
+M:	Joseph Cihula <joseph.cihula@intel.com>
+M:	Shane Wang <shane.wang@intel.com>
+L:	tboot-devel@lists.sourceforge.net
+W:	http://tboot.sourceforge.net
+T:	Mercurial http://www.bughost.org/repos.hg/tboot.hg
+S:	Supported
+F:	Documentation/intel_txt.txt
+F:	include/linux/tboot.h
+F:	arch/x86/kernel/tboot.c
+
 INTEL WIRELESS WIMAX CONNECTION 2400
 M:	Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
 M:	linux-wimax@intel.com
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 0e22296790d..ec8a52d14ab 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -45,7 +45,12 @@
 #define E820_NVS	4
 #define E820_UNUSABLE	5
 
-/* reserved RAM used by kernel itself */
+/*
+ * reserved RAM used by kernel itself
+ * if CONFIG_INTEL_TXT is enabled, memory of this type will be
+ * included in the S3 integrity calculation and so should not include
+ * any memory that BIOS might alter over the S3 transition
+ */
 #define E820_RESERVED_KERN        128
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48a..cc2c60474fd 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -175,6 +175,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 	struct tboot_mac_region *mr;
 	phys_addr_t end = start + size;
 
+	if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+		panic("tboot: Too many MAC regions\n");
+
 	if (start && size) {
 		mr = &tboot->mac_regions[tboot->num_mac_regions++];
 		mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +187,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 
 static int tboot_setup_sleep(void)
 {
+	int i;
+
 	tboot->num_mac_regions = 0;
 
-	/* S3 resume code */
-	add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+	for (i = 0; i < e820.nr_map; i++) {
+		if ((e820.map[i].type != E820_RAM)
+		 && (e820.map[i].type != E820_RESERVED_KERN))
+			continue;
 
-#ifdef CONFIG_X86_TRAMPOLINE
-	/* AP trampoline code */
-	add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
-#endif
-
-	/* kernel code + data + bss */
-	add_mac_region(virt_to_phys(_text), _end - _text);
+		add_mac_region(e820.map[i].addr, e820.map[i].size);
+	}
 
 	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
 
-- 
cgit v1.2.3-70-g09d2


From d814f30105798b6677ecb73ed61d691ff96dada9 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 24 Mar 2010 12:09:26 +0800
Subject: x86, perf: Add raw events support for the P4 PMU

The adding of raw event support lead to complete code
refactoring. I hope is became more readable then it was.

The list of changes:

1)  The 64bit config field is enough to hold all information we need
    to track event details. To achieve it we used *own* enum for
    events selection in ESCR register and map this key into proper
    value at moment of event enabling.

    For the same reason we use 12LSB bits in CCCR register -- to track
    which exactly cache trace event was requested. And we cear this bits
    at real 'write' moment.

2)  There is no per-cpu area reserved for P4 PMU anymore. We
    don't need it. All is held by config.

3)  Now we may use any available counter, ie we try to grab any
    possible counter.

v2:
  - Lin Ming reported the lack of ESCR selector in CCCR for cache events

v3:
  - Don't loose cache event codes at config unpacking procedure, we may
    need it one day so no obscure hack behind our back, better to clear
    reserved bits explicitly when needed (thanks Ming for pointing out)

  - Lin Ming fixed misplaced opcodes in cache events

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1269403766.3409.6.camel@minggr.sh.intel.com>
[ v4: did a few whitespace fixlets ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 691 +++++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_p4.c  | 746 ++++++++++++++++++++---------------
 2 files changed, 806 insertions(+), 631 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index facf96186b2..b05400a542f 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -15,38 +15,40 @@
  * perf-MSRs are not shared and every thread has its
  * own perf-MSRs set)
  */
-#define ARCH_P4_TOTAL_ESCR		(46)
-#define ARCH_P4_RESERVED_ESCR		(2) /* IQ_ESCR(0,1) not always present */
-#define ARCH_P4_MAX_ESCR		(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
-#define ARCH_P4_MAX_CCCR		(18)
-#define ARCH_P4_MAX_COUNTER		(ARCH_P4_MAX_CCCR / 2)
-
-#define P4_EVNTSEL_EVENT_MASK		0x7e000000U
-#define P4_EVNTSEL_EVENT_SHIFT		25
-#define P4_EVNTSEL_EVENTMASK_MASK	0x01fffe00U
-#define P4_EVNTSEL_EVENTMASK_SHIFT	9
-#define P4_EVNTSEL_TAG_MASK		0x000001e0U
-#define P4_EVNTSEL_TAG_SHIFT		5
-#define P4_EVNTSEL_TAG_ENABLE		0x00000010U
-#define P4_EVNTSEL_T0_OS		0x00000008U
-#define P4_EVNTSEL_T0_USR		0x00000004U
-#define P4_EVNTSEL_T1_OS		0x00000002U
-#define P4_EVNTSEL_T1_USR		0x00000001U
+#define ARCH_P4_TOTAL_ESCR	(46)
+#define ARCH_P4_RESERVED_ESCR	(2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR	(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR	(18)
+#define ARCH_P4_MAX_COUNTER	(ARCH_P4_MAX_CCCR / 2)
+
+#define P4_ESCR_EVENT_MASK	0x7e000000U
+#define P4_ESCR_EVENT_SHIFT	25
+#define P4_ESCR_EVENTMASK_MASK	0x01fffe00U
+#define P4_ESCR_EVENTMASK_SHIFT	9
+#define P4_ESCR_TAG_MASK	0x000001e0U
+#define P4_ESCR_TAG_SHIFT	5
+#define P4_ESCR_TAG_ENABLE	0x00000010U
+#define P4_ESCR_T0_OS		0x00000008U
+#define P4_ESCR_T0_USR		0x00000004U
+#define P4_ESCR_T1_OS		0x00000002U
+#define P4_ESCR_T1_USR		0x00000001U
+
+#define P4_ESCR_EVENT(v)	((v) << P4_ESCR_EVENT_SHIFT)
+#define P4_ESCR_EMASK(v)	((v) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_TAG(v)		((v) << P4_ESCR_TAG_SHIFT)
 
 /* Non HT mask */
-#define P4_EVNTSEL_MASK				\
-	(P4_EVNTSEL_EVENT_MASK		|	\
-	P4_EVNTSEL_EVENTMASK_MASK	|	\
-	P4_EVNTSEL_TAG_MASK		|	\
-	P4_EVNTSEL_TAG_ENABLE		|	\
-	P4_EVNTSEL_T0_OS		|	\
-	P4_EVNTSEL_T0_USR)
+#define P4_ESCR_MASK			\
+	(P4_ESCR_EVENT_MASK	|	\
+	P4_ESCR_EVENTMASK_MASK	|	\
+	P4_ESCR_TAG_MASK	|	\
+	P4_ESCR_TAG_ENABLE	|	\
+	P4_ESCR_T0_OS		|	\
+	P4_ESCR_T0_USR)
 
 /* HT mask */
-#define P4_EVNTSEL_MASK_HT			\
-	(P4_EVNTSEL_MASK		|	\
-	P4_EVNTSEL_T1_OS		|	\
-	P4_EVNTSEL_T1_USR)
+#define P4_ESCR_MASK_HT			\
+	(P4_ESCR_MASK |	P4_ESCR_T1_OS | P4_ESCR_T1_USR)
 
 #define P4_CCCR_OVF			0x80000000U
 #define P4_CCCR_CASCADE			0x40000000U
@@ -56,7 +58,6 @@
 #define P4_CCCR_EDGE			0x01000000U
 #define P4_CCCR_THRESHOLD_MASK		0x00f00000U
 #define P4_CCCR_THRESHOLD_SHIFT		20
-#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
 #define P4_CCCR_COMPLEMENT		0x00080000U
 #define P4_CCCR_COMPARE			0x00040000U
 #define P4_CCCR_ESCR_SELECT_MASK	0x0000e000U
@@ -67,6 +68,13 @@
 #define P4_CCCR_THREAD_ANY		0x00030000U
 #define P4_CCCR_RESERVED		0x00000fffU
 
+#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_ESEL(v)			((v) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+/* Custom bits in reerved CCCR area */
+#define P4_CCCR_CACHE_OPS_MASK		0x0000003fU
+
+
 /* Non HT mask */
 #define P4_CCCR_MASK				\
 	(P4_CCCR_OVF			|	\
@@ -81,25 +89,11 @@
 	P4_CCCR_ENABLE)
 
 /* HT mask */
-#define P4_CCCR_MASK_HT				\
-	(P4_CCCR_MASK			|	\
-	P4_CCCR_THREAD_ANY)
+#define P4_CCCR_MASK_HT	(P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
 
-/*
- * format is 32 bit: ee ss aa aa
- * where
- *	ee - 8 bit event
- *	ss - 8 bit selector
- *	aa aa - 16 bits reserved for tags/attributes
- */
-#define P4_EVENT_PACK(event, selector)		(((event) << 24) | ((selector) << 16))
-#define P4_EVENT_UNPACK_EVENT(packed)		(((packed) >> 24) & 0xff)
-#define P4_EVENT_UNPACK_SELECTOR(packed)	(((packed) >> 16) & 0xff)
-#define P4_EVENT_PACK_ATTR(attr)		((attr))
-#define P4_EVENT_UNPACK_ATTR(packed)		((packed) & 0xffff)
-#define P4_MAKE_EVENT_ATTR(class, name, bit)	class##_##name = (1 << bit)
-#define P4_EVENT_ATTR(class, name)		class##_##name
-#define P4_EVENT_ATTR_STR(class, name)		__stringify(class##_##name)
+#define P4_GEN_ESCR_EMASK(class, name, bit)	\
+	class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_EMASK_BIT(class, name)		class##__##name
 
 /*
  * config field is 64bit width and consists of
@@ -117,35 +111,29 @@
 #define p4_config_pack_escr(v)		(((u64)(v)) << 32)
 #define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
 #define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
-#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xfffff000ULL)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
 
 #define p4_config_unpack_emask(v)			\
 	({						\
 		u32 t = p4_config_unpack_escr((v));	\
-		t  &= P4_EVNTSEL_EVENTMASK_MASK;	\
-		t >>= P4_EVNTSEL_EVENTMASK_SHIFT;	\
+		t = t &  P4_ESCR_EVENTMASK_MASK;	\
+		t = t >> P4_ESCR_EVENTMASK_SHIFT;	\
+		t;					\
+	})
+
+#define p4_config_unpack_event(v)			\
+	({						\
+		u32 t = p4_config_unpack_escr((v));	\
+		t = t &  P4_ESCR_EVENT_MASK;		\
+		t = t >> P4_ESCR_EVENT_SHIFT;		\
 		t;					\
 	})
 
-#define p4_config_unpack_key(v)		(((u64)(v)) & P4_CCCR_RESERVED)
+#define p4_config_unpack_cache_event(v)	(((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
 
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
-static inline u32 p4_config_unpack_opcode(u64 config)
-{
-	u32 e, s;
-
-	/*
-	 * we don't care about HT presence here since
-	 * event opcode doesn't depend on it
-	 */
-	e = (p4_config_unpack_escr(config) & P4_EVNTSEL_EVENT_MASK) >> P4_EVNTSEL_EVENT_SHIFT;
-	s = (p4_config_unpack_cccr(config) & P4_CCCR_ESCR_SELECT_MASK) >> P4_CCCR_ESCR_SELECT_SHIFT;
-
-	return P4_EVENT_PACK(e, s);
-}
-
 static inline bool p4_is_event_cascaded(u64 config)
 {
 	u32 cccr = p4_config_unpack_cccr(config);
@@ -212,19 +200,73 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 	if (!p4_ht_thread(cpu)) {
 		if (!exclude_os)
-			escr |= P4_EVNTSEL_T0_OS;
+			escr |= P4_ESCR_T0_OS;
 		if (!exclude_usr)
-			escr |= P4_EVNTSEL_T0_USR;
+			escr |= P4_ESCR_T0_USR;
 	} else {
 		if (!exclude_os)
-			escr |= P4_EVNTSEL_T1_OS;
+			escr |= P4_ESCR_T1_OS;
 		if (!exclude_usr)
-			escr |= P4_EVNTSEL_T1_USR;
+			escr |= P4_ESCR_T1_USR;
 	}
 
 	return escr;
 }
 
+enum P4_EVENTS {
+	P4_EVENT_TC_DELIVER_MODE,
+	P4_EVENT_BPU_FETCH_REQUEST,
+	P4_EVENT_ITLB_REFERENCE,
+	P4_EVENT_MEMORY_CANCEL,
+	P4_EVENT_MEMORY_COMPLETE,
+	P4_EVENT_LOAD_PORT_REPLAY,
+	P4_EVENT_STORE_PORT_REPLAY,
+	P4_EVENT_MOB_LOAD_REPLAY,
+	P4_EVENT_PAGE_WALK_TYPE,
+	P4_EVENT_BSQ_CACHE_REFERENCE,
+	P4_EVENT_IOQ_ALLOCATION,
+	P4_EVENT_IOQ_ACTIVE_ENTRIES,
+	P4_EVENT_FSB_DATA_ACTIVITY,
+	P4_EVENT_BSQ_ALLOCATION,
+	P4_EVENT_BSQ_ACTIVE_ENTRIES,
+	P4_EVENT_SSE_INPUT_ASSIST,
+	P4_EVENT_PACKED_SP_UOP,
+	P4_EVENT_PACKED_DP_UOP,
+	P4_EVENT_SCALAR_SP_UOP,
+	P4_EVENT_SCALAR_DP_UOP,
+	P4_EVENT_64BIT_MMX_UOP,
+	P4_EVENT_128BIT_MMX_UOP,
+	P4_EVENT_X87_FP_UOP,
+	P4_EVENT_TC_MISC,
+	P4_EVENT_GLOBAL_POWER_EVENTS,
+	P4_EVENT_TC_MS_XFER,
+	P4_EVENT_UOP_QUEUE_WRITES,
+	P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
+	P4_EVENT_RETIRED_BRANCH_TYPE,
+	P4_EVENT_RESOURCE_STALL,
+	P4_EVENT_WC_BUFFER,
+	P4_EVENT_B2B_CYCLES,
+	P4_EVENT_BNR,
+	P4_EVENT_SNOOP,
+	P4_EVENT_RESPONSE,
+	P4_EVENT_FRONT_END_EVENT,
+	P4_EVENT_EXECUTION_EVENT,
+	P4_EVENT_REPLAY_EVENT,
+	P4_EVENT_INSTR_RETIRED,
+	P4_EVENT_UOPS_RETIRED,
+	P4_EVENT_UOP_TYPE,
+	P4_EVENT_BRANCH_RETIRED,
+	P4_EVENT_MISPRED_BRANCH_RETIRED,
+	P4_EVENT_X87_ASSIST,
+	P4_EVENT_MACHINE_CLEAR,
+	P4_EVENT_INSTR_COMPLETED,
+};
+
+#define P4_OPCODE(event)		event##_OPCODE
+#define P4_OPCODE_ESEL(opcode)		((opcode & 0x00ff) >> 0)
+#define P4_OPCODE_EVNT(opcode)		((opcode & 0xff00) >> 8)
+#define P4_OPCODE_PACK(event, sel)	(((event) << 8) | sel)
+
 /*
  * Comments below the event represent ESCR restriction
  * for this event and counter index per ESCR
@@ -238,484 +280,515 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
  * working so that we should not use this CCCR and respective
  * counter as result
  */
-#define P4_TC_DELIVER_MODE		P4_EVENT_PACK(0x01, 0x01)
+enum P4_EVENT_OPCODES {
+	P4_OPCODE(P4_EVENT_TC_DELIVER_MODE)		= P4_OPCODE_PACK(0x01, 0x01),
 	/*
 	 * MSR_P4_TC_ESCR0:	4, 5
 	 * MSR_P4_TC_ESCR1:	6, 7
 	 */
 
-#define P4_BPU_FETCH_REQUEST		P4_EVENT_PACK(0x03, 0x00)
+	P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST)		= P4_OPCODE_PACK(0x03, 0x00),
 	/*
 	 * MSR_P4_BPU_ESCR0:	0, 1
 	 * MSR_P4_BPU_ESCR1:	2, 3
 	 */
 
-#define P4_ITLB_REFERENCE		P4_EVENT_PACK(0x18, 0x03)
+	P4_OPCODE(P4_EVENT_ITLB_REFERENCE)		= P4_OPCODE_PACK(0x18, 0x03),
 	/*
 	 * MSR_P4_ITLB_ESCR0:	0, 1
 	 * MSR_P4_ITLB_ESCR1:	2, 3
 	 */
 
-#define P4_MEMORY_CANCEL		P4_EVENT_PACK(0x02, 0x05)
+	P4_OPCODE(P4_EVENT_MEMORY_CANCEL)		= P4_OPCODE_PACK(0x02, 0x05),
 	/*
 	 * MSR_P4_DAC_ESCR0:	8, 9
 	 * MSR_P4_DAC_ESCR1:	10, 11
 	 */
 
-#define P4_MEMORY_COMPLETE		P4_EVENT_PACK(0x08, 0x02)
+	P4_OPCODE(P4_EVENT_MEMORY_COMPLETE)		= P4_OPCODE_PACK(0x08, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_LOAD_PORT_REPLAY		P4_EVENT_PACK(0x04, 0x02)
+	P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY)		= P4_OPCODE_PACK(0x04, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_STORE_PORT_REPLAY		P4_EVENT_PACK(0x05, 0x02)
+	P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY)		= P4_OPCODE_PACK(0x05, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_MOB_LOAD_REPLAY		P4_EVENT_PACK(0x03, 0x02)
+	P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY)		= P4_OPCODE_PACK(0x03, 0x02),
 	/*
 	 * MSR_P4_MOB_ESCR0:	0, 1
 	 * MSR_P4_MOB_ESCR1:	2, 3
 	 */
 
-#define P4_PAGE_WALK_TYPE		P4_EVENT_PACK(0x01, 0x04)
+	P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE)		= P4_OPCODE_PACK(0x01, 0x04),
 	/*
 	 * MSR_P4_PMH_ESCR0:	0, 1
 	 * MSR_P4_PMH_ESCR1:	2, 3
 	 */
 
-#define P4_BSQ_CACHE_REFERENCE		P4_EVENT_PACK(0x0c, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE)		= P4_OPCODE_PACK(0x0c, 0x07),
 	/*
 	 * MSR_P4_BSU_ESCR0:	0, 1
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
-#define P4_IOQ_ALLOCATION		P4_EVENT_PACK(0x03, 0x06)
+	P4_OPCODE(P4_EVENT_IOQ_ALLOCATION)		= P4_OPCODE_PACK(0x03, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_IOQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x1a, 0x06)
+	P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES)		= P4_OPCODE_PACK(0x1a, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_FSB_DATA_ACTIVITY		P4_EVENT_PACK(0x17, 0x06)
+	P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY)		= P4_OPCODE_PACK(0x17, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_BSQ_ALLOCATION		P4_EVENT_PACK(0x05, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_ALLOCATION)		= P4_OPCODE_PACK(0x05, 0x07),
 	/*
 	 * MSR_P4_BSU_ESCR0:	0, 1
 	 */
 
-#define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES)		= P4_OPCODE_PACK(0x06, 0x07),
 	/*
 	 * NOTE: no ESCR name in docs, it's guessed
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
-#define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
+	P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST)		= P4_OPCODE_PACK(0x34, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
+	P4_OPCODE(P4_EVENT_PACKED_SP_UOP)		= P4_OPCODE_PACK(0x08, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_PACKED_DP_UOP		P4_EVENT_PACK(0x0c, 0x01)
+	P4_OPCODE(P4_EVENT_PACKED_DP_UOP)		= P4_OPCODE_PACK(0x0c, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_SCALAR_SP_UOP		P4_EVENT_PACK(0x0a, 0x01)
+	P4_OPCODE(P4_EVENT_SCALAR_SP_UOP)		= P4_OPCODE_PACK(0x0a, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_SCALAR_DP_UOP		P4_EVENT_PACK(0x0e, 0x01)
+	P4_OPCODE(P4_EVENT_SCALAR_DP_UOP)		= P4_OPCODE_PACK(0x0e, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_64BIT_MMX_UOP		P4_EVENT_PACK(0x02, 0x01)
+	P4_OPCODE(P4_EVENT_64BIT_MMX_UOP)		= P4_OPCODE_PACK(0x02, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_128BIT_MMX_UOP		P4_EVENT_PACK(0x1a, 0x01)
+	P4_OPCODE(P4_EVENT_128BIT_MMX_UOP)		= P4_OPCODE_PACK(0x1a, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_X87_FP_UOP			P4_EVENT_PACK(0x04, 0x01)
+	P4_OPCODE(P4_EVENT_X87_FP_UOP)			= P4_OPCODE_PACK(0x04, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_TC_MISC			P4_EVENT_PACK(0x06, 0x01)
+	P4_OPCODE(P4_EVENT_TC_MISC)			= P4_OPCODE_PACK(0x06, 0x01),
 	/*
 	 * MSR_P4_TC_ESCR0:	4, 5
 	 * MSR_P4_TC_ESCR1:	6, 7
 	 */
 
-#define P4_GLOBAL_POWER_EVENTS		P4_EVENT_PACK(0x13, 0x06)
+	P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS)		= P4_OPCODE_PACK(0x13, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_TC_MS_XFER			P4_EVENT_PACK(0x05, 0x00)
+	P4_OPCODE(P4_EVENT_TC_MS_XFER)			= P4_OPCODE_PACK(0x05, 0x00),
 	/*
 	 * MSR_P4_MS_ESCR0:	4, 5
 	 * MSR_P4_MS_ESCR1:	6, 7
 	 */
 
-#define P4_UOP_QUEUE_WRITES		P4_EVENT_PACK(0x09, 0x00)
+	P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES)		= P4_OPCODE_PACK(0x09, 0x00),
 	/*
 	 * MSR_P4_MS_ESCR0:	4, 5
 	 * MSR_P4_MS_ESCR1:	6, 7
 	 */
 
-#define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
+	P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE)	= P4_OPCODE_PACK(0x05, 0x02),
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
 	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
-#define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
+	P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE)		= P4_OPCODE_PACK(0x04, 0x02),
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
 	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
-#define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
+	P4_OPCODE(P4_EVENT_RESOURCE_STALL)		= P4_OPCODE_PACK(0x01, 0x01),
 	/*
 	 * MSR_P4_ALF_ESCR0:	12, 13, 16
 	 * MSR_P4_ALF_ESCR1:	14, 15, 17
 	 */
 
-#define P4_WC_BUFFER			P4_EVENT_PACK(0x05, 0x05)
+	P4_OPCODE(P4_EVENT_WC_BUFFER)			= P4_OPCODE_PACK(0x05, 0x05),
 	/*
 	 * MSR_P4_DAC_ESCR0:	8, 9
 	 * MSR_P4_DAC_ESCR1:	10, 11
 	 */
 
-#define P4_B2B_CYCLES			P4_EVENT_PACK(0x16, 0x03)
+	P4_OPCODE(P4_EVENT_B2B_CYCLES)			= P4_OPCODE_PACK(0x16, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_BNR				P4_EVENT_PACK(0x08, 0x03)
+	P4_OPCODE(P4_EVENT_BNR)				= P4_OPCODE_PACK(0x08, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_SNOOP			P4_EVENT_PACK(0x06, 0x03)
+	P4_OPCODE(P4_EVENT_SNOOP)			= P4_OPCODE_PACK(0x06, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_RESPONSE			P4_EVENT_PACK(0x04, 0x03)
+	P4_OPCODE(P4_EVENT_RESPONSE)			= P4_OPCODE_PACK(0x04, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_FRONT_END_EVENT		P4_EVENT_PACK(0x08, 0x05)
+	P4_OPCODE(P4_EVENT_FRONT_END_EVENT)		= P4_OPCODE_PACK(0x08, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_EXECUTION_EVENT		P4_EVENT_PACK(0x0c, 0x05)
+	P4_OPCODE(P4_EVENT_EXECUTION_EVENT)		= P4_OPCODE_PACK(0x0c, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_REPLAY_EVENT			P4_EVENT_PACK(0x09, 0x05)
+	P4_OPCODE(P4_EVENT_REPLAY_EVENT)		= P4_OPCODE_PACK(0x09, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
+	P4_OPCODE(P4_EVENT_INSTR_RETIRED)		= P4_OPCODE_PACK(0x02, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
+	P4_OPCODE(P4_EVENT_UOPS_RETIRED)		= P4_OPCODE_PACK(0x01, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
+	P4_OPCODE(P4_EVENT_UOP_TYPE)			= P4_OPCODE_PACK(0x02, 0x02),
 	/*
 	 * MSR_P4_RAT_ESCR0:	12, 13, 16
 	 * MSR_P4_RAT_ESCR1:	14, 15, 17
 	 */
 
-#define P4_BRANCH_RETIRED		P4_EVENT_PACK(0x06, 0x05)
+	P4_OPCODE(P4_EVENT_BRANCH_RETIRED)		= P4_OPCODE_PACK(0x06, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_MISPRED_BRANCH_RETIRED	P4_EVENT_PACK(0x03, 0x04)
+	P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED)	= P4_OPCODE_PACK(0x03, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_X87_ASSIST			P4_EVENT_PACK(0x03, 0x05)
+	P4_OPCODE(P4_EVENT_X87_ASSIST)			= P4_OPCODE_PACK(0x03, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_MACHINE_CLEAR		P4_EVENT_PACK(0x02, 0x05)
+	P4_OPCODE(P4_EVENT_MACHINE_CLEAR)		= P4_OPCODE_PACK(0x02, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_INSTR_COMPLETED		P4_EVENT_PACK(0x07, 0x04)
+	P4_OPCODE(P4_EVENT_INSTR_COMPLETED)		= P4_OPCODE_PACK(0x07, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
+};
 
 /*
- * a caller should use P4_EVENT_ATTR helper to
- * pick the attribute needed, for example
+ * a caller should use P4_ESCR_EMASK_NAME helper to
+ * pick the EventMask needed, for example
  *
- *	P4_EVENT_ATTR(P4_TC_DELIVER_MODE, DD)
+ *	P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
  */
-enum P4_EVENTS_ATTR {
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DD, 0),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DB, 1),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DI, 2),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BD, 3),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BB, 4),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BI, 5),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, ID, 6),
-
-	P4_MAKE_EVENT_ATTR(P4_BPU_FETCH_REQUEST, TCMISS, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT, 0),
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, MISS, 1),
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT_UK, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, ST_RB_FULL, 2),
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, 64K_CONF, 3),
-
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, LSC, 0),
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, SSC, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_LOAD_PORT_REPLAY, SPLIT_LD, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_STORE_PORT_REPLAY, SPLIT_ST, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STA, 1),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STD, 3),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, DTMISS, 0),
-	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, ITMISS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
-
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, DEFAULT, 0),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_READ, 5),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_WRITE, 6),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_UC, 7),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WC, 8),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WT, 9),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WP, 10),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WB, 11),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OWN, 13),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OTHER, 14),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, PREFETCH, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OWN, 13),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OTHER, 14),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE0, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE1, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN0, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN1, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE0, 11),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE1, 12),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE2, 13),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
-
-	P4_MAKE_EVENT_ATTR(P4_SSE_INPUT_ASSIST, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_PACKED_SP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_PACKED_DP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_SCALAR_SP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_SCALAR_DP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_64BIT_MMX_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_128BIT_MMX_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_X87_FP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_TC_MISC, FLUSH, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_TC_MS_XFER, CISC, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_ROM, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL, 2),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN, 3),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_RESOURCE_STALL, SBFULL, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_EVICTS, 0),
-	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_FULL_EVICTS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, BOGUS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS0, 0),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS1, 1),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS2, 2),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS3, 3),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS0, 4),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS1, 5),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS2, 6),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS3, 7),
-
-	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, BOGUS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG, 0),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG, 1),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG, 2),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG, 3),
-
-	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, BOGUS, 1),
+enum P4_ESCR_EMASKS {
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
 
-	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS, 1),
-	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNP, 0),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNM, 1),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTP, 2),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTM, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
 
-	P4_MAKE_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
 
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSU, 0),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSO, 1),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAO, 2),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAU, 3),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, PREA, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
 
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, CLEAR, 0),
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, MOCLEAR, 1),
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, SMCLEAR, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
 
-	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
 };
 
-enum {
-	KEY_P4_L1D_OP_READ_RESULT_MISS = PERF_COUNT_HW_MAX,
-	KEY_P4_LL_OP_READ_RESULT_MISS,
-	KEY_P4_DTLB_OP_READ_RESULT_MISS,
-	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-	KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-	KEY_P4_ITLB_OP_READ_RESULT_MISS,
-	KEY_P4_UOP_TYPE,
+/* P4 PEBS: stale for a while */
+#define P4_PEBS_METRIC_MASK	0x00001fffU
+#define P4_PEBS_UOB_TAG		0x01000000U
+#define P4_PEBS_ENABLE		0x02000000U
+
+/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
+#define P4_PEBS__1stl_cache_load_miss_retired	0x3000001
+#define P4_PEBS__2ndl_cache_load_miss_retired	0x3000002
+#define P4_PEBS__dtlb_load_miss_retired		0x3000004
+#define P4_PEBS__dtlb_store_miss_retired	0x3000004
+#define P4_PEBS__dtlb_all_miss_retired		0x3000004
+#define P4_PEBS__tagged_mispred_branch		0x3018000
+#define P4_PEBS__mob_load_replay_retired	0x3000200
+#define P4_PEBS__split_load_retired		0x3000400
+#define P4_PEBS__split_store_retired		0x3000400
+
+#define P4_VERT__1stl_cache_load_miss_retired	0x0000001
+#define P4_VERT__2ndl_cache_load_miss_retired	0x0000001
+#define P4_VERT__dtlb_load_miss_retired		0x0000001
+#define P4_VERT__dtlb_store_miss_retired	0x0000002
+#define P4_VERT__dtlb_all_miss_retired		0x0000003
+#define P4_VERT__tagged_mispred_branch		0x0000010
+#define P4_VERT__mob_load_replay_retired	0x0000001
+#define P4_VERT__split_load_retired		0x0000001
+#define P4_VERT__split_store_retired		0x0000002
+
+enum P4_CACHE_EVENTS {
+	P4_CACHE__NONE,
+
+	P4_CACHE__1stl_cache_load_miss_retired,
+	P4_CACHE__2ndl_cache_load_miss_retired,
+	P4_CACHE__dtlb_load_miss_retired,
+	P4_CACHE__dtlb_store_miss_retired,
+	P4_CACHE__itlb_reference_hit,
+	P4_CACHE__itlb_reference_miss,
+
+	P4_CACHE__MAX
 };
 
 #endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b8a811ab760..f8fe069f14e 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -11,35 +11,281 @@
 
 #include <asm/perf_event_p4.h>
 
+#define P4_CNTR_LIMIT 3
 /*
  * array indices: 0,1 - HT threads, used with HT enabled cpu
  */
-struct p4_event_template {
-	u32 opcode;			/* ESCR event + CCCR selector */
-	u64 config;			/* packed predefined bits */
-	int dep;			/* upstream dependency event index */
-	int key;			/* index into p4_templates */
-	u64 msr;			/*
-					 * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
-					 * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
-					 * for cache events
-					 */
-	unsigned int emask;		/* ESCR EventMask */
-	unsigned int escr_msr[2];	/* ESCR MSR for this event */
-	unsigned int cntr[2];		/* counter index (offset) */
+struct p4_event_bind {
+	unsigned int opcode;			/* Event code and ESCR selector */
+	unsigned int escr_msr[2];		/* ESCR MSR for this event */
+	unsigned char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on abscence */
 };
 
-struct p4_pmu_res {
-	/* maps hw_conf::idx into template for ESCR sake */
-	struct p4_event_template *tpl[ARCH_P4_MAX_CCCR];
+struct p4_cache_event_bind {
+	unsigned int metric_pebs;
+	unsigned int metric_vert;
 };
 
-static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+#define P4_GEN_CACHE_EVENT_BIND(name)		\
+	[P4_CACHE__##name] = {			\
+		.metric_pebs = P4_PEBS__##name,	\
+		.metric_vert = P4_VERT__##name,	\
+	}
+
+static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
+	P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+	[P4_EVENT_TC_DELIVER_MODE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_BPU_FETCH_REQUEST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+		.escr_msr	= { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_ITLB_REFERENCE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_MEMORY_CANCEL] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_MEMORY_COMPLETE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_LOAD_PORT_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_STORE_PORT_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_MOB_LOAD_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+		.escr_msr	= { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_PAGE_WALK_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+		.escr_msr	= { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BSQ_CACHE_REFERENCE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_IOQ_ALLOCATION] = {
+		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_IOQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
+		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+		.escr_msr	= { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+		.cntr		= { {2, -1, -1}, {3, -1, -1} },
+	},
+	[P4_EVENT_FSB_DATA_ACTIVITY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BSQ_ALLOCATION] = {		/* shared ESCR, broken CCCR1 */
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+		.cntr		= { {0, -1, -1}, {1, -1, -1} },
+	},
+	[P4_EVENT_BSQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+		.escr_msr	= { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+		.cntr		= { {2, -1, -1}, {3, -1, -1} },
+	},
+	[P4_EVENT_SSE_INPUT_ASSIST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_PACKED_SP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_PACKED_DP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_SCALAR_SP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_SCALAR_DP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_64BIT_MMX_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_128BIT_MMX_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_X87_FP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_X87_FP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_TC_MISC] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_MISC),
+		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_GLOBAL_POWER_EVENTS] = {
+		.opcode		= P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_TC_MS_XFER] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_MS_XFER),
+		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_UOP_QUEUE_WRITES] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RETIRED_BRANCH_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RESOURCE_STALL] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+		.escr_msr	= { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_WC_BUFFER] = {
+		.opcode		= P4_OPCODE(P4_EVENT_WC_BUFFER),
+		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_B2B_CYCLES] = {
+		.opcode		= P4_OPCODE(P4_EVENT_B2B_CYCLES),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BNR] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BNR),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_SNOOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SNOOP),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_RESPONSE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RESPONSE),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_FRONT_END_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_EXECUTION_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_REPLAY_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_INSTR_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_UOPS_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_UOP_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOP_TYPE),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_BRANCH_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_X87_ASSIST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_X87_ASSIST),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_MACHINE_CLEAR] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_INSTR_COMPLETED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+};
 
-#define P4_CACHE_EVENT_CONFIG(event, bit) \
-	p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
-	p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
-	p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+#define P4_GEN_CACHE_EVENT(event, bit, cache_event)			  \
+	p4_config_pack_escr(P4_ESCR_EVENT(event)			| \
+			    P4_ESCR_EMASK_BIT(event, bit))		| \
+	p4_config_pack_cccr(cache_event					| \
+			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
 
 static __initconst u64 p4_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
@@ -49,42 +295,35 @@ static __initconst u64 p4_hw_cache_event_ids
  [ C(L1D ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* 1stL_cache_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_L1D_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__1stl_cache_load_miss_retired),
 	},
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* 2ndL_cache_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_LL_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__2ndl_cache_load_miss_retired),
 	},
- },
+},
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* DTLB_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_DTLB_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__dtlb_load_miss_retired),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* DTLB_store_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__dtlb_store_miss_retired),
 	},
  },
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
-					/* ITLB_reference.HIT */
-		[ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
-					| KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-
-					/* ITLB_reference.MISS */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
-					| KEY_P4_ITLB_OP_READ_RESULT_MISS,
+		[ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+						P4_CACHE__itlb_reference_hit),
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+						P4_CACHE__itlb_reference_miss),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -97,219 +336,89 @@ static __initconst u64 p4_hw_cache_event_ids
  },
 };
 
-/*
- * WARN: CCCR1 doesn't have a working enable bit so try to not
- * use it if possible
- *
- * Also as only we start to support raw events we will need to
- * append _all_ P4_EVENT_PACK'ed events here
- */
-struct p4_event_template p4_templates[] = {
-	[0] = {
-		.opcode	= P4_GLOBAL_POWER_EVENTS,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 0,
-		.emask	=
-			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
-		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[1] = {
-		.opcode	= P4_INSTR_RETIRED,
-		.config	= 0,
-		.dep	= -1, /* needs front-end tagging */
-		.key	= 1,
-		.emask	=
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
-		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-		.cntr		= { 12, 14 },
-	},
-	[2] = {
-		.opcode	= P4_BSQ_CACHE_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 2,
-		.emask	=
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM),
-		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[3] = {
-		.opcode	= P4_BSQ_CACHE_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 3,
-		.emask	=
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
-		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-		.cntr		= { 0, 3 },
-	},
-	[4] = {
-		.opcode	= P4_RETIRED_BRANCH_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 4,
-		.emask	=
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN)		|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT),
-		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
-		.cntr		= { 4, 6 },
-	},
-	[5] = {
-		.opcode	= P4_MISPRED_BRANCH_RETIRED,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 5,
-		.emask	=
-			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-		.cntr		= { 12, 14 },
-	},
-	[6] = {
-		.opcode	= P4_FSB_DATA_ACTIVITY,
-		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-		.dep	= -1,
-		.key	= 6,
-		.emask	=
-			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
-			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
-		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_L1D_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_L1D_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_LL_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_LL_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_DTLB_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
-		.key	= KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
-		.opcode	= P4_ITLB_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= 0,
-		.key	= KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-		.emask	=
-			P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
-		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_ITLB_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= 0,
-		.key	= KEY_P4_ITLB_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
-		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_UOP_TYPE] = {
-		.opcode	= P4_UOP_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= KEY_P4_UOP_TYPE,
-		.emask	=
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
-		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-		.cntr		= { 16, 17 },
-	},
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+  /* non-halted CPU clocks */
+  [PERF_COUNT_HW_CPU_CYCLES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+
+  /*
+   * retired instructions
+   * in a sake of simplicity we don't use the FSB tagging
+   */
+  [PERF_COUNT_HW_INSTRUCTIONS] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+  /* cache hits */
+  [PERF_COUNT_HW_CACHE_REFERENCES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+  /* cache misses */
+  [PERF_COUNT_HW_CACHE_MISSES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+  /* branch instructions retired */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+  /* mispredicted branches retired */
+  [PERF_COUNT_HW_BRANCH_MISSES]	=
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+  [PERF_COUNT_HW_BUS_CYCLES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))	|
+	p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
 };
 
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+	unsigned int evnt = p4_config_unpack_event(config);
+	struct p4_event_bind *bind = NULL;
+
+	if (evnt < ARRAY_SIZE(p4_event_bind_map))
+		bind = &p4_event_bind_map[evnt];
+
+	return bind;
+}
+
 static u64 p4_pmu_event_map(int hw_event)
 {
-	struct p4_event_template *tpl;
+	struct p4_event_bind *bind;
+	unsigned int esel;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_templates)) {
-		printk_once(KERN_ERR "PMU: Incorrect event index\n");
+	if (hw_event > ARRAY_SIZE(p4_general_events)) {
+		printk_once(KERN_ERR "P4 PMU: Bad index: %i\n", hw_event);
 		return 0;
 	}
-	tpl = &p4_templates[hw_event];
 
-	/*
-	 * fill config up according to
-	 * a predefined event template
-	 */
-	config  = tpl->config;
-	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
-	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
-	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
-	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
+	config = p4_general_events[hw_event];
+	bind = p4_config_get_bind(config);
+	esel = P4_OPCODE_ESEL(bind->opcode);
+	config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
 
 	return config;
 }
 
-/*
- * Note that we still have 5 events (from global events SDM list)
- * intersected in opcode+emask bits so we will need another
- * scheme there do distinguish templates.
- */
-static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
-{
-	return dst & src;
-}
-
-static struct p4_event_template *p4_pmu_template_lookup(u64 config)
-{
-	int key = p4_config_unpack_key(config);
-
-	if (key < ARRAY_SIZE(p4_templates))
-		return &p4_templates[key];
-	else
-		return NULL;
-}
-
 /*
  * We don't control raw events so it's up to the caller
  * to pass sane values (and we don't count the thread number
@@ -319,13 +428,14 @@ static struct p4_event_template *p4_pmu_template_lookup(u64 config)
 static u64 p4_pmu_raw_event(u64 hw_event)
 {
 	return hw_event &
-		(p4_config_pack_escr(P4_EVNTSEL_MASK_HT) |
+		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
 		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 }
 
 static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 {
 	int cpu = raw_smp_processor_id();
+	u32 escr, cccr;
 
 	/*
 	 * the reason we use cpu that early is that: if we get scheduled
@@ -333,13 +443,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	 * specific flags in config (and will save some cpu cycles)
 	 */
 
-	/* CCCR by default */
-	hwc->config = p4_config_pack_cccr(p4_default_cccr_conf(cpu));
+	cccr = p4_default_cccr_conf(cpu);
+	escr = p4_default_escr_conf(cpu, attr->exclude_kernel, attr->exclude_user);
+	hwc->config = p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
 
-	/* Count user and OS events unless not requested to */
-	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
-								attr->exclude_user));
-	/* on HT machine we need a special bit */
 	if (p4_ht_active() && p4_ht_thread(cpu))
 		hwc->config = p4_set_ht_bit(hwc->config);
 
@@ -368,7 +475,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
 	 */
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
 		(u64)(p4_config_unpack_cccr(hwc->config)) &
-			~P4_CCCR_ENABLE & ~P4_CCCR_OVF);
+			~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
 }
 
 static void p4_pmu_disable_all(void)
@@ -389,27 +496,14 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int thread = p4_ht_config_thread(hwc->config);
 	u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
-	u64 escr_base;
-	struct p4_event_template *tpl;
-	struct p4_pmu_res *c;
+	unsigned int idx = p4_config_unpack_event(hwc->config);
+	unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
+	struct p4_event_bind *bind;
+	struct p4_cache_event_bind *bind_cache;
+	u64 escr_addr, cccr;
 
-	/*
-	 * some preparation work from per-cpu private fields
-	 * since we need to find out which ESCR to use
-	 */
-	c = &__get_cpu_var(p4_pmu_config);
-	tpl = c->tpl[hwc->idx];
-	if (!tpl) {
-		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
-		return;
-	}
-
-	if (tpl->msr) {
-		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
-		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
-	}
-
-	escr_base = (u64)tpl->escr_msr[thread];
+	bind = &p4_event_bind_map[idx];
+	escr_addr = (u64)bind->escr_msr[thread];
 
 	/*
 	 * - we dont support cascaded counters yet
@@ -418,9 +512,27 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
 	WARN_ON_ONCE(hwc->idx == 1);
 
-	(void)checking_wrmsrl(escr_base, escr_conf);
+	/* we need a real Event value */
+	escr_conf &= ~P4_ESCR_EVENT_MASK;
+	escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+	cccr = p4_config_unpack_cccr(hwc->config);
+
+	/*
+	 * it could be Cache event so that we need to
+	 * set metrics into additional MSRs
+	 */
+	BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
+	if (idx_cache > P4_CACHE__NONE &&
+		idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
+		bind_cache = &p4_cache_event_bind_map[idx_cache];
+		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
+		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
+	}
+
+	(void)checking_wrmsrl(escr_addr, escr_conf);
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
-		(u64)(p4_config_unpack_cccr(hwc->config)) | P4_CCCR_ENABLE);
+				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
 static void p4_pmu_enable_all(void)
@@ -516,13 +628,13 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	if (p4_ht_thread(cpu)) {
 		cccr &= ~P4_CCCR_OVF_PMI_T0;
 		cccr |= P4_CCCR_OVF_PMI_T1;
-		if (escr & P4_EVNTSEL_T0_OS) {
-			escr &= ~P4_EVNTSEL_T0_OS;
-			escr |= P4_EVNTSEL_T1_OS;
+		if (escr & P4_ESCR_T0_OS) {
+			escr &= ~P4_ESCR_T0_OS;
+			escr |= P4_ESCR_T1_OS;
 		}
-		if (escr & P4_EVNTSEL_T0_USR) {
-			escr &= ~P4_EVNTSEL_T0_USR;
-			escr |= P4_EVNTSEL_T1_USR;
+		if (escr & P4_ESCR_T0_USR) {
+			escr &= ~P4_ESCR_T0_USR;
+			escr |= P4_ESCR_T1_USR;
 		}
 		hwc->config  = p4_config_pack_escr(escr);
 		hwc->config |= p4_config_pack_cccr(cccr);
@@ -530,13 +642,13 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	} else {
 		cccr &= ~P4_CCCR_OVF_PMI_T1;
 		cccr |= P4_CCCR_OVF_PMI_T0;
-		if (escr & P4_EVNTSEL_T1_OS) {
-			escr &= ~P4_EVNTSEL_T1_OS;
-			escr |= P4_EVNTSEL_T0_OS;
+		if (escr & P4_ESCR_T1_OS) {
+			escr &= ~P4_ESCR_T1_OS;
+			escr |= P4_ESCR_T0_OS;
 		}
-		if (escr & P4_EVNTSEL_T1_USR) {
-			escr &= ~P4_EVNTSEL_T1_USR;
-			escr |= P4_EVNTSEL_T0_USR;
+		if (escr & P4_ESCR_T1_USR) {
+			escr &= ~P4_ESCR_T1_USR;
+			escr |= P4_ESCR_T0_USR;
 		}
 		hwc->config  = p4_config_pack_escr(escr);
 		hwc->config |= p4_config_pack_cccr(cccr);
@@ -606,66 +718,56 @@ static int p4_get_escr_idx(unsigned int addr)
 	return -1;
 }
 
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+			struct p4_event_bind *bind)
+{
+	int i = 0, j;
+
+	for (i = 0; i < P4_CNTR_LIMIT; i++) {
+		j = bind->cntr[thread][i++];
+		if (j == -1 || !test_bit(j, used_mask))
+			return j;
+	}
+
+	return -1;
+}
+
 static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
-
-	struct hw_perf_event *hwc;
-	struct p4_event_template *tpl;
-	struct p4_pmu_res *c;
 	int cpu = raw_smp_processor_id();
-	int escr_idx, thread, i, num;
+	struct hw_perf_event *hwc;
+	struct p4_event_bind *bind;
+	unsigned int i, thread, num;
+	int cntr_idx, escr_idx;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
 
-	c = &__get_cpu_var(p4_pmu_config);
-	/*
-	 * Firstly find out which resource events are going
-	 * to use, if ESCR+CCCR tuple is already borrowed
-	 * then get out of here
-	 */
 	for (i = 0, num = n; i < n; i++, num--) {
+
 		hwc = &cpuc->event_list[i]->hw;
-		tpl = p4_pmu_template_lookup(hwc->config);
-		if (!tpl)
-			goto done;
 		thread = p4_ht_thread(cpu);
-		escr_idx = p4_get_escr_idx(tpl->escr_msr[thread]);
-		if (escr_idx == -1)
-			goto done;
+		bind = p4_config_get_bind(hwc->config);
+		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
 
-		/* already allocated and remains on the same cpu */
 		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+			cntr_idx = hwc->idx;
 			if (assign)
 				assign[i] = hwc->idx;
-			/* upstream dependent event */
-			if (unlikely(tpl->dep != -1))
-				printk_once(KERN_WARNING "PMU: Dep events are "
-					"not implemented yet\n");
 			goto reserve;
 		}
 
-		/* it may be already borrowed */
-		if (test_bit(tpl->cntr[thread], used_mask) ||
-			test_bit(escr_idx, escr_mask))
+		cntr_idx = p4_next_cntr(thread, used_mask, bind);
+		if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
 			goto done;
 
-		/*
-		 * ESCR+CCCR+COUNTERs are available to use lets swap
-		 * thread specific bits, push assigned bits
-		 * back and save template into per-cpu
-		 * area (which will allow us to find out the ESCR
-		 * to be used at moment of "enable event via real MSR")
-		 */
 		p4_pmu_swap_config_ts(hwc, cpu);
-		if (assign) {
-			assign[i] = tpl->cntr[thread];
-			c->tpl[assign[i]] = tpl;
-		}
+		if (assign)
+			assign[i] = cntr_idx;
 reserve:
-		set_bit(tpl->cntr[thread], used_mask);
+		set_bit(cntr_idx, used_mask);
 		set_bit(escr_idx, escr_mask);
 	}
 
@@ -684,7 +786,7 @@ static __initconst struct x86_pmu p4_pmu = {
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
 	.raw_event		= p4_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(p4_templates),
+	.max_events		= ARRAY_SIZE(p4_general_events),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
 	 * IF HT disabled we may need to use all
@@ -716,7 +818,7 @@ static __init int p4_pmu_init(void)
 	}
 
 	memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
-	       sizeof(hw_cache_event_ids));
+		sizeof(hw_cache_event_ids));
 
 	pr_cont("Netburst events, ");
 
-- 
cgit v1.2.3-70-g09d2


From 7c5ecaf7666617889f337296c610815b519abfa9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Mar 2010 14:51:49 +0100
Subject: perf, x86: Clean up debugctlmsr bit definitions

Move all debugctlmsr thingies into msr-index.h

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100325135413.861425293@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h           | 13 ++++++++-----
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 23 +++++++----------------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  7 ++-----
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index aef562c0a64..06e4cf0d384 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -71,11 +71,14 @@
 #define MSR_IA32_LASTINTTOIP		0x000001de
 
 /* DEBUGCTLMSR bits (others vary by model): */
-#define _DEBUGCTLMSR_LBR	0 /* last branch recording */
-#define _DEBUGCTLMSR_BTF	1 /* single-step on branches */
-
-#define DEBUGCTLMSR_LBR		(1UL << _DEBUGCTLMSR_LBR)
-#define DEBUGCTLMSR_BTF		(1UL << _DEBUGCTLMSR_BTF)
+#define DEBUGCTLMSR_LBR			(1UL <<  0) /* last branch recording */
+#define DEBUGCTLMSR_BTF			(1UL <<  1) /* single-step on branches */
+#define DEBUGCTLMSR_TR			(1UL <<  6)
+#define DEBUGCTLMSR_BTS			(1UL <<  7)
+#define DEBUGCTLMSR_BTINT		(1UL <<  8)
+#define DEBUGCTLMSR_BTS_OFF_OS		(1UL <<  9)
+#define DEBUGCTLMSR_BTS_OFF_USR		(1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1UL << 11)
 
 #define MSR_IA32_MC0_CTL		0x00000400
 #define MSR_IA32_MC0_STATUS		0x00000401
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index c59678a14a2..2fea3622af7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -37,15 +37,6 @@ struct pebs_record_nhm {
 	u64 status, dla, dse, lat;
 };
 
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR			(1 << 6)
-#define X86_DEBUGCTL_BTS		(1 << 7)
-#define X86_DEBUGCTL_BTINT		(1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
-
 /*
  * A debug store configuration.
  *
@@ -193,15 +184,15 @@ static void intel_pmu_enable_bts(u64 config)
 
 	debugctlmsr = get_debugctlmsr();
 
-	debugctlmsr |= X86_DEBUGCTL_TR;
-	debugctlmsr |= X86_DEBUGCTL_BTS;
-	debugctlmsr |= X86_DEBUGCTL_BTINT;
+	debugctlmsr |= DEBUGCTLMSR_TR;
+	debugctlmsr |= DEBUGCTLMSR_BTS;
+	debugctlmsr |= DEBUGCTLMSR_BTINT;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
 
 	update_debugctlmsr(debugctlmsr);
 }
@@ -217,8 +208,8 @@ static void intel_pmu_disable_bts(void)
 	debugctlmsr = get_debugctlmsr();
 
 	debugctlmsr &=
-		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+		~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+		  DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
 
 	update_debugctlmsr(debugctlmsr);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index df4c98e26c5..d202c1bece1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -12,15 +12,12 @@ enum {
  * otherwise it becomes near impossible to get a reliable stack.
  */
 
-#define X86_DEBUGCTL_LBR               		(1 << 0)
-#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI		(1 << 11)
-
 static void __intel_pmu_lbr_enable(void)
 {
 	u64 debugctl;
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl |= (X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
@@ -29,7 +26,7 @@ static void __intel_pmu_lbr_disable(void)
 	u64 debugctl;
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl &= ~(X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
-- 
cgit v1.2.3-70-g09d2


From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Mar 2010 14:51:50 +0100
Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code

Support for the PMU's BTS features has been upstreamed in
v2.6.32, but we still have the old and disabled ptrace-BTS,
as Linus noticed it not so long ago.

It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without
regard for other uses (perf) and doesn't provide the flexibility
needed for perf either.

Its users are ptrace-block-step and ptrace-bts, since ptrace-bts
was never used and ptrace-block-step can be implemented using a
much simpler approach.

So axe all 3000 lines of it. That includes the *locked_memory*()
APIs in mm/mlock.c as well.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Markus Metzger <markus.t.metzger@intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20100325135413.938004390@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu               |   20 -
 arch/x86/Kconfig.debug             |    9 -
 arch/x86/include/asm/ds.h          |  302 --------
 arch/x86/include/asm/processor.h   |   33 +-
 arch/x86/include/asm/ptrace-abi.h  |   57 +-
 arch/x86/include/asm/ptrace.h      |    6 -
 arch/x86/include/asm/thread_info.h |    6 +-
 arch/x86/kernel/Makefile           |    2 -
 arch/x86/kernel/cpu/intel.c        |    2 -
 arch/x86/kernel/ds.c               | 1437 ------------------------------------
 arch/x86/kernel/ds_selftest.c      |  408 ----------
 arch/x86/kernel/ds_selftest.h      |   15 -
 arch/x86/kernel/dumpstack.c        |    5 -
 arch/x86/kernel/kprobes.c          |    6 +-
 arch/x86/kernel/process.c          |    9 -
 arch/x86/kernel/process_32.c       |    8 -
 arch/x86/kernel/process_64.c       |    8 -
 arch/x86/kernel/ptrace.c           |  382 ----------
 arch/x86/kernel/step.c             |   36 +-
 arch/x86/kernel/traps.c            |    5 -
 include/linux/ftrace.h             |   12 -
 include/linux/mm.h                 |    4 -
 include/linux/ptrace.h             |   12 -
 include/linux/sched.h              |    9 -
 kernel/fork.c                      |    3 -
 kernel/ptrace.c                    |    1 -
 kernel/sched.c                     |   43 --
 kernel/trace/Kconfig               |   11 -
 kernel/trace/Makefile              |    1 -
 kernel/trace/trace.h               |    4 -
 kernel/trace/trace_entries.h       |   12 -
 kernel/trace/trace_hw_branches.c   |  312 --------
 kernel/trace/trace_selftest.c      |   57 --
 mm/mlock.c                         |   41 -
 34 files changed, 9 insertions(+), 3269 deletions(-)
 delete mode 100644 arch/x86/include/asm/ds.h
 delete mode 100644 arch/x86/kernel/ds.c
 delete mode 100644 arch/x86/kernel/ds_selftest.c
 delete mode 100644 arch/x86/kernel/ds_selftest.h
 delete mode 100644 kernel/trace/trace_hw_branches.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6..918fbb1855c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -502,23 +502,3 @@ config CPU_SUP_UMC_32
 	  CPU might render the kernel unbootable.
 
 	  If unsure, say N.
-
-config X86_DS
-	def_bool X86_PTRACE_BTS
-	depends on X86_DEBUGCTLMSR
-	select HAVE_HW_BRANCH_TRACER
-
-config X86_PTRACE_BTS
-	bool "Branch Trace Store"
-	default y
-	depends on X86_DEBUGCTLMSR
-	depends on BROKEN
-	---help---
-	  This adds a ptrace interface to the hardware's branch trace store.
-
-	  Debuggers may use it to collect an execution trace of the debugged
-	  application in order to answer the question 'how did I get here?'.
-	  Debuggers may trace user mode as well as kernel mode.
-
-	  Say Y unless there is no application development on this machine
-	  and you want to save a small amount of code size.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb..bd58c8abbfb 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -174,15 +174,6 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
-config X86_DS_SELFTEST
-    bool "DS selftest"
-    default y
-    depends on DEBUG_KERNEL
-    depends on X86_DS
-	---help---
-	  Perform Debug Store selftests at boot time.
-	  If in doubt, say "N".
-
 config HAVE_MMIOTRACE_SUPPORT
 	def_bool y
 
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
deleted file mode 100644
index 70dac199b09..00000000000
--- a/arch/x86/include/asm/ds.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Debug Store (DS) support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#ifndef _ASM_X86_DS_H
-#define _ASM_X86_DS_H
-
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/err.h>
-
-
-#ifdef CONFIG_X86_DS
-
-struct task_struct;
-struct ds_context;
-struct ds_tracer;
-struct bts_tracer;
-struct pebs_tracer;
-
-typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
-typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
-
-
-/*
- * A list of features plus corresponding macros to talk about them in
- * the ds_request function's flags parameter.
- *
- * We use the enum to index an array of corresponding control bits;
- * we use the macro to index a flags bit-vector.
- */
-enum ds_feature {
-	dsf_bts = 0,
-	dsf_bts_kernel,
-#define BTS_KERNEL (1 << dsf_bts_kernel)
-	/* trace kernel-mode branches */
-
-	dsf_bts_user,
-#define BTS_USER (1 << dsf_bts_user)
-	/* trace user-mode branches */
-
-	dsf_bts_overflow,
-	dsf_bts_max,
-	dsf_pebs = dsf_bts_max,
-
-	dsf_pebs_max,
-	dsf_ctl_max = dsf_pebs_max,
-	dsf_bts_timestamps = dsf_ctl_max,
-#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
-	/* add timestamps into BTS trace */
-
-#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
-};
-
-
-/*
- * Request BTS or PEBS
- *
- * Due to alignement constraints, the actual buffer may be slightly
- * smaller than the requested or provided buffer.
- *
- * Returns a pointer to a tracer structure on success, or
- * ERR_PTR(errcode) on failure.
- *
- * The interrupt threshold is independent from the overflow callback
- * to allow users to use their own overflow interrupt handling mechanism.
- *
- * The function might sleep.
- *
- * task: the task to request recording for
- * cpu:  the cpu to request recording for
- * base: the base pointer for the (non-pageable) buffer;
- * size: the size of the provided buffer in bytes
- * ovfl: pointer to a function to be called on buffer overflow;
- *       NULL if cyclic buffer requested
- * th: the interrupt threshold in records from the end of the buffer;
- *     -1 if no interrupt threshold is requested.
- * flags: a bit-mask of the above flags
- */
-extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
-					      void *base, size_t size,
-					      bts_ovfl_callback_t ovfl,
-					      size_t th, unsigned int flags);
-extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
-					     bts_ovfl_callback_t ovfl,
-					     size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
-						void *base, size_t size,
-						pebs_ovfl_callback_t ovfl,
-						size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
-					       void *base, size_t size,
-					       pebs_ovfl_callback_t ovfl,
-					       size_t th, unsigned int flags);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Must be called with irq's enabled.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern void ds_release_bts(struct bts_tracer *tracer);
-extern void ds_suspend_bts(struct bts_tracer *tracer);
-extern void ds_resume_bts(struct bts_tracer *tracer);
-extern void ds_release_pebs(struct pebs_tracer *tracer);
-extern void ds_suspend_pebs(struct pebs_tracer *tracer);
-extern void ds_resume_pebs(struct pebs_tracer *tracer);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Cpu tracers must call this on the traced cpu.
- * Task tracers must call ds_release_~_noirq() for themselves.
- *
- * May be called with irq's disabled.
- *
- * Returns 0 if successful;
- * -EPERM if the cpu tracer does not trace the current cpu.
- * -EPERM if the task tracer does not trace itself.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_release_bts_noirq(struct bts_tracer *tracer);
-extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
-extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
-extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
-
-
-/*
- * The raw DS buffer state as it is used for BTS and PEBS recording.
- *
- * This is the low-level, arch-dependent interface for working
- * directly on the raw trace data.
- */
-struct ds_trace {
-	/* the number of bts/pebs records */
-	size_t n;
-	/* the size of a bts/pebs record in bytes */
-	size_t size;
-	/* pointers into the raw buffer:
-	   - to the first entry */
-	void *begin;
-	/* - one beyond the last entry */
-	void *end;
-	/* - one beyond the newest entry */
-	void *top;
-	/* - the interrupt threshold */
-	void *ith;
-	/* flags given on ds_request() */
-	unsigned int flags;
-};
-
-/*
- * An arch-independent view on branch trace data.
- */
-enum bts_qualifier {
-	bts_invalid,
-#define BTS_INVALID bts_invalid
-
-	bts_branch,
-#define BTS_BRANCH bts_branch
-
-	bts_task_arrives,
-#define BTS_TASK_ARRIVES bts_task_arrives
-
-	bts_task_departs,
-#define BTS_TASK_DEPARTS bts_task_departs
-
-	bts_qual_bit_size = 4,
-	bts_qual_max = (1 << bts_qual_bit_size),
-};
-
-struct bts_struct {
-	__u64 qualifier;
-	union {
-		/* BTS_BRANCH */
-		struct {
-			__u64 from;
-			__u64 to;
-		} lbr;
-		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
-		struct {
-			__u64 clock;
-			pid_t pid;
-		} event;
-	} variant;
-};
-
-
-/*
- * The BTS state.
- *
- * This gives access to the raw DS state and adds functions to provide
- * an arch-independent view of the BTS data.
- */
-struct bts_trace {
-	struct ds_trace ds;
-
-	int (*read)(struct bts_tracer *tracer, const void *at,
-		    struct bts_struct *out);
-	int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
-};
-
-
-/*
- * The PEBS state.
- *
- * This gives access to the raw DS state and the PEBS-specific counter
- * reset value.
- */
-struct pebs_trace {
-	struct ds_trace ds;
-
-	/* the number of valid counters in the below array */
-	unsigned int counters;
-
-#define MAX_PEBS_COUNTERS 4
-	/* the counter reset value */
-	unsigned long long counter_reset[MAX_PEBS_COUNTERS];
-};
-
-
-/*
- * Read the BTS or PEBS trace.
- *
- * Returns a view on the trace collected for the parameter tracer.
- *
- * The view remains valid as long as the traced task is not running or
- * the tracer is suspended.
- * Writes into the trace buffer are not reflected.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
-extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
-
-
-/*
- * Reset the write pointer of the BTS/PEBS buffer.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_reset_bts(struct bts_tracer *tracer);
-extern int ds_reset_pebs(struct pebs_tracer *tracer);
-
-/*
- * Set the PEBS counter reset value.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_pebs()
- * counter: the index of the counter
- * value: the new counter reset value
- */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
-			     unsigned int counter, u64 value);
-
-/*
- * Initialization
- */
-struct cpuinfo_x86;
-extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
-
-/*
- * Context switch work
- */
-extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-
-#else /* CONFIG_X86_DS */
-
-struct cpuinfo_x86;
-static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
-static inline void ds_switch_to(struct task_struct *prev,
-				struct task_struct *next) {}
-
-#endif /* CONFIG_X86_DS */
-#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703..5bec21a66dc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,7 +21,6 @@ struct mm_struct;
 #include <asm/msr.h>
 #include <asm/desc_defs.h>
 #include <asm/nops.h>
-#include <asm/ds.h>
 
 #include <linux/personality.h>
 #include <linux/cpumask.h>
@@ -29,6 +28,7 @@ struct mm_struct;
 #include <linux/threads.h>
 #include <linux/math64.h>
 #include <linux/init.h>
+#include <linux/err.h>
 
 #define HBP_NUM 4
 /*
@@ -473,10 +473,6 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
-/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
-	unsigned long	debugctlmsr;
-	/* Debug Store context; see asm/ds.h */
-	struct ds_context	*ds_ctx;
 };
 
 static inline unsigned long native_get_debugreg(int regno)
@@ -814,21 +810,6 @@ static inline unsigned long get_debugctlmsr(void)
     return debugctlmsr;
 }
 
-static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
-{
-	u64 debugctlmsr = 0;
-	u32 val1, val2;
-
-#ifndef CONFIG_X86_DEBUGCTLMSR
-	if (boot_cpu_data.x86 < 6)
-		return 0;
-#endif
-	rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
-	debugctlmsr = val1 | ((u64)val2 << 32);
-
-	return debugctlmsr;
-}
-
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -838,18 +819,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 }
 
-static inline void update_debugctlmsr_on_cpu(int cpu,
-					     unsigned long debugctlmsr)
-{
-#ifndef CONFIG_X86_DEBUGCTLMSR
-	if (boot_cpu_data.x86 < 6)
-		return;
-#endif
-	wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
-		     (u32)((u64)debugctlmsr),
-		     (u32)((u64)debugctlmsr >> 32));
-}
-
 /*
  * from system description table in BIOS. Mostly for MCA use, but
  * others may find it useful:
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 86723035a51..52b098a6eeb 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -82,61 +82,6 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
-
-/* configuration/status structure used in PTRACE_BTS_CONFIG and
-   PTRACE_BTS_STATUS commands.
-*/
-struct ptrace_bts_config {
-	/* requested or actual size of BTS buffer in bytes */
-	__u32 size;
-	/* bitmask of below flags */
-	__u32 flags;
-	/* buffer overflow signal */
-	__u32 signal;
-	/* actual size of bts_struct in bytes */
-	__u32 bts_size;
-};
-#endif /* __ASSEMBLY__ */
-
-#define PTRACE_BTS_O_TRACE	0x1 /* branch trace */
-#define PTRACE_BTS_O_SCHED	0x2 /* scheduling events w/ jiffies */
-#define PTRACE_BTS_O_SIGNAL     0x4 /* send SIG<signal> on buffer overflow
-				       instead of wrapping around */
-#define PTRACE_BTS_O_ALLOC	0x8 /* (re)allocate buffer */
-
-#define PTRACE_BTS_CONFIG	40
-/* Configure branch trace recording.
-   ADDR points to a struct ptrace_bts_config.
-   DATA gives the size of that buffer.
-   A new buffer is allocated, if requested in the flags.
-   An overflow signal may only be requested for new buffers.
-   Returns the number of bytes read.
-*/
-#define PTRACE_BTS_STATUS	41
-/* Return the current configuration in a struct ptrace_bts_config
-   pointed to by ADDR; DATA gives the size of that buffer.
-   Returns the number of bytes written.
-*/
-#define PTRACE_BTS_SIZE		42
-/* Return the number of available BTS records for draining.
-   DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_GET		43
-/* Get a single BTS record.
-   DATA defines the index into the BTS array, where 0 is the newest
-   entry, and higher indices refer to older entries.
-   ADDR is pointing to struct bts_struct (see asm/ds.h).
-*/
-#define PTRACE_BTS_CLEAR	44
-/* Clear the BTS buffer.
-   DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_DRAIN	45
-/* Read all available BTS records and clear the buffer.
-   ADDR points to an array of struct bts_struct.
-   DATA gives the size of that buffer.
-   BTS records are read from oldest to newest.
-   Returns number of BTS records drained.
-*/
+#endif
 
 #endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 69a686a7dff..78cd1ea9450 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -289,12 +289,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void ptrace_bts_untrace(struct task_struct *tsk);
-
-#define arch_ptrace_untrace(tsk)	ptrace_bts_untrace(tsk)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 #endif /* __KERNEL__ */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e96..dc85e12d140 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,8 +92,6 @@ struct thread_info {
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
-#define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
-#define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 
@@ -115,8 +113,6 @@ struct thread_info {
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
-#define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
-#define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 
@@ -147,7 +143,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_NOTSC)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e..e77b2208372 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
-obj-$(CONFIG_X86_DS)		+= ds.o
-obj-$(CONFIG_X86_DS_SELFTEST)		+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af3..d72377c41c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/msr.h>
-#include <asm/ds.h>
 #include <asm/bugs.h>
 #include <asm/cpu.h>
 
@@ -367,7 +366,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_BTS);
 		if (!(l1 & (1<<12)))
 			set_cpu_cap(c, X86_FEATURE_PEBS);
-		ds_init_intel(c);
 	}
 
 	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e..00000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- * Debug Store support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/trace_clock.h>
-
-#include <asm/ds.h>
-
-#include "ds_selftest.h"
-
-/*
- * The configuration for a particular DS hardware implementation:
- */
-struct ds_configuration {
-	/* The name of the configuration: */
-	const char		*name;
-
-	/* The size of pointer-typed fields in DS, BTS, and PEBS: */
-	unsigned char		sizeof_ptr_field;
-
-	/* The size of a BTS/PEBS record in bytes: */
-	unsigned char		sizeof_rec[2];
-
-	/* The number of pebs counter reset values in the DS structure. */
-	unsigned char		nr_counter_reset;
-
-	/* Control bit-masks indexed by enum ds_feature: */
-	unsigned long		ctl[dsf_ctl_max];
-};
-static struct ds_configuration ds_cfg __read_mostly;
-
-
-/* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS		0x80
-
-/* Maximal size of a BTS record: */
-#define MAX_SIZEOF_BTS		(3 * 8)
-
-/* BTS and PEBS buffer alignment: */
-#define DS_ALIGNMENT		(1 << 3)
-
-/* Number of buffer pointers in DS: */
-#define NUM_DS_PTR_FIELDS	8
-
-/* Size of a pebs reset value in DS: */
-#define PEBS_RESET_FIELD_SIZE	8
-
-/* Mask of control bits in the DS MSR register: */
-#define BTS_CONTROL				  \
-	( ds_cfg.ctl[dsf_bts]			| \
-	  ds_cfg.ctl[dsf_bts_kernel]		| \
-	  ds_cfg.ctl[dsf_bts_user]		| \
-	  ds_cfg.ctl[dsf_bts_overflow] )
-
-/*
- * A BTS or PEBS tracer.
- *
- * This holds the configuration of the tracer and serves as a handle
- * to identify tracers.
- */
-struct ds_tracer {
-	/* The DS context (partially) owned by this tracer. */
-	struct ds_context	*context;
-	/* The buffer provided on ds_request() and its size in bytes. */
-	void			*buffer;
-	size_t			size;
-};
-
-struct bts_tracer {
-	/* The common DS part: */
-	struct ds_tracer	ds;
-
-	/* The trace including the DS configuration: */
-	struct bts_trace	trace;
-
-	/* Buffer overflow notification function: */
-	bts_ovfl_callback_t	ovfl;
-
-	/* Active flags affecting trace collection. */
-	unsigned int		flags;
-};
-
-struct pebs_tracer {
-	/* The common DS part: */
-	struct ds_tracer	ds;
-
-	/* The trace including the DS configuration: */
-	struct pebs_trace	trace;
-
-	/* Buffer overflow notification function: */
-	pebs_ovfl_callback_t	ovfl;
-};
-
-/*
- * Debug Store (DS) save area configuration (see Intel64 and IA32
- * Architectures Software Developer's Manual, section 18.5)
- *
- * The DS configuration consists of the following fields; different
- * architetures vary in the size of those fields.
- *
- * - double-word aligned base linear address of the BTS buffer
- * - write pointer into the BTS buffer
- * - end linear address of the BTS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into BTS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - double-word aligned base linear address of the PEBS buffer
- * - write pointer into the PEBS buffer
- * - end linear address of the PEBS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into PEBS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - value to which counter is reset following counter overflow
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
- *
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
- *
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-
-enum ds_field {
-	ds_buffer_base = 0,
-	ds_index,
-	ds_absolute_maximum,
-	ds_interrupt_threshold,
-};
-
-enum ds_qualifier {
-	ds_bts = 0,
-	ds_pebs
-};
-
-static inline unsigned long
-ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
-{
-	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-	return *(unsigned long *)base;
-}
-
-static inline void
-ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
-       unsigned long value)
-{
-	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-	(*(unsigned long *)base) = value;
-}
-
-
-/*
- * Locking is done only for allocating BTS or PEBS resources.
- */
-static DEFINE_SPINLOCK(ds_lock);
-
-/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- *   =0  no tracers
- *   >0  number of per-thread tracers
- *   <0  number of per-cpu tracers
- *
- * Tracers essentially gives the number of ds contexts for a certain
- * type of allocation.
- */
-static atomic_t tracers = ATOMIC_INIT(0);
-
-static inline int get_tracer(struct task_struct *task)
-{
-	int error;
-
-	spin_lock_irq(&ds_lock);
-
-	if (task) {
-		error = -EPERM;
-		if (atomic_read(&tracers) < 0)
-			goto out;
-		atomic_inc(&tracers);
-	} else {
-		error = -EPERM;
-		if (atomic_read(&tracers) > 0)
-			goto out;
-		atomic_dec(&tracers);
-	}
-
-	error = 0;
-out:
-	spin_unlock_irq(&ds_lock);
-	return error;
-}
-
-static inline void put_tracer(struct task_struct *task)
-{
-	if (task)
-		atomic_dec(&tracers);
-	else
-		atomic_inc(&tracers);
-}
-
-/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- *   attached context.
- * - in the latter case, we use a static array of per-cpu context
- *   pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- */
-struct ds_context {
-	/* The DS configuration; goes into MSR_IA32_DS_AREA: */
-	unsigned char		ds[MAX_SIZEOF_DS];
-
-	/* The owner of the BTS and PEBS configuration, respectively: */
-	struct bts_tracer	*bts_master;
-	struct pebs_tracer	*pebs_master;
-
-	/* Use count: */
-	unsigned long		count;
-
-	/* Pointer to the context pointer field: */
-	struct ds_context	**this;
-
-	/* The traced task; NULL for cpu tracing: */
-	struct task_struct	*task;
-
-	/* The traced cpu; only valid if task is NULL: */
-	int			cpu;
-};
-
-static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
-
-
-static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
-{
-	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
-	struct ds_context *context = NULL;
-	struct ds_context *new_context = NULL;
-
-	/* Chances are small that we already have a context. */
-	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
-	if (!new_context)
-		return NULL;
-
-	spin_lock_irq(&ds_lock);
-
-	context = *p_context;
-	if (likely(!context)) {
-		context = new_context;
-
-		context->this = p_context;
-		context->task = task;
-		context->cpu = cpu;
-		context->count = 0;
-
-		*p_context = context;
-	}
-
-	context->count++;
-
-	spin_unlock_irq(&ds_lock);
-
-	if (context != new_context)
-		kfree(new_context);
-
-	return context;
-}
-
-static void ds_put_context(struct ds_context *context)
-{
-	struct task_struct *task;
-	unsigned long irq;
-
-	if (!context)
-		return;
-
-	spin_lock_irqsave(&ds_lock, irq);
-
-	if (--context->count) {
-		spin_unlock_irqrestore(&ds_lock, irq);
-		return;
-	}
-
-	*(context->this) = NULL;
-
-	task = context->task;
-
-	if (task)
-		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-	/*
-	 * We leave the (now dangling) pointer to the DS configuration in
-	 * the DS_AREA msr. This is as good or as bad as replacing it with
-	 * NULL - the hardware would crash if we enabled tracing.
-	 *
-	 * This saves us some problems with having to write an msr on a
-	 * different cpu while preventing others from doing the same for the
-	 * next context for that same cpu.
-	 */
-
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-	/* The context might still be in use for context switching. */
-	if (task && (task != current))
-		wait_task_context_switch(task);
-
-	kfree(context);
-}
-
-static void ds_install_ds_area(struct ds_context *context)
-{
-	unsigned long ds;
-
-	ds = (unsigned long)context->ds;
-
-	/*
-	 * There is a race between the bts master and the pebs master.
-	 *
-	 * The thread/cpu access is synchronized via get/put_cpu() for
-	 * task tracing and via wrmsr_on_cpu for cpu tracing.
-	 *
-	 * If bts and pebs are collected for the same task or same cpu,
-	 * the same confiuration is written twice.
-	 */
-	if (context->task) {
-		get_cpu();
-		if (context->task == current)
-			wrmsrl(MSR_IA32_DS_AREA, ds);
-		set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
-		put_cpu();
-	} else
-		wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
-			     (u32)((u64)ds), (u32)((u64)ds >> 32));
-}
-
-/*
- * Call the tracer's callback on a buffer overflow.
- *
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
-{
-	switch (qual) {
-	case ds_bts:
-		if (context->bts_master &&
-		    context->bts_master->ovfl)
-			context->bts_master->ovfl(context->bts_master);
-		break;
-	case ds_pebs:
-		if (context->pebs_master &&
-		    context->pebs_master->ovfl)
-			context->pebs_master->ovfl(context->pebs_master);
-		break;
-	}
-}
-
-
-/*
- * Write raw data into the BTS or PEBS buffer.
- *
- * The remainder of any partially written record is zeroed out.
- *
- * context: the DS context
- * qual:    the buffer type
- * record:  the data to write
- * size:    the size of the data
- */
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
-		    const void *record, size_t size)
-{
-	int bytes_written = 0;
-
-	if (!record)
-		return -EINVAL;
-
-	while (size) {
-		unsigned long base, index, end, write_end, int_th;
-		unsigned long write_size, adj_write_size;
-
-		/*
-		 * Write as much as possible without producing an
-		 * overflow interrupt.
-		 *
-		 * Interrupt_threshold must either be
-		 * - bigger than absolute_maximum or
-		 * - point to a record between buffer_base and absolute_maximum
-		 *
-		 * Index points to a valid record.
-		 */
-		base   = ds_get(context->ds, qual, ds_buffer_base);
-		index  = ds_get(context->ds, qual, ds_index);
-		end    = ds_get(context->ds, qual, ds_absolute_maximum);
-		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
-		write_end = min(end, int_th);
-
-		/*
-		 * If we are already beyond the interrupt threshold,
-		 * we fill the entire buffer.
-		 */
-		if (write_end <= index)
-			write_end = end;
-
-		if (write_end <= index)
-			break;
-
-		write_size = min((unsigned long) size, write_end - index);
-		memcpy((void *)index, record, write_size);
-
-		record = (const char *)record + write_size;
-		size -= write_size;
-		bytes_written += write_size;
-
-		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
-		adj_write_size *= ds_cfg.sizeof_rec[qual];
-
-		/* Zero out trailing bytes. */
-		memset((char *)index + write_size, 0,
-		       adj_write_size - write_size);
-		index += adj_write_size;
-
-		if (index >= end)
-			index = base;
-		ds_set(context->ds, qual, ds_index, index);
-
-		if (index >= int_th)
-			ds_overflow(context, qual);
-	}
-
-	return bytes_written;
-}
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- *
- * We use two levels of abstraction:
- * - the raw data level defined here
- * - an arch-independent level defined in ds.h
- */
-
-enum bts_field {
-	bts_from,
-	bts_to,
-	bts_flags,
-
-	bts_qual		= bts_from,
-	bts_clock		= bts_to,
-	bts_pid			= bts_flags,
-
-	bts_qual_mask		= (bts_qual_max - 1),
-	bts_escape		= ((unsigned long)-1 & ~bts_qual_mask)
-};
-
-static inline unsigned long bts_get(const char *base, unsigned long field)
-{
-	base += (ds_cfg.sizeof_ptr_field * field);
-	return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, unsigned long field, unsigned long val)
-{
-	base += (ds_cfg.sizeof_ptr_field * field);
-	(*(unsigned long *)base) = val;
-}
-
-
-/*
- * The raw BTS data is architecture dependent.
- *
- * For higher-level users, we give an arch-independent view.
- * - ds.h defines struct bts_struct
- * - bts_read translates one raw bts record into a bts_struct
- * - bts_write translates one bts_struct into the raw format and
- *   writes it into the top of the parameter tracer's buffer.
- *
- * return: bytes read/written on success; -Eerrno, otherwise
- */
-static int
-bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	if (at < tracer->trace.ds.begin)
-		return -EINVAL;
-
-	if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
-		return -EINVAL;
-
-	memset(out, 0, sizeof(*out));
-	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
-		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-		out->variant.event.clock = bts_get(at, bts_clock);
-		out->variant.event.pid = bts_get(at, bts_pid);
-	} else {
-		out->qualifier = bts_branch;
-		out->variant.lbr.from = bts_get(at, bts_from);
-		out->variant.lbr.to   = bts_get(at, bts_to);
-
-		if (!out->variant.lbr.from && !out->variant.lbr.to)
-			out->qualifier = bts_invalid;
-	}
-
-	return ds_cfg.sizeof_rec[ds_bts];
-}
-
-static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
-{
-	unsigned char raw[MAX_SIZEOF_BTS];
-
-	if (!tracer)
-		return -EINVAL;
-
-	if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
-		return -EOVERFLOW;
-
-	switch (in->qualifier) {
-	case bts_invalid:
-		bts_set(raw, bts_from, 0);
-		bts_set(raw, bts_to, 0);
-		bts_set(raw, bts_flags, 0);
-		break;
-	case bts_branch:
-		bts_set(raw, bts_from, in->variant.lbr.from);
-		bts_set(raw, bts_to,   in->variant.lbr.to);
-		bts_set(raw, bts_flags, 0);
-		break;
-	case bts_task_arrives:
-	case bts_task_departs:
-		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-		bts_set(raw, bts_clock, in->variant.event.clock);
-		bts_set(raw, bts_pid, in->variant.event.pid);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return ds_write(tracer->ds.context, ds_bts, raw,
-			ds_cfg.sizeof_rec[ds_bts]);
-}
-
-
-static void ds_write_config(struct ds_context *context,
-			    struct ds_trace *cfg, enum ds_qualifier qual)
-{
-	unsigned char *ds = context->ds;
-
-	ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
-	ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
-	ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
-	ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
-}
-
-static void ds_read_config(struct ds_context *context,
-			   struct ds_trace *cfg, enum ds_qualifier qual)
-{
-	unsigned char *ds = context->ds;
-
-	cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
-	cfg->top = (void *)ds_get(ds, qual, ds_index);
-	cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
-	cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
-}
-
-static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
-			     void *base, size_t size, size_t ith,
-			     unsigned int flags) {
-	unsigned long buffer, adj;
-
-	/*
-	 * Adjust the buffer address and size to meet alignment
-	 * constraints:
-	 * - buffer is double-word aligned
-	 * - size is multiple of record size
-	 *
-	 * We checked the size at the very beginning; we have enough
-	 * space to do the adjustment.
-	 */
-	buffer = (unsigned long)base;
-
-	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
-	buffer += adj;
-	size   -= adj;
-
-	trace->n = size / ds_cfg.sizeof_rec[qual];
-	trace->size = ds_cfg.sizeof_rec[qual];
-
-	size = (trace->n * trace->size);
-
-	trace->begin = (void *)buffer;
-	trace->top = trace->begin;
-	trace->end = (void *)(buffer + size);
-	/*
-	 * The value for 'no threshold' is -1, which will set the
-	 * threshold outside of the buffer, just like we want it.
-	 */
-	ith *= ds_cfg.sizeof_rec[qual];
-	trace->ith = (void *)(buffer + size - ith);
-
-	trace->flags = flags;
-}
-
-
-static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
-		      enum ds_qualifier qual, struct task_struct *task,
-		      int cpu, void *base, size_t size, size_t th)
-{
-	struct ds_context *context;
-	int error;
-	size_t req_size;
-
-	error = -EOPNOTSUPP;
-	if (!ds_cfg.sizeof_rec[qual])
-		goto out;
-
-	error = -EINVAL;
-	if (!base)
-		goto out;
-
-	req_size = ds_cfg.sizeof_rec[qual];
-	/* We might need space for alignment adjustments. */
-	if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
-		req_size += DS_ALIGNMENT;
-
-	error = -EINVAL;
-	if (size < req_size)
-		goto out;
-
-	if (th != (size_t)-1) {
-		th *= ds_cfg.sizeof_rec[qual];
-
-		error = -EINVAL;
-		if (size <= th)
-			goto out;
-	}
-
-	tracer->buffer = base;
-	tracer->size = size;
-
-	error = -ENOMEM;
-	context = ds_get_context(task, cpu);
-	if (!context)
-		goto out;
-	tracer->context = context;
-
-	/*
-	 * Defer any tracer-specific initialization work for the context until
-	 * context ownership has been clarified.
-	 */
-
-	error = 0;
- out:
-	return error;
-}
-
-static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
-					 void *base, size_t size,
-					 bts_ovfl_callback_t ovfl, size_t th,
-					 unsigned int flags)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Buffer overflow notification is not yet implemented. */
-	error = -EOPNOTSUPP;
-	if (ovfl)
-		goto out;
-
-	error = get_tracer(task);
-	if (error < 0)
-		goto out;
-
-	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-	if (!tracer)
-		goto out_put_tracer;
-	tracer->ovfl = ovfl;
-
-	/* Do some more error checking and acquire a tracing context. */
-	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_bts, task, cpu, base, size, th);
-	if (error < 0)
-		goto out_tracer;
-
-	/* Claim the bts part of the tracing context we acquired above. */
-	spin_lock_irq(&ds_lock);
-
-	error = -EPERM;
-	if (tracer->ds.context->bts_master)
-		goto out_unlock;
-	tracer->ds.context->bts_master = tracer;
-
-	spin_unlock_irq(&ds_lock);
-
-	/*
-	 * Now that we own the bts part of the context, let's complete the
-	 * initialization for that part.
-	 */
-	ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-	ds_install_ds_area(tracer->ds.context);
-
-	tracer->trace.read  = bts_read;
-	tracer->trace.write = bts_write;
-
-	/* Start tracing. */
-	ds_resume_bts(tracer);
-
-	return tracer;
-
- out_unlock:
-	spin_unlock_irq(&ds_lock);
-	ds_put_context(tracer->ds.context);
- out_tracer:
-	kfree(tracer);
- out_put_tracer:
-	put_tracer(task);
- out:
-	return ERR_PTR(error);
-}
-
-struct bts_tracer *ds_request_bts_task(struct task_struct *task,
-				       void *base, size_t size,
-				       bts_ovfl_callback_t ovfl,
-				       size_t th, unsigned int flags)
-{
-	return ds_request_bts(task, 0, base, size, ovfl, th, flags);
-}
-
-struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
-				      bts_ovfl_callback_t ovfl,
-				      size_t th, unsigned int flags)
-{
-	return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
-					   void *base, size_t size,
-					   pebs_ovfl_callback_t ovfl, size_t th,
-					   unsigned int flags)
-{
-	struct pebs_tracer *tracer;
-	int error;
-
-	/* Buffer overflow notification is not yet implemented. */
-	error = -EOPNOTSUPP;
-	if (ovfl)
-		goto out;
-
-	error = get_tracer(task);
-	if (error < 0)
-		goto out;
-
-	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-	if (!tracer)
-		goto out_put_tracer;
-	tracer->ovfl = ovfl;
-
-	/* Do some more error checking and acquire a tracing context. */
-	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_pebs, task, cpu, base, size, th);
-	if (error < 0)
-		goto out_tracer;
-
-	/* Claim the pebs part of the tracing context we acquired above. */
-	spin_lock_irq(&ds_lock);
-
-	error = -EPERM;
-	if (tracer->ds.context->pebs_master)
-		goto out_unlock;
-	tracer->ds.context->pebs_master = tracer;
-
-	spin_unlock_irq(&ds_lock);
-
-	/*
-	 * Now that we own the pebs part of the context, let's complete the
-	 * initialization for that part.
-	 */
-	ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-	ds_install_ds_area(tracer->ds.context);
-
-	/* Start tracing. */
-	ds_resume_pebs(tracer);
-
-	return tracer;
-
- out_unlock:
-	spin_unlock_irq(&ds_lock);
-	ds_put_context(tracer->ds.context);
- out_tracer:
-	kfree(tracer);
- out_put_tracer:
-	put_tracer(task);
- out:
-	return ERR_PTR(error);
-}
-
-struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
-					 void *base, size_t size,
-					 pebs_ovfl_callback_t ovfl,
-					 size_t th, unsigned int flags)
-{
-	return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
-}
-
-struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
-					pebs_ovfl_callback_t ovfl,
-					size_t th, unsigned int flags)
-{
-	return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static void ds_free_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-
-	task = tracer->ds.context->task;
-
-	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
-	tracer->ds.context->bts_master = NULL;
-
-	/* Make sure tracing stopped and the tracer is not in use. */
-	if (task && (task != current))
-		wait_task_context_switch(task);
-
-	ds_put_context(tracer->ds.context);
-	put_tracer(task);
-
-	kfree(tracer);
-}
-
-void ds_release_bts(struct bts_tracer *tracer)
-{
-	might_sleep();
-
-	if (!tracer)
-		return;
-
-	ds_suspend_bts(tracer);
-	ds_free_bts(tracer);
-}
-
-int ds_release_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long irq;
-	int error;
-
-	if (!tracer)
-		return 0;
-
-	task = tracer->ds.context->task;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task &&
-	    (tracer->ds.context->cpu != smp_processor_id()))
-		goto out;
-
-	error = -EPERM;
-	if (task && (task != current))
-		goto out;
-
-	ds_suspend_bts_noirq(tracer);
-	ds_free_bts(tracer);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static void update_task_debugctlmsr(struct task_struct *task,
-				    unsigned long debugctlmsr)
-{
-	task->thread.debugctlmsr = debugctlmsr;
-
-	get_cpu();
-	if (task == current)
-		update_debugctlmsr(debugctlmsr);
-	put_cpu();
-}
-
-void ds_suspend_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr;
-	int cpu;
-
-	if (!tracer)
-		return;
-
-	tracer->flags = 0;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	WARN_ON(!task && irqs_disabled());
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr_on_cpu(cpu));
-	debugctlmsr &= ~BTS_CONTROL;
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_suspend_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr, irq;
-	int cpu, error = 0;
-
-	if (!tracer)
-		return 0;
-
-	tracer->flags = 0;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task && (cpu != smp_processor_id()))
-		goto out;
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr());
-	debugctlmsr &= ~BTS_CONTROL;
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr(debugctlmsr);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static unsigned long ds_bts_control(struct bts_tracer *tracer)
-{
-	unsigned long control;
-
-	control = ds_cfg.ctl[dsf_bts];
-	if (!(tracer->trace.ds.flags & BTS_KERNEL))
-		control |= ds_cfg.ctl[dsf_bts_kernel];
-	if (!(tracer->trace.ds.flags & BTS_USER))
-		control |= ds_cfg.ctl[dsf_bts_user];
-
-	return control;
-}
-
-void ds_resume_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr;
-	int cpu;
-
-	if (!tracer)
-		return;
-
-	tracer->flags = tracer->trace.ds.flags;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	WARN_ON(!task && irqs_disabled());
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr_on_cpu(cpu));
-	debugctlmsr |= ds_bts_control(tracer);
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_resume_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr, irq;
-	int cpu, error = 0;
-
-	if (!tracer)
-		return 0;
-
-	tracer->flags = tracer->trace.ds.flags;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task && (cpu != smp_processor_id()))
-		goto out;
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr());
-	debugctlmsr |= ds_bts_control(tracer);
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr(debugctlmsr);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static void ds_free_pebs(struct pebs_tracer *tracer)
-{
-	struct task_struct *task;
-
-	task = tracer->ds.context->task;
-
-	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
-	tracer->ds.context->pebs_master = NULL;
-
-	ds_put_context(tracer->ds.context);
-	put_tracer(task);
-
-	kfree(tracer);
-}
-
-void ds_release_pebs(struct pebs_tracer *tracer)
-{
-	might_sleep();
-
-	if (!tracer)
-		return;
-
-	ds_suspend_pebs(tracer);
-	ds_free_pebs(tracer);
-}
-
-int ds_release_pebs_noirq(struct pebs_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long irq;
-	int error;
-
-	if (!tracer)
-		return 0;
-
-	task = tracer->ds.context->task;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task &&
-	    (tracer->ds.context->cpu != smp_processor_id()))
-		goto out;
-
-	error = -EPERM;
-	if (task && (task != current))
-		goto out;
-
-	ds_suspend_pebs_noirq(tracer);
-	ds_free_pebs(tracer);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-void ds_suspend_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
-{
-	return 0;
-}
-
-void ds_resume_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
-{
-	return 0;
-}
-
-const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
-{
-	if (!tracer)
-		return NULL;
-
-	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-	return &tracer->trace;
-}
-
-const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return NULL;
-
-	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-
-	tracer->trace.counters = ds_cfg.nr_counter_reset;
-	memcpy(tracer->trace.counter_reset,
-	       tracer->ds.context->ds +
-	       (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
-	       ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
-
-	return &tracer->trace;
-}
-
-int ds_reset_bts(struct bts_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	tracer->trace.ds.top = tracer->trace.ds.begin;
-
-	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
-	       (unsigned long)tracer->trace.ds.top);
-
-	return 0;
-}
-
-int ds_reset_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	tracer->trace.ds.top = tracer->trace.ds.begin;
-
-	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
-	       (unsigned long)tracer->trace.ds.top);
-
-	return 0;
-}
-
-int ds_set_pebs_reset(struct pebs_tracer *tracer,
-		      unsigned int counter, u64 value)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	if (ds_cfg.nr_counter_reset < counter)
-		return -EINVAL;
-
-	*(u64 *)(tracer->ds.context->ds +
-		 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
-		 (counter * PEBS_RESET_FIELD_SIZE)) = value;
-
-	return 0;
-}
-
-static const struct ds_configuration ds_cfg_netburst = {
-	.name = "Netburst",
-	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
-	.ctl[dsf_bts_kernel]	= (1 << 5),
-	.ctl[dsf_bts_user]	= (1 << 6),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_pentium_m = {
-	.name = "Pentium M",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_core2_atom = {
-	.name = "Core 2/Atom",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.ctl[dsf_bts_kernel]	= (1 << 9),
-	.ctl[dsf_bts_user]	= (1 << 10),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_core_i7 = {
-	.name = "Core i7",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.ctl[dsf_bts_kernel]	= (1 << 9),
-	.ctl[dsf_bts_user]	= (1 << 10),
-	.nr_counter_reset	= 4,
-};
-
-static void
-ds_configure(const struct ds_configuration *cfg,
-	     struct cpuinfo_x86 *cpu)
-{
-	unsigned long nr_pebs_fields = 0;
-
-	printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
-
-#ifdef __i386__
-	nr_pebs_fields = 10;
-#else
-	nr_pebs_fields = 18;
-#endif
-
-	/*
-	 * Starting with version 2, architectural performance
-	 * monitoring supports a format specifier.
-	 */
-	if ((cpuid_eax(0xa) & 0xff) > 1) {
-		unsigned long perf_capabilities, format;
-
-		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
-
-		format = (perf_capabilities >> 8) & 0xf;
-
-		switch (format) {
-		case 0:
-			nr_pebs_fields = 18;
-			break;
-		case 1:
-			nr_pebs_fields = 22;
-			break;
-		default:
-			printk(KERN_INFO
-			       "[ds] unknown PEBS format: %lu\n", format);
-			nr_pebs_fields = 0;
-			break;
-		}
-	}
-
-	memset(&ds_cfg, 0, sizeof(ds_cfg));
-	ds_cfg = *cfg;
-
-	ds_cfg.sizeof_ptr_field =
-		(cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
-
-	ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
-	ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
-
-	if (!cpu_has(cpu, X86_FEATURE_BTS)) {
-		ds_cfg.sizeof_rec[ds_bts] = 0;
-		printk(KERN_INFO "[ds] bts not available\n");
-	}
-	if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
-		ds_cfg.sizeof_rec[ds_pebs] = 0;
-		printk(KERN_INFO "[ds] pebs not available\n");
-	}
-
-	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
-	       8 * ds_cfg.sizeof_ptr_field);
-	printk("bts/pebs record: %u/%u bytes\n",
-	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
-
-	WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
-}
-
-void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-{
-	/* Only configure the first cpu. Others are identical. */
-	if (ds_cfg.name)
-		return;
-
-	switch (c->x86) {
-	case 0x6:
-		switch (c->x86_model) {
-		case 0x9:
-		case 0xd: /* Pentium M */
-			ds_configure(&ds_cfg_pentium_m, c);
-			break;
-		case 0xf:
-		case 0x17: /* Core2 */
-		case 0x1c: /* Atom */
-			ds_configure(&ds_cfg_core2_atom, c);
-			break;
-		case 0x1a: /* Core i7 */
-			ds_configure(&ds_cfg_core_i7, c);
-			break;
-		default:
-			/* Sorry, don't know about them. */
-			break;
-		}
-		break;
-	case 0xf:
-		switch (c->x86_model) {
-		case 0x0:
-		case 0x1:
-		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_netburst, c);
-			break;
-		default:
-			/* Sorry, don't know about them. */
-			break;
-		}
-		break;
-	default:
-		/* Sorry, don't know about them. */
-		break;
-	}
-}
-
-static inline void ds_take_timestamp(struct ds_context *context,
-				     enum bts_qualifier qualifier,
-				     struct task_struct *task)
-{
-	struct bts_tracer *tracer = context->bts_master;
-	struct bts_struct ts;
-
-	/* Prevent compilers from reading the tracer pointer twice. */
-	barrier();
-
-	if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
-		return;
-
-	memset(&ts, 0, sizeof(ts));
-	ts.qualifier		= qualifier;
-	ts.variant.event.clock	= trace_clock_global();
-	ts.variant.event.pid	= task->pid;
-
-	bts_write(tracer, &ts);
-}
-
-/*
- * Change the DS configuration from tracing prev to tracing next.
- */
-void ds_switch_to(struct task_struct *prev, struct task_struct *next)
-{
-	struct ds_context *prev_ctx	= prev->thread.ds_ctx;
-	struct ds_context *next_ctx	= next->thread.ds_ctx;
-	unsigned long debugctlmsr	= next->thread.debugctlmsr;
-
-	/* Make sure all data is read before we start. */
-	barrier();
-
-	if (prev_ctx) {
-		update_debugctlmsr(0);
-
-		ds_take_timestamp(prev_ctx, bts_task_departs, prev);
-	}
-
-	if (next_ctx) {
-		ds_take_timestamp(next_ctx, bts_task_arrives, next);
-
-		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
-	}
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-static __init int ds_selftest(void)
-{
-	if (ds_cfg.sizeof_rec[ds_bts]) {
-		int error;
-
-		error = ds_selftest_bts();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling bts.\n");
-			ds_cfg.sizeof_rec[ds_bts] = 0;
-		}
-	}
-
-	if (ds_cfg.sizeof_rec[ds_pebs]) {
-		int error;
-
-		error = ds_selftest_pebs();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling pebs.\n");
-			ds_cfg.sizeof_rec[ds_pebs] = 0;
-		}
-	}
-
-	return 0;
-}
-device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab9..00000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#include "ds_selftest.h"
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/ds.h>
-
-
-#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */
-#define SMALL_BUFFER_SIZE	24	/* A single bts entry. */
-
-struct ds_selftest_bts_conf {
-	struct bts_tracer *tracer;
-	int error;
-	int (*suspend)(struct bts_tracer *);
-	int (*resume)(struct bts_tracer *);
-};
-
-static int ds_selftest_bts_consistency(const struct bts_trace *trace)
-{
-	int error = 0;
-
-	if (!trace) {
-		printk(KERN_CONT "failed to access trace...");
-		/* Bail out. Other tests are pointless. */
-		return -1;
-	}
-
-	if (!trace->read) {
-		printk(KERN_CONT "bts read not available...");
-		error = -1;
-	}
-
-	/* Do some sanity checks on the trace configuration. */
-	if (!trace->ds.n) {
-		printk(KERN_CONT "empty bts buffer...");
-		error = -1;
-	}
-	if (!trace->ds.size) {
-		printk(KERN_CONT "bad bts trace setup...");
-		error = -1;
-	}
-	if (trace->ds.end !=
-	    (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
-		printk(KERN_CONT "bad bts buffer setup...");
-		error = -1;
-	}
-	/*
-	 * We allow top in [begin; end], since its not clear when the
-	 * overflow adjustment happens: after the increment or before the
-	 * write.
-	 */
-	if ((trace->ds.top < trace->ds.begin) ||
-	    (trace->ds.end < trace->ds.top)) {
-		printk(KERN_CONT "bts top out of bounds...");
-		error = -1;
-	}
-
-	return error;
-}
-
-static int ds_selftest_bts_read(struct bts_tracer *tracer,
-				const struct bts_trace *trace,
-				const void *from, const void *to)
-{
-	const unsigned char *at;
-
-	/*
-	 * Check a few things which do not belong to this test.
-	 * They should be covered by other tests.
-	 */
-	if (!trace)
-		return -1;
-
-	if (!trace->read)
-		return -1;
-
-	if (to < from)
-		return -1;
-
-	if (from < trace->ds.begin)
-		return -1;
-
-	if (trace->ds.end < to)
-		return -1;
-
-	if (!trace->ds.size)
-		return -1;
-
-	/* Now to the test itself. */
-	for (at = from; (void *)at < to; at += trace->ds.size) {
-		struct bts_struct bts;
-		unsigned long index;
-		int error;
-
-		if (((void *)at - trace->ds.begin) % trace->ds.size) {
-			printk(KERN_CONT
-			       "read from non-integer index...");
-			return -1;
-		}
-		index = ((void *)at - trace->ds.begin) / trace->ds.size;
-
-		memset(&bts, 0, sizeof(bts));
-		error = trace->read(tracer, at, &bts);
-		if (error < 0) {
-			printk(KERN_CONT
-			       "error reading bts trace at [%lu] (0x%p)...",
-			       index, at);
-			return error;
-		}
-
-		switch (bts.qualifier) {
-		case BTS_BRANCH:
-			break;
-		default:
-			printk(KERN_CONT
-			       "unexpected bts entry %llu at [%lu] (0x%p)...",
-			       bts.qualifier, index, at);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-static void ds_selftest_bts_cpu(void *arg)
-{
-	struct ds_selftest_bts_conf *conf = arg;
-	const struct bts_trace *trace;
-	void *top;
-
-	if (IS_ERR(conf->tracer)) {
-		conf->error = PTR_ERR(conf->tracer);
-		conf->tracer = NULL;
-
-		printk(KERN_CONT
-		       "initialization failed (err: %d)...", conf->error);
-		return;
-	}
-
-	/* We should meanwhile have enough trace. */
-	conf->error = conf->suspend(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	/* Let's see if we can access the trace. */
-	trace = ds_read_bts(conf->tracer);
-
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	/* If everything went well, we should have a few trace entries. */
-	if (trace->ds.top == trace->ds.begin) {
-		/*
-		 * It is possible but highly unlikely that we got a
-		 * buffer overflow and end up at exactly the same
-		 * position we started from.
-		 * Let's issue a warning, but continue.
-		 */
-		printk(KERN_CONT "no trace/overflow...");
-	}
-
-	/* Let's try to read the trace we collected. */
-	conf->error =
-		ds_selftest_bts_read(conf->tracer, trace,
-				     trace->ds.begin, trace->ds.top);
-	if (conf->error < 0)
-		return;
-
-	/*
-	 * Let's read the trace again.
-	 * Since we suspended tracing, we should get the same result.
-	 */
-	top = trace->ds.top;
-
-	trace = ds_read_bts(conf->tracer);
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	if (top != trace->ds.top) {
-		printk(KERN_CONT "suspend not working...");
-		conf->error = -1;
-		return;
-	}
-
-	/* Let's collect some more trace - see if resume is working. */
-	conf->error = conf->resume(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	conf->error = conf->suspend(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	trace = ds_read_bts(conf->tracer);
-
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	if (trace->ds.top == top) {
-		/*
-		 * It is possible but highly unlikely that we got a
-		 * buffer overflow and end up at exactly the same
-		 * position we started from.
-		 * Let's issue a warning and check the full trace.
-		 */
-		printk(KERN_CONT
-		       "no resume progress/overflow...");
-
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace,
-					     trace->ds.begin, trace->ds.end);
-	} else if (trace->ds.top < top) {
-		/*
-		 * We had a buffer overflow - the entire buffer should
-		 * contain trace records.
-		 */
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace,
-					     trace->ds.begin, trace->ds.end);
-	} else {
-		/*
-		 * It is quite likely that the buffer did not overflow.
-		 * Let's just check the delta trace.
-		 */
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace, top,
-					     trace->ds.top);
-	}
-	if (conf->error < 0)
-		return;
-
-	conf->error = 0;
-}
-
-static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
-{
-	ds_suspend_bts(tracer);
-	return 0;
-}
-
-static int ds_resume_bts_wrap(struct bts_tracer *tracer)
-{
-	ds_resume_bts(tracer);
-	return 0;
-}
-
-static void ds_release_bts_noirq_wrap(void *tracer)
-{
-	(void)ds_release_bts_noirq(tracer);
-}
-
-static int ds_selftest_bts_bad_release_noirq(int cpu,
-					     struct bts_tracer *tracer)
-{
-	int error = -EPERM;
-
-	/* Try to release the tracer on the wrong cpu. */
-	get_cpu();
-	if (cpu != smp_processor_id()) {
-		error = ds_release_bts_noirq(tracer);
-		if (error != -EPERM)
-			printk(KERN_CONT "release on wrong cpu...");
-	}
-	put_cpu();
-
-	return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Try to request cpu tracing while task tracing is active. */
-	tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
-				    (size_t)-1, BTS_KERNEL);
-	error = PTR_ERR(tracer);
-	if (!IS_ERR(tracer)) {
-		ds_release_bts(tracer);
-		error = 0;
-	}
-
-	if (error != -EPERM)
-		printk(KERN_CONT "cpu/task tracing overlap...");
-
-	return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_task(void *buffer)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Try to request cpu tracing while task tracing is active. */
-	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
-				    (size_t)-1, BTS_KERNEL);
-	error = PTR_ERR(tracer);
-	if (!IS_ERR(tracer)) {
-		error = 0;
-		ds_release_bts(tracer);
-	}
-
-	if (error != -EPERM)
-		printk(KERN_CONT "task/cpu tracing overlap...");
-
-	return error ? 0 : -1;
-}
-
-int ds_selftest_bts(void)
-{
-	struct ds_selftest_bts_conf conf;
-	unsigned char buffer[BUFFER_SIZE], *small_buffer;
-	unsigned long irq;
-	int cpu;
-
-	printk(KERN_INFO "[ds] bts selftest...");
-	conf.error = 0;
-
-	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		conf.suspend = ds_suspend_bts_wrap;
-		conf.resume = ds_resume_bts_wrap;
-		conf.tracer =
-			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-					   NULL, (size_t)-1, BTS_KERNEL);
-		ds_selftest_bts_cpu(&conf);
-		if (conf.error >= 0)
-			conf.error = ds_selftest_bts_bad_request_task(buffer);
-		ds_release_bts(conf.tracer);
-		if (conf.error < 0)
-			goto out;
-
-		conf.suspend = ds_suspend_bts_noirq;
-		conf.resume = ds_resume_bts_noirq;
-		conf.tracer =
-			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-					   NULL, (size_t)-1, BTS_KERNEL);
-		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
-		if (conf.error >= 0) {
-			conf.error =
-				ds_selftest_bts_bad_release_noirq(cpu,
-								  conf.tracer);
-			/* We must not release the tracer twice. */
-			if (conf.error < 0)
-				conf.tracer = NULL;
-		}
-		if (conf.error >= 0)
-			conf.error = ds_selftest_bts_bad_request_task(buffer);
-		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
-					 conf.tracer, 1);
-		if (conf.error < 0)
-			goto out;
-	}
-
-	conf.suspend = ds_suspend_bts_wrap;
-	conf.resume = ds_resume_bts_wrap;
-	conf.tracer =
-		ds_request_bts_task(current, buffer, BUFFER_SIZE,
-				    NULL, (size_t)-1, BTS_KERNEL);
-	ds_selftest_bts_cpu(&conf);
-	if (conf.error >= 0)
-		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-	ds_release_bts(conf.tracer);
-	if (conf.error < 0)
-		goto out;
-
-	conf.suspend = ds_suspend_bts_noirq;
-	conf.resume = ds_resume_bts_noirq;
-	conf.tracer =
-		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
-				   NULL, (size_t)-1, BTS_KERNEL);
-	local_irq_save(irq);
-	ds_selftest_bts_cpu(&conf);
-	if (conf.error >= 0)
-		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-	ds_release_bts_noirq(conf.tracer);
-	local_irq_restore(irq);
-	if (conf.error < 0)
-		goto out;
-
-	conf.error = 0;
- out:
-	put_online_cpus();
-	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
-
-	return conf.error;
-}
-
-int ds_selftest_pebs(void)
-{
-	return 0;
-}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c666..00000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#ifdef CONFIG_X86_DS_SELFTEST
-extern int ds_selftest_bts(void);
-extern int ds_selftest_pebs(void);
-#else
-static inline int ds_selftest_bts(void) { return 0; }
-static inline int ds_selftest_pebs(void) { return 0; }
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780..c89a386930b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
 	int cpu;
 	unsigned long flags;
 
-	/* notify the hw-branch tracer so it may disable tracing and
-	   add the last trace to the trace buffer -
-	   the earlier this happens, the more useful the trace. */
-	trace_hw_branch_oops();
-
 	oops_enter();
 
 	/* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c..7a880ad3a20 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,12 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 static void __kprobes clear_btf(void)
 {
-	if (test_thread_flag(TIF_DEBUGCTLMSR))
-		update_debugctlmsr(0);
+	/* XXX */
 }
 
 static void __kprobes restore_btf(void)
 {
-	if (test_thread_flag(TIF_DEBUGCTLMSR))
-		update_debugctlmsr(current->thread.debugctlmsr);
+	/* XXX */
 }
 
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ad9540676fc..1a60beb32ed 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 unsigned long idle_halt;
@@ -50,8 +49,6 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
-
-	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
 void free_thread_info(struct thread_info *ti)
@@ -198,12 +195,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
-	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
-		ds_switch_to(prev_p, next_p);
-	else if (next->debugctlmsr != prev->debugctlmsr)
-		update_debugctlmsr(next->debugctlmsr);
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30..75090c589b7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
 #include <asm/cpu.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
-
-	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-	p->thread.ds_ctx = NULL;
-
-	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-	p->thread.debugctlmsr = 0;
-
 	return err;
 }
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index dc9690b4c4c..cc4258f2beb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
 #include <asm/ia32.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		if (err)
 			goto out;
 	}
-
-	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-	p->thread.ds_ctx = NULL;
-
-	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-	p->thread.debugctlmsr = 0;
-
 	err = 0;
 out:
 	if (err && p->thread.io_bitmap_ptr) {
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a503b1fd04e..f2fd3b80e56 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
 /*
  * Pentium III FXSR, SSE support
  *	Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * BTS tracing
- *	Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
  */
 
 #include <linux/kernel.h>
@@ -21,7 +18,6 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
-#include <linux/workqueue.h>
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
 
@@ -35,7 +31,6 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/proto.h>
-#include <asm/ds.h>
 #include <asm/hw_breakpoint.h>
 
 #include "tls.h"
@@ -788,342 +783,6 @@ static int ioperm_get(struct task_struct *target,
 				   0, IO_BITMAP_BYTES);
 }
 
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * A branch trace store context.
- *
- * Contexts may only be installed by ptrace_bts_config() and only for
- * ptraced tasks.
- *
- * Contexts are destroyed when the tracee is detached from the tracer.
- * The actual destruction work requires interrupts enabled, so the
- * work is deferred and will be scheduled during __ptrace_unlink().
- *
- * Contexts hold an additional task_struct reference on the traced
- * task, as well as a reference on the tracer's mm.
- *
- * Ptrace already holds a task_struct for the duration of ptrace operations,
- * but since destruction is deferred, it may be executed after both
- * tracer and tracee exited.
- */
-struct bts_context {
-	/* The branch trace handle. */
-	struct bts_tracer	*tracer;
-
-	/* The buffer used to store the branch trace and its size. */
-	void			*buffer;
-	unsigned int		size;
-
-	/* The mm that paid for the above buffer. */
-	struct mm_struct	*mm;
-
-	/* The task this context belongs to. */
-	struct task_struct	*task;
-
-	/* The signal to send on a bts buffer overflow. */
-	unsigned int		bts_ovfl_signal;
-
-	/* The work struct to destroy a context. */
-	struct work_struct	work;
-};
-
-static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
-{
-	void *buffer = NULL;
-	int err = -ENOMEM;
-
-	err = account_locked_memory(current->mm, current->signal->rlim, size);
-	if (err < 0)
-		return err;
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto out_refund;
-
-	context->buffer = buffer;
-	context->size = size;
-	context->mm = get_task_mm(current);
-
-	return 0;
-
- out_refund:
-	refund_locked_memory(current->mm, size);
-	return err;
-}
-
-static inline void free_bts_buffer(struct bts_context *context)
-{
-	if (!context->buffer)
-		return;
-
-	kfree(context->buffer);
-	context->buffer = NULL;
-
-	refund_locked_memory(context->mm, context->size);
-	context->size = 0;
-
-	mmput(context->mm);
-	context->mm = NULL;
-}
-
-static void free_bts_context_work(struct work_struct *w)
-{
-	struct bts_context *context;
-
-	context = container_of(w, struct bts_context, work);
-
-	ds_release_bts(context->tracer);
-	put_task_struct(context->task);
-	free_bts_buffer(context);
-	kfree(context);
-}
-
-static inline void free_bts_context(struct bts_context *context)
-{
-	INIT_WORK(&context->work, free_bts_context_work);
-	schedule_work(&context->work);
-}
-
-static inline struct bts_context *alloc_bts_context(struct task_struct *task)
-{
-	struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (context) {
-		context->task = task;
-		task->bts = context;
-
-		get_task_struct(task);
-	}
-
-	return context;
-}
-
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
-				  struct bts_struct __user *out)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	struct bts_struct bts;
-	const unsigned char *at;
-	int error;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	at = trace->ds.top - ((index + 1) * trace->ds.size);
-	if ((void *)at < trace->ds.begin)
-		at += (trace->ds.n * trace->ds.size);
-
-	if (!trace->read)
-		return -EOPNOTSUPP;
-
-	error = trace->read(context->tracer, at, &bts);
-	if (error < 0)
-		return error;
-
-	if (copy_to_user(out, &bts, sizeof(bts)))
-		return -EFAULT;
-
-	return sizeof(bts);
-}
-
-static int ptrace_bts_drain(struct task_struct *child,
-			    long size,
-			    struct bts_struct __user *out)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	const unsigned char *at;
-	int error, drained = 0;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	if (!trace->read)
-		return -EOPNOTSUPP;
-
-	if (size < (trace->ds.top - trace->ds.begin))
-		return -EIO;
-
-	for (at = trace->ds.begin; (void *)at < trace->ds.top;
-	     out++, drained++, at += trace->ds.size) {
-		struct bts_struct bts;
-
-		error = trace->read(context->tracer, at, &bts);
-		if (error < 0)
-			return error;
-
-		if (copy_to_user(out, &bts, sizeof(bts)))
-			return -EFAULT;
-	}
-
-	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
-	error = ds_reset_bts(context->tracer);
-	if (error < 0)
-		return error;
-
-	return drained;
-}
-
-static int ptrace_bts_config(struct task_struct *child,
-			     long cfg_size,
-			     const struct ptrace_bts_config __user *ucfg)
-{
-	struct bts_context *context;
-	struct ptrace_bts_config cfg;
-	unsigned int flags = 0;
-
-	if (cfg_size < sizeof(cfg))
-		return -EIO;
-
-	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
-		return -EFAULT;
-
-	context = child->bts;
-	if (!context)
-		context = alloc_bts_context(child);
-	if (!context)
-		return -ENOMEM;
-
-	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
-		if (!cfg.signal)
-			return -EINVAL;
-
-		return -EOPNOTSUPP;
-		context->bts_ovfl_signal = cfg.signal;
-	}
-
-	ds_release_bts(context->tracer);
-	context->tracer = NULL;
-
-	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
-		int err;
-
-		free_bts_buffer(context);
-		if (!cfg.size)
-			return 0;
-
-		err = alloc_bts_buffer(context, cfg.size);
-		if (err < 0)
-			return err;
-	}
-
-	if (cfg.flags & PTRACE_BTS_O_TRACE)
-		flags |= BTS_USER;
-
-	if (cfg.flags & PTRACE_BTS_O_SCHED)
-		flags |= BTS_TIMESTAMPS;
-
-	context->tracer =
-		ds_request_bts_task(child, context->buffer, context->size,
-				    NULL, (size_t)-1, flags);
-	if (unlikely(IS_ERR(context->tracer))) {
-		int error = PTR_ERR(context->tracer);
-
-		free_bts_buffer(context);
-		context->tracer = NULL;
-		return error;
-	}
-
-	return sizeof(cfg);
-}
-
-static int ptrace_bts_status(struct task_struct *child,
-			     long cfg_size,
-			     struct ptrace_bts_config __user *ucfg)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	struct ptrace_bts_config cfg;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	if (cfg_size < sizeof(cfg))
-		return -EIO;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	memset(&cfg, 0, sizeof(cfg));
-	cfg.size	= trace->ds.end - trace->ds.begin;
-	cfg.signal	= context->bts_ovfl_signal;
-	cfg.bts_size	= sizeof(struct bts_struct);
-
-	if (cfg.signal)
-		cfg.flags |= PTRACE_BTS_O_SIGNAL;
-
-	if (trace->ds.flags & BTS_USER)
-		cfg.flags |= PTRACE_BTS_O_TRACE;
-
-	if (trace->ds.flags & BTS_TIMESTAMPS)
-		cfg.flags |= PTRACE_BTS_O_SCHED;
-
-	if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
-		return -EFAULT;
-
-	return sizeof(cfg);
-}
-
-static int ptrace_bts_clear(struct task_struct *child)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
-	return ds_reset_bts(context->tracer);
-}
-
-static int ptrace_bts_size(struct task_struct *child)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
-}
-
-/*
- * Called from __ptrace_unlink() after the child has been moved back
- * to its original parent.
- */
-void ptrace_bts_untrace(struct task_struct *child)
-{
-	if (unlikely(child->bts)) {
-		free_bts_context(child->bts);
-		child->bts = NULL;
-	}
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -1251,39 +910,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 #endif
 
-	/*
-	 * These bits need more cooking - not enabled yet:
-	 */
-#ifdef CONFIG_X86_PTRACE_BTS
-	case PTRACE_BTS_CONFIG:
-		ret = ptrace_bts_config
-			(child, data, (struct ptrace_bts_config __user *)addr);
-		break;
-
-	case PTRACE_BTS_STATUS:
-		ret = ptrace_bts_status
-			(child, data, (struct ptrace_bts_config __user *)addr);
-		break;
-
-	case PTRACE_BTS_SIZE:
-		ret = ptrace_bts_size(child);
-		break;
-
-	case PTRACE_BTS_GET:
-		ret = ptrace_bts_read_record
-			(child, data, (struct bts_struct __user *) addr);
-		break;
-
-	case PTRACE_BTS_CLEAR:
-		ret = ptrace_bts_clear(child);
-		break;
-
-	case PTRACE_BTS_DRAIN:
-		ret = ptrace_bts_drain
-			(child, data, (struct bts_struct __user *) addr);
-		break;
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 	default:
 		ret = ptrace_request(child, request, addr, data);
 		break;
@@ -1543,14 +1169,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 	case PTRACE_GET_THREAD_AREA:
 	case PTRACE_SET_THREAD_AREA:
-#ifdef CONFIG_X86_PTRACE_BTS
-	case PTRACE_BTS_CONFIG:
-	case PTRACE_BTS_STATUS:
-	case PTRACE_BTS_SIZE:
-	case PTRACE_BTS_GET:
-	case PTRACE_BTS_CLEAR:
-	case PTRACE_BTS_DRAIN:
-#endif /* CONFIG_X86_PTRACE_BTS */
 		return arch_ptrace(child, request, addr, data);
 
 	default:
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff10..7beba0769a8 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,22 +157,6 @@ static int enable_single_step(struct task_struct *child)
 	return 1;
 }
 
-/*
- * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
- */
-static void write_debugctlmsr(struct task_struct *child, unsigned long val)
-{
-	if (child->thread.debugctlmsr == val)
-		return;
-
-	child->thread.debugctlmsr = val;
-
-	if (child != current)
-		return;
-
-	update_debugctlmsr(val);
-}
-
 /*
  * Enable single or block step.
  */
@@ -185,17 +169,9 @@ static void enable_step(struct task_struct *child, bool block)
 	 * So noone should try to use debugger block stepping in a program
 	 * that uses user-mode single stepping itself.
 	 */
-	if (enable_single_step(child) && block) {
-		set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-		write_debugctlmsr(child,
-				  child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
-	} else {
-		write_debugctlmsr(child,
-				  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
-		if (!child->thread.debugctlmsr)
-			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-	}
+	if (!enable_single_step(child))
+		return;
+	/* XXX */
 }
 
 void user_enable_single_step(struct task_struct *child)
@@ -213,11 +189,7 @@ void user_disable_single_step(struct task_struct *child)
 	/*
 	 * Make sure block stepping (BTF) is disabled.
 	 */
-	write_debugctlmsr(child,
-			  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
-	if (!child->thread.debugctlmsr)
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+	/* XXX */
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e445418..e3da5d726a3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -543,11 +543,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 
 	/* DR6 may or may not be cleared by the CPU */
 	set_debugreg(0, 6);
-	/*
-	 * The processor cleared BTF, so don't mark that we need it set.
-	 */
-	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
-	tsk->thread.debugctlmsr = 0;
 
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 01e6adea07e..cc12b3c556b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -504,18 +504,6 @@ extern int ftrace_dump_on_oops;
 #define INIT_TRACE_RECURSION
 #endif
 
-#ifdef CONFIG_HW_BRANCH_TRACER
-
-void trace_hw_branch(u64 from, u64 to);
-void trace_hw_branch_oops(void);
-
-#else /* CONFIG_HW_BRANCH_TRACER */
-
-static inline void trace_hw_branch(u64 from, u64 to) {}
-static inline void trace_hw_branch_oops(void) {}
-
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
 #ifdef CONFIG_FTRACE_SYSCALLS
 
 unsigned long arch_syscall_addr(int nr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e70f21beb4b..c8442b65511 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,7 +19,6 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
-struct rlimit;
 
 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1449,9 +1448,6 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
-extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-				 size_t size);
-extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e1fb6072997..4272521e29e 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -345,18 +345,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk,
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #endif
 
-#ifndef arch_ptrace_untrace
-/*
- * Do machine-specific work before untracing child.
- *
- * This is called for a normal detach as well as from ptrace_exit()
- * when the tracing task dies.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-#define arch_ptrace_untrace(task)		do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dad7f668ebf..e0447c64af6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,7 +99,6 @@ struct futex_pi_state;
 struct robust_list_head;
 struct bio_list;
 struct fs_struct;
-struct bts_context;
 struct perf_event_context;
 
 /*
@@ -1272,12 +1271,6 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 
-	/*
-	 * This is the tracer handle for the ptrace BTS extension.
-	 * This field actually belongs to the ptracer task.
-	 */
-	struct bts_context *bts;
-
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
@@ -2123,10 +2116,8 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
 					       long match_state)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 4799c5f0e6d..d67f1dbfbe0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1108,9 +1108,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->memcg_batch.do_batch = 0;
 	p->memcg_batch.memcg = NULL;
 #endif
-
-	p->bts = NULL;
-
 	p->stack_start = stack_start;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a..9fb51237b18 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -76,7 +76,6 @@ void __ptrace_unlink(struct task_struct *child)
 	child->parent = child->real_parent;
 	list_del_init(&child->ptrace_entry);
 
-	arch_ptrace_untrace(child);
 	if (task_is_traced(child))
 		ptrace_untrace(child);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 9ab3cd7858d..117b7cad31b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2076,49 +2076,6 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	return 1;
 }
 
-/*
- * wait_task_context_switch -	wait for a thread to complete at least one
- *				context switch.
- *
- * @p must not be current.
- */
-void wait_task_context_switch(struct task_struct *p)
-{
-	unsigned long nvcsw, nivcsw, flags;
-	int running;
-	struct rq *rq;
-
-	nvcsw	= p->nvcsw;
-	nivcsw	= p->nivcsw;
-	for (;;) {
-		/*
-		 * The runqueue is assigned before the actual context
-		 * switch. We need to take the runqueue lock.
-		 *
-		 * We could check initially without the lock but it is
-		 * very likely that we need to take the lock in every
-		 * iteration.
-		 */
-		rq = task_rq_lock(p, &flags);
-		running = task_running(rq, p);
-		task_rq_unlock(rq, &flags);
-
-		if (likely(!running))
-			break;
-		/*
-		 * The switch count is incremented before the actual
-		 * context switch. We thus wait for two switches to be
-		 * sure at least one completed.
-		 */
-		if ((p->nvcsw - nvcsw) > 1)
-			break;
-		if ((p->nivcsw - nivcsw) > 1)
-			break;
-
-		cpu_relax();
-	}
-}
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd..8b1797c4545 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
 	help
 	  See Documentation/trace/ftrace-design.txt
 
-config HAVE_HW_BRANCH_TRACER
-	bool
-
 config HAVE_SYSCALL_TRACEPOINTS
 	bool
 	help
@@ -374,14 +371,6 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
-config HW_BRANCH_TRACER
-	depends on HAVE_HW_BRANCH_TRACER
-	bool "Trace hw branches"
-	select GENERIC_TRACER
-	help
-	  This tracer records all branches on the system in a circular
-	  buffer, giving access to the last N branches for each cpu.
-
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc649003..ffb1a5b0550 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b1..bec2c973ff0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
-	TRACE_HW_BRANCHES,
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_BLK,
@@ -229,7 +228,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
-		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
 		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
@@ -467,8 +465,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
-extern int trace_selftest_startup_hw_branches(struct tracer *trace,
-					      struct trace_array *tr);
 extern int trace_selftest_startup_ksym(struct tracer *trace,
 					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399d..dc008c1240d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
 		 __entry->func, __entry->file, __entry->correct)
 );
 
-FTRACE_ENTRY(hw_branch, hw_branch_entry,
-
-	TRACE_HW_BRANCHES,
-
-	F_STRUCT(
-		__field(	u64,	from	)
-		__field(	u64,	to	)
-	),
-
-	F_printk("from: %llx to: %llx", __entry->from, __entry->to)
-);
-
 FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
 
 	TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f..00000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * h/w branch tracer for x86 based on BTS
- *
- * Copyright (C) 2008-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-
-#include <asm/ds.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-
-#define BTS_BUFFER_SIZE (1 << 13)
-
-static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
-static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
-
-#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
-
-static int trace_hw_branches_enabled __read_mostly;
-static int trace_hw_branches_suspended __read_mostly;
-static struct trace_array *hw_branch_trace __read_mostly;
-
-
-static void bts_trace_init_cpu(int cpu)
-{
-	per_cpu(hwb_tracer, cpu) =
-		ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
-				   BTS_BUFFER_SIZE, NULL, (size_t)-1,
-				   BTS_KERNEL);
-
-	if (IS_ERR(per_cpu(hwb_tracer, cpu)))
-		per_cpu(hwb_tracer, cpu) = NULL;
-}
-
-static int bts_trace_init(struct trace_array *tr)
-{
-	int cpu;
-
-	hw_branch_trace = tr;
-	trace_hw_branches_enabled = 0;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		bts_trace_init_cpu(cpu);
-
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			trace_hw_branches_enabled = 1;
-	}
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-
-	/* If we could not enable tracing on a single cpu, we fail. */
-	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		if (likely(per_cpu(hwb_tracer, cpu))) {
-			ds_release_bts(per_cpu(hwb_tracer, cpu));
-			per_cpu(hwb_tracer, cpu) = NULL;
-		}
-	}
-	trace_hw_branches_enabled = 0;
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-}
-
-static void bts_trace_start(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_resume_bts(per_cpu(hwb_tracer, cpu));
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-}
-
-static void bts_trace_stop(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-	trace_hw_branches_suspended = 1;
-	put_online_cpus();
-}
-
-static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
-				     unsigned long action, void *hcpu)
-{
-	int cpu = (long)hcpu;
-
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_DOWN_FAILED:
-		/* The notification is sent with interrupts enabled. */
-		if (trace_hw_branches_enabled) {
-			bts_trace_init_cpu(cpu);
-
-			if (trace_hw_branches_suspended &&
-			    likely(per_cpu(hwb_tracer, cpu)))
-				ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-		}
-		break;
-
-	case CPU_DOWN_PREPARE:
-		/* The notification is sent with interrupts enabled. */
-		if (likely(per_cpu(hwb_tracer, cpu))) {
-			ds_release_bts(per_cpu(hwb_tracer, cpu));
-			per_cpu(hwb_tracer, cpu) = NULL;
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
-	.notifier_call = bts_hotcpu_handler
-};
-
-static void bts_trace_print_header(struct seq_file *m)
-{
-	seq_puts(m, "# CPU#        TO  <-  FROM\n");
-}
-
-static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
-{
-	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
-	struct trace_entry *entry = iter->ent;
-	struct trace_seq *seq = &iter->seq;
-	struct hw_branch_entry *it;
-
-	trace_assign_type(it, entry);
-
-	if (entry->type == TRACE_HW_BRANCHES) {
-		if (trace_seq_printf(seq, "%4d  ", iter->cpu) &&
-		    seq_print_ip_sym(seq, it->to, symflags) &&
-		    trace_seq_printf(seq, "\t  <-  ") &&
-		    seq_print_ip_sym(seq, it->from, symflags) &&
-		    trace_seq_printf(seq, "\n"))
-			return TRACE_TYPE_HANDLED;
-		return TRACE_TYPE_PARTIAL_LINE;
-	}
-	return TRACE_TYPE_UNHANDLED;
-}
-
-void trace_hw_branch(u64 from, u64 to)
-{
-	struct ftrace_event_call *call = &event_hw_branch;
-	struct trace_array *tr = hw_branch_trace;
-	struct ring_buffer_event *event;
-	struct ring_buffer *buf;
-	struct hw_branch_entry *entry;
-	unsigned long irq1;
-	int cpu;
-
-	if (unlikely(!tr))
-		return;
-
-	if (unlikely(!trace_hw_branches_enabled))
-		return;
-
-	local_irq_save(irq1);
-	cpu = raw_smp_processor_id();
-	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
-		goto out;
-
-	buf = tr->buffer;
-	event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, from);
-	entry->ent.type = TRACE_HW_BRANCHES;
-	entry->from = from;
-	entry->to   = to;
-	if (!filter_check_discard(call, entry, buf, event))
-		trace_buffer_unlock_commit(buf, event, 0, 0);
-
- out:
-	atomic_dec(&tr->data[cpu]->disabled);
-	local_irq_restore(irq1);
-}
-
-static void trace_bts_at(const struct bts_trace *trace, void *at)
-{
-	struct bts_struct bts;
-	int err = 0;
-
-	WARN_ON_ONCE(!trace->read);
-	if (!trace->read)
-		return;
-
-	err = trace->read(this_tracer, at, &bts);
-	if (err < 0)
-		return;
-
-	switch (bts.qualifier) {
-	case BTS_BRANCH:
-		trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
-		break;
-	}
-}
-
-/*
- * Collect the trace on the current cpu and write it into the ftrace buffer.
- *
- * pre: tracing must be suspended on the current cpu
- */
-static void trace_bts_cpu(void *arg)
-{
-	struct trace_array *tr = (struct trace_array *)arg;
-	const struct bts_trace *trace;
-	unsigned char *at;
-
-	if (unlikely(!tr))
-		return;
-
-	if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
-		return;
-
-	if (unlikely(!this_tracer))
-		return;
-
-	trace = ds_read_bts(this_tracer);
-	if (!trace)
-		return;
-
-	for (at = trace->ds.top; (void *)at < trace->ds.end;
-	     at += trace->ds.size)
-		trace_bts_at(trace, at);
-
-	for (at = trace->ds.begin; (void *)at < trace->ds.top;
-	     at += trace->ds.size)
-		trace_bts_at(trace, at);
-}
-
-static void trace_bts_prepare(struct trace_iterator *iter)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-	/*
-	 * We need to collect the trace on the respective cpu since ftrace
-	 * implicitly adds the record for the current cpu.
-	 * Once that is more flexible, we could collect the data from any cpu.
-	 */
-	on_each_cpu(trace_bts_cpu, iter->tr, 1);
-
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_resume_bts(per_cpu(hwb_tracer, cpu));
-	put_online_cpus();
-}
-
-static void trace_bts_close(struct trace_iterator *iter)
-{
-	tracing_reset_online_cpus(iter->tr);
-}
-
-void trace_hw_branch_oops(void)
-{
-	if (this_tracer) {
-		ds_suspend_bts_noirq(this_tracer);
-		trace_bts_cpu(hw_branch_trace);
-		ds_resume_bts_noirq(this_tracer);
-	}
-}
-
-struct tracer bts_tracer __read_mostly =
-{
-	.name		= "hw-branch-tracer",
-	.init		= bts_trace_init,
-	.reset		= bts_trace_reset,
-	.print_header	= bts_trace_print_header,
-	.print_line	= bts_trace_print_line,
-	.start		= bts_trace_start,
-	.stop		= bts_trace_stop,
-	.open		= trace_bts_prepare,
-	.close		= trace_bts_close,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_hw_branches,
-#endif /* CONFIG_FTRACE_SELFTEST */
-};
-
-__init static int init_bts_trace(void)
-{
-	register_hotcpu_notifier(&bts_hotcpu_notifier);
-	return register_tracer(&bts_tracer);
-}
-device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d6..a7084e7c042 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,7 +16,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
-	case TRACE_HW_BRANCHES:
 	case TRACE_KSYM:
 		return 1;
 	}
@@ -754,62 +753,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-#ifdef CONFIG_HW_BRANCH_TRACER
-int
-trace_selftest_startup_hw_branches(struct tracer *trace,
-				   struct trace_array *tr)
-{
-	struct trace_iterator *iter;
-	struct tracer tracer;
-	unsigned long count;
-	int ret;
-
-	if (!trace->open) {
-		printk(KERN_CONT "missing open function...");
-		return -1;
-	}
-
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	/*
-	 * The hw-branch tracer needs to collect the trace from the various
-	 * cpu trace buffers - before tracing is stopped.
-	 */
-	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter)
-		return -ENOMEM;
-
-	memcpy(&tracer, trace, sizeof(tracer));
-
-	iter->trace = &tracer;
-	iter->tr = tr;
-	iter->pos = -1;
-	mutex_init(&iter->mutex);
-
-	trace->open(iter);
-
-	mutex_destroy(&iter->mutex);
-	kfree(iter);
-
-	tracing_stop();
-
-	ret = trace_test_buffer(tr, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	if (!ret && !count) {
-		printk(KERN_CONT "no entries found..");
-		ret = -1;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
 #ifdef CONFIG_KSYM_TRACER
 static int ksym_selftest_dummy;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec..3f82720e051 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
 	spin_unlock(&shmlock_user_lock);
 	free_uid(user);
 }
-
-int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-			  size_t size)
-{
-	unsigned long lim, vm, pgsz;
-	int error = -ENOMEM;
-
-	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	down_write(&mm->mmap_sem);
-
-	lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
-	vm   = mm->total_vm + pgsz;
-	if (lim < vm)
-		goto out;
-
-	lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
-	vm   = mm->locked_vm + pgsz;
-	if (lim < vm)
-		goto out;
-
-	mm->total_vm  += pgsz;
-	mm->locked_vm += pgsz;
-
-	error = 0;
- out:
-	up_write(&mm->mmap_sem);
-	return error;
-}
-
-void refund_locked_memory(struct mm_struct *mm, size_t size)
-{
-	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	down_write(&mm->mmap_sem);
-
-	mm->total_vm  -= pgsz;
-	mm->locked_vm -= pgsz;
-
-	up_write(&mm->mmap_sem);
-}
-- 
cgit v1.2.3-70-g09d2


From ea8e61b7bbc4a2faef77db34eb2db2a2c2372ff6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Mar 2010 14:51:51 +0100
Subject: x86, ptrace: Fix block-step

Implement ptrace-block-step using TIF_BLOCKSTEP which will set
DEBUGCTLMSR_BTF when set for a task while preserving any other
DEBUGCTLMSR bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100325135414.017536066@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h   |  4 ++--
 arch/x86/include/asm/thread_info.h |  4 +++-
 arch/x86/kernel/kprobes.c          | 14 ++++++++++++--
 arch/x86/kernel/process.c          | 11 +++++++++++
 arch/x86/kernel/step.c             | 24 ++++++++++++++++++++----
 arch/x86/kernel/traps.c            |  5 +++++
 6 files changed, 53 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5bec21a66dc..32428b410b5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -799,7 +799,7 @@ extern void cpu_init(void);
 
 static inline unsigned long get_debugctlmsr(void)
 {
-    unsigned long debugctlmsr = 0;
+	unsigned long debugctlmsr = 0;
 
 #ifndef CONFIG_X86_DEBUGCTLMSR
 	if (boot_cpu_data.x86 < 6)
@@ -807,7 +807,7 @@ static inline unsigned long get_debugctlmsr(void)
 #endif
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 
-    return debugctlmsr;
+	return debugctlmsr;
 }
 
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index dc85e12d140..d017ed5502e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,6 +92,7 @@ struct thread_info {
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
+#define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 
@@ -113,6 +114,7 @@ struct thread_info {
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
+#define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 
@@ -143,7 +145,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7a880ad3a20..f2f56c0967b 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,12 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 static void __kprobes clear_btf(void)
 {
-	/* XXX */
+	if (test_thread_flag(TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+	}
 }
 
 static void __kprobes restore_btf(void)
 {
-	/* XXX */
+	if (test_thread_flag(TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl |= DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+	}
 }
 
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1a60beb32ed..8328009416d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -195,6 +195,17 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
+	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
+	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+			debugctl |= DEBUGCTLMSR_BTF;
+
+		update_debugctlmsr(debugctl);
+	}
+
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 7beba0769a8..58de45ee08b 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -169,9 +169,19 @@ static void enable_step(struct task_struct *child, bool block)
 	 * So noone should try to use debugger block stepping in a program
 	 * that uses user-mode single stepping itself.
 	 */
-	if (!enable_single_step(child))
-		return;
-	/* XXX */
+	if (enable_single_step(child) && block) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl |= DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		set_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	} else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	}
 }
 
 void user_enable_single_step(struct task_struct *child)
@@ -189,7 +199,13 @@ void user_disable_single_step(struct task_struct *child)
 	/*
 	 * Make sure block stepping (BTF) is disabled.
 	 */
-	/* XXX */
+	if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	}
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e3da5d726a3..36f1bd9f8e7 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -544,6 +544,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	/* DR6 may or may not be cleared by the CPU */
 	set_debugreg(0, 6);
 
+	/*
+	 * The processor cleared BTF, so don't mark that we need it set.
+	 */
+	clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
 
-- 
cgit v1.2.3-70-g09d2


From 11164cd4f6dab326a88bdf27f2f8f7c11977e91a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 26 Mar 2010 14:08:44 +0100
Subject: perf, x86: Add Nehelem PMU programming errata workaround

Implement the workaround for Intel Errata AAK100 and AAP53.

Also, remove the Core-i7 name for Nehalem events since there are
also Westmere based i7 chips.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <1269608924.12097.147.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  8 +++----
 arch/x86/kernel/cpu/perf_event_intel.c | 43 ++++++++++++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event_p4.c    |  2 +-
 arch/x86/kernel/cpu/perf_event_p6.c    |  2 +-
 4 files changed, 45 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f571f514de2..6f66d4a845f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -184,7 +184,7 @@ struct x86_pmu {
 	int		version;
 	int		(*handle_irq)(struct pt_regs *);
 	void		(*disable_all)(void);
-	void		(*enable_all)(void);
+	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
 	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
@@ -576,7 +576,7 @@ void hw_perf_disable(void)
 	x86_pmu.disable_all();
 }
 
-static void x86_pmu_enable_all(void)
+static void x86_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
@@ -784,7 +784,7 @@ void hw_perf_enable(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
-	int i;
+	int i, added = cpuc->n_added;
 
 	if (!x86_pmu_initialized())
 		return;
@@ -836,7 +836,7 @@ void hw_perf_enable(void)
 	cpuc->enabled = 1;
 	barrier();
 
-	x86_pmu.enable_all();
+	x86_pmu.enable_all(added);
 }
 
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 044b8436b19..676aac27aca 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -483,7 +483,7 @@ static void intel_pmu_disable_all(void)
 	intel_pmu_lbr_disable_all();
 }
 
-static void intel_pmu_enable_all(void)
+static void intel_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -502,6 +502,40 @@ static void intel_pmu_enable_all(void)
 	}
 }
 
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *
+ * These chips need to be 'reset' when adding counters by programming
+ * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
+ * either in sequence on the same PMC or on different PMCs.
+ */
+static void intel_pmu_nhm_enable_all(int added)
+{
+	if (added) {
+		struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+		int i;
+
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+		for (i = 0; i < 3; i++) {
+			struct perf_event *event = cpuc->events[i];
+
+			if (!event)
+				continue;
+
+			__x86_pmu_enable_event(&event->hw);
+		}
+	}
+	intel_pmu_enable_all(added);
+}
+
 static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
@@ -658,7 +692,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	intel_pmu_drain_bts_buffer();
 	status = intel_pmu_get_status();
 	if (!status) {
-		intel_pmu_enable_all();
+		intel_pmu_enable_all(0);
 		return 0;
 	}
 
@@ -707,7 +741,7 @@ again:
 		goto again;
 
 done:
-	intel_pmu_enable_all();
+	intel_pmu_enable_all(0);
 	return 1;
 }
 
@@ -920,7 +954,8 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
-		pr_cont("Nehalem/Corei7 events, ");
+		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+		pr_cont("Nehalem events, ");
 		break;
 
 	case 28: /* Atom */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index f8fe069f14e..0d1be36cbe9 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -535,7 +535,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
 				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
-static void p4_pmu_enable_all(void)
+static void p4_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 6ff4d01d880..877182c850d 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -66,7 +66,7 @@ static void p6_pmu_disable_all(void)
 	wrmsrl(MSR_P6_EVNTSEL0, val);
 }
 
-static void p6_pmu_enable_all(void)
+static void p6_pmu_enable_all(int added)
 {
 	unsigned long val;
 
-- 
cgit v1.2.3-70-g09d2


From 948b1bb89a44561560531394c18da4a99215f772 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 29 Mar 2010 18:36:50 +0200
Subject: perf, x86: Undo some some *_counter* -> *_event* renames

The big rename:

 cdd6c48 perf: Do the big rename: Performance Counters -> Performance Events

accidentally renamed some members of stucts that were named after
registers in the spec. To avoid confusion this patch reverts some
changes. The related specs are MSR descriptions in AMD's BKDGs and the
ARCHITECTURAL PERFORMANCE MONITORING section in the Intel 64 and IA-32
Architectures Software Developer's Manuals.

This patch does:

 $ sed -i -e 's:num_events:num_counters:g' \
   arch/x86/include/asm/perf_event.h \
   arch/x86/kernel/cpu/perf_event_amd.c \
   arch/x86/kernel/cpu/perf_event.c \
   arch/x86/kernel/cpu/perf_event_intel.c \
   arch/x86/kernel/cpu/perf_event_p6.c \
   arch/x86/kernel/cpu/perf_event_p4.c \
   arch/x86/oprofile/op_model_ppro.c

 $ sed -i -e 's:event_bits:cntval_bits:g' -e 's:event_mask:cntval_mask:g' \
   arch/x86/kernel/cpu/perf_event_amd.c \
   arch/x86/kernel/cpu/perf_event.c \
   arch/x86/kernel/cpu/perf_event_intel.c \
   arch/x86/kernel/cpu/perf_event_p6.c \
   arch/x86/kernel/cpu/perf_event_p4.c

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269880612-25800-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      |  4 +-
 arch/x86/kernel/cpu/perf_event.c       | 74 +++++++++++++++++-----------------
 arch/x86/kernel/cpu/perf_event_amd.c   | 12 +++---
 arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++----
 arch/x86/kernel/cpu/perf_event_p4.c    | 14 +++----
 arch/x86/kernel/cpu/perf_event_p6.c    |  6 +--
 arch/x86/oprofile/op_model_ppro.c      |  4 +-
 7 files changed, 65 insertions(+), 65 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 124dddd598f..987bf673141 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -67,7 +67,7 @@
 union cpuid10_eax {
 	struct {
 		unsigned int version_id:8;
-		unsigned int num_events:8;
+		unsigned int num_counters:8;
 		unsigned int bit_width:8;
 		unsigned int mask_length:8;
 	} split;
@@ -76,7 +76,7 @@ union cpuid10_eax {
 
 union cpuid10_edx {
 	struct {
-		unsigned int num_events_fixed:4;
+		unsigned int num_counters_fixed:4;
 		unsigned int reserved:28;
 	} split;
 	unsigned int full;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5343566181..9daaa1ef504 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -195,10 +195,10 @@ struct x86_pmu {
 	u64		(*event_map)(int);
 	u64		(*raw_event)(u64);
 	int		max_events;
-	int		num_events;
-	int		num_events_fixed;
-	int		event_bits;
-	u64		event_mask;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		cntval_bits;
+	u64		cntval_mask;
 	int		apic;
 	u64		max_period;
 	struct event_constraint *
@@ -268,7 +268,7 @@ static u64
 x86_perf_event_update(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	int shift = 64 - x86_pmu.event_bits;
+	int shift = 64 - x86_pmu.cntval_bits;
 	u64 prev_raw_count, new_raw_count;
 	int idx = hwc->idx;
 	s64 delta;
@@ -320,12 +320,12 @@ static bool reserve_pmc_hardware(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		disable_lapic_nmi_watchdog();
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
@@ -336,7 +336,7 @@ eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 
-	i = x86_pmu.num_events;
+	i = x86_pmu.num_counters;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
@@ -352,7 +352,7 @@ static void release_pmc_hardware(void)
 {
 	int i;
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
@@ -547,7 +547,7 @@ static void x86_pmu_disable_all(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -582,7 +582,7 @@ static void x86_pmu_enable_all(int added)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		u64 val;
 
@@ -657,14 +657,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	 * assign events to counters starting with most
 	 * constrained events.
 	 */
-	wmax = x86_pmu.num_events;
+	wmax = x86_pmu.num_counters;
 
 	/*
 	 * when fixed event counters are present,
 	 * wmax is incremented by 1 to account
 	 * for one more choice
 	 */
-	if (x86_pmu.num_events_fixed)
+	if (x86_pmu.num_counters_fixed)
 		wmax++;
 
 	for (w = 1, num = n; num && w <= wmax; w++) {
@@ -714,7 +714,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
 	struct perf_event *event;
 	int n, max_count;
 
-	max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
 
 	/* current number of events already accepted */
 	n = cpuc->n_events;
@@ -904,7 +904,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
 	wrmsrl(hwc->event_base + idx,
-			(u64)(-left) & x86_pmu.event_mask);
+			(u64)(-left) & x86_pmu.cntval_mask);
 
 	perf_event_update_userpage(event);
 
@@ -987,7 +987,7 @@ void perf_event_print_debug(void)
 	unsigned long flags;
 	int cpu, idx;
 
-	if (!x86_pmu.num_events)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_save(flags);
@@ -1011,7 +1011,7 @@ void perf_event_print_debug(void)
 	}
 	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
 		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
@@ -1024,7 +1024,7 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
-	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1089,7 +1089,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 
@@ -1097,7 +1097,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 		hwc = &event->hw;
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
 			continue;
 
 		/*
@@ -1401,46 +1401,46 @@ void __init init_hw_perf_events(void)
 	if (x86_pmu.quirks)
 		x86_pmu.quirks();
 
-	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
-		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 	}
-	x86_pmu.intel_ctrl = (1 << x86_pmu.num_events) - 1;
-	perf_max_events = x86_pmu.num_events;
+	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+	perf_max_events = x86_pmu.num_counters;
 
-	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
-		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 	}
 
 	x86_pmu.intel_ctrl |=
-		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	perf_events_lapic_init();
 	register_die_notifier(&perf_event_nmi_notifier);
 
 	unconstrained = (struct event_constraint)
-		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
-				   0, x86_pmu.num_events);
+		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+				   0, x86_pmu.num_counters);
 
 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
 			if (c->cmask != INTEL_ARCH_FIXED_MASK)
 				continue;
 
-			c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
-			c->weight += x86_pmu.num_events;
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
 		}
 	}
 
 	pr_info("... version:                %d\n",     x86_pmu.version);
-	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
-	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
-	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
+	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
+	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
 	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
+	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
 	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
 	perf_cpu_notifier(x86_pmu_notifier);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 285623bc3cc..7753a5c7653 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -165,7 +165,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
 	 * be removed on one CPU at a time AND PMU is disabled
 	 * when we come here
 	 */
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (nb->owners[i] == event) {
 			cmpxchg(nb->owners+i, event, NULL);
 			break;
@@ -215,7 +215,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	struct amd_nb *nb = cpuc->amd_nb;
 	struct perf_event *old = NULL;
-	int max = x86_pmu.num_events;
+	int max = x86_pmu.num_counters;
 	int i, j, k = -1;
 
 	/*
@@ -293,7 +293,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
 	/*
 	 * initialize all possible NB constraints
 	 */
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		__set_bit(i, nb->event_constraints[i].idxmsk);
 		nb->event_constraints[i].weight = 1;
 	}
@@ -385,9 +385,9 @@ static __initconst struct x86_pmu amd_pmu = {
 	.event_map		= amd_pmu_event_map,
 	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_events		= 4,
-	.event_bits		= 48,
-	.event_mask		= (1ULL << 48) - 1,
+	.num_counters		= 4,
+	.cntval_bits		= 48,
+	.cntval_mask		= (1ULL << 48) - 1,
 	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 676aac27aca..cc4d90a13d5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -653,20 +653,20 @@ static void intel_pmu_reset(void)
 	unsigned long flags;
 	int idx;
 
-	if (!x86_pmu.num_events)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_save(flags);
 
 	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
 		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
 	}
-	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
 		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-	}
+
 	if (ds)
 		ds->bts_index = ds->bts_buffer_base;
 
@@ -901,16 +901,16 @@ static __init int intel_pmu_init(void)
 		x86_pmu = intel_pmu;
 
 	x86_pmu.version			= version;
-	x86_pmu.num_events		= eax.split.num_events;
-	x86_pmu.event_bits		= eax.split.bit_width;
-	x86_pmu.event_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose events, so
 	 * assume at least 3 events:
 	 */
 	if (version > 1)
-		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+		x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
 
 	/*
 	 * v2 and above have a perf capabilities MSR
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 0d1be36cbe9..4139100404e 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -483,7 +483,7 @@ static void p4_pmu_disable_all(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -540,7 +540,7 @@ static void p4_pmu_enable_all(int added)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -562,7 +562,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -579,7 +579,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 		p4_pmu_clear_cccr_ovf(hwc);
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
 			continue;
 
 		/*
@@ -794,10 +794,10 @@ static __initconst struct x86_pmu p4_pmu = {
 	 * though leave it restricted at moment assuming
 	 * HT is on
 	 */
-	.num_events		= ARCH_P4_MAX_CCCR,
+	.num_counters		= ARCH_P4_MAX_CCCR,
 	.apic			= 1,
-	.event_bits		= 40,
-	.event_mask		= (1ULL << 40) - 1,
+	.cntval_bits		= 40,
+	.cntval_mask		= (1ULL << 40) - 1,
 	.max_period		= (1ULL << 39) - 1,
 	.hw_config		= p4_hw_config,
 	.schedule_events	= p4_pmu_schedule_events,
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 877182c850d..b26fbc7eb93 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -119,7 +119,7 @@ static __initconst struct x86_pmu p6_pmu = {
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
 	.version		= 0,
-	.num_events		= 2,
+	.num_counters		= 2,
 	/*
 	 * Events have 40 bits implemented. However they are designed such
 	 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -127,8 +127,8 @@ static __initconst struct x86_pmu p6_pmu = {
 	 *
 	 * See IA-32 Intel Architecture Software developer manual Vol 3B
 	 */
-	.event_bits		= 32,
-	.event_mask		= (1ULL << 32) - 1,
+	.cntval_bits		= 32,
+	.cntval_mask		= (1ULL << 32) - 1,
 	.get_event_constraints	= x86_get_event_constraints,
 	.event_constraints	= p6_event_constraints,
 };
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b..c8abc4d1bf3 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -239,11 +239,11 @@ static void arch_perfmon_setup_counters(void)
 	if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
 		current_cpu_data.x86_model == 15) {
 		eax.split.version_id = 2;
-		eax.split.num_events = 2;
+		eax.split.num_counters = 2;
 		eax.split.bit_width = 40;
 	}
 
-	num_counters = eax.split.num_events;
+	num_counters = eax.split.num_counters;
 
 	op_arch_perfmon_spec.num_counters = num_counters;
 	op_arch_perfmon_spec.num_controls = num_counters;
-- 
cgit v1.2.3-70-g09d2


From a098f4484bc7dae23f5b62360954007b99b64600 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 30 Mar 2010 11:28:21 +0200
Subject: perf, x86: implement ARCH_PERFMON_EVENTSEL bit masks

ARCH_PERFMON_EVENTSEL bit masks are often used in the kernel. This
patch adds macros for the bit masks and removes local defines. The
function intel_pmu_raw_event() becomes x86_pmu_raw_event() which is
generic for x86 models and same also for p6. Duplicate code is
removed.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100330092821.GH11907@erda.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      | 58 +++++++++++++++-------------------
 arch/x86/kernel/cpu/perf_event.c       | 19 +++++++++--
 arch/x86/kernel/cpu/perf_event_amd.c   | 15 +--------
 arch/x86/kernel/cpu/perf_event_intel.c | 22 ++-----------
 arch/x86/kernel/cpu/perf_event_p6.c    | 20 +-----------
 5 files changed, 45 insertions(+), 89 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 987bf673141..f6d43dbfd8e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -18,39 +18,31 @@
 #define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
 #define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
 
-#define ARCH_PERFMON_EVENTSEL_ENABLE			  (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_ANY			  (1 << 21)
-#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
-
-/*
- * Includes eventsel and unit mask as well:
- */
-
-
-#define INTEL_ARCH_EVTSEL_MASK		0x000000FFULL
-#define INTEL_ARCH_UNIT_MASK		0x0000FF00ULL
-#define INTEL_ARCH_EDGE_MASK		0x00040000ULL
-#define INTEL_ARCH_INV_MASK		0x00800000ULL
-#define INTEL_ARCH_CNT_MASK		0xFF000000ULL
-#define INTEL_ARCH_EVENT_MASK	(INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
-
-/*
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- *  - inv
- *  - edge
- *  - cnt-mask
- *  The other filters are supported by fixed counters.
- *  The any-thread option is supported starting with v3.
- */
-#define INTEL_ARCH_FIXED_MASK \
-	(INTEL_ARCH_CNT_MASK| \
-	 INTEL_ARCH_INV_MASK| \
-	 INTEL_ARCH_EDGE_MASK|\
-	 INTEL_ARCH_UNIT_MASK|\
-	 INTEL_ARCH_EVENT_MASK)
+#define ARCH_PERFMON_EVENTSEL_EVENT			0x000000FFULL
+#define ARCH_PERFMON_EVENTSEL_UMASK			0x0000FF00ULL
+#define ARCH_PERFMON_EVENTSEL_USR			(1ULL << 16)
+#define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_EDGE			(1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_INT			(1ULL << 20)
+#define ARCH_PERFMON_EVENTSEL_ANY			(1ULL << 21)
+#define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
+#define ARCH_PERFMON_EVENTSEL_INV			(1ULL << 23)
+#define ARCH_PERFMON_EVENTSEL_CMASK			0xFF000000ULL
+
+#define AMD64_EVENTSEL_EVENT	\
+	(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
+#define INTEL_ARCH_EVENT_MASK	\
+	(ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
+
+#define X86_RAW_EVENT_MASK		\
+	(ARCH_PERFMON_EVENTSEL_EVENT |	\
+	 ARCH_PERFMON_EVENTSEL_UMASK |	\
+	 ARCH_PERFMON_EVENTSEL_EDGE  |	\
+	 ARCH_PERFMON_EVENTSEL_INV   |	\
+	 ARCH_PERFMON_EVENTSEL_CMASK)
+#define AMD64_RAW_EVENT_MASK		\
+	(X86_RAW_EVENT_MASK          |  \
+	 AMD64_EVENTSEL_EVENT)
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9daaa1ef504..1dd42c18f1c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -143,13 +143,21 @@ struct cpu_hw_events {
  * Constraint on the Event code.
  */
 #define INTEL_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
 
 /*
  * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
  */
 #define FIXED_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
 
 /*
  * Constraint on the Event code + UMask
@@ -437,6 +445,11 @@ static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc
 	return 0;
 }
 
+static u64 x86_pmu_raw_event(u64 hw_event)
+{
+	return hw_event & X86_RAW_EVENT_MASK;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -1427,7 +1440,7 @@ void __init init_hw_perf_events(void)
 
 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
-			if (c->cmask != INTEL_ARCH_FIXED_MASK)
+			if (c->cmask != X86_RAW_EVENT_MASK)
 				continue;
 
 			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 7753a5c7653..37e9517729d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -113,20 +113,7 @@ static u64 amd_pmu_event_map(int hw_event)
 
 static u64 amd_pmu_raw_event(u64 hw_event)
 {
-#define K7_EVNTSEL_EVENT_MASK	0xF000000FFULL
-#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
-#define K7_EVNTSEL_INV_MASK	0x000800000ULL
-#define K7_EVNTSEL_REG_MASK	0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK			\
-	(K7_EVNTSEL_EVENT_MASK |	\
-	 K7_EVNTSEL_UNIT_MASK  |	\
-	 K7_EVNTSEL_EDGE_MASK  |	\
-	 K7_EVNTSEL_INV_MASK   |	\
-	 K7_EVNTSEL_REG_MASK)
-
-	return hw_event & K7_EVNTSEL_MASK;
+	return hw_event & AMD64_RAW_EVENT_MASK;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index cc4d90a13d5..dfdd6f90fc8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -452,24 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
  },
 };
 
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK		0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK		\
-	(INTEL_ARCH_EVTSEL_MASK |	\
-	 INTEL_ARCH_UNIT_MASK   |	\
-	 INTEL_ARCH_EDGE_MASK   |	\
-	 INTEL_ARCH_INV_MASK    |	\
-	 INTEL_ARCH_CNT_MASK)
-
-	return hw_event & CORE_EVNTSEL_MASK;
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -788,7 +770,7 @@ static __initconst struct x86_pmu core_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= intel_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
@@ -827,7 +809,7 @@ static __initconst struct x86_pmu intel_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= intel_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index b26fbc7eb93..03c139a67ba 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
  */
 #define P6_NOP_EVENT			0x0000002EULL
 
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define P6_EVNTSEL_INV_MASK		0x00800000ULL
-#define P6_EVNTSEL_REG_MASK		0xFF000000ULL
-
-#define P6_EVNTSEL_MASK			\
-	(P6_EVNTSEL_EVENT_MASK |	\
-	 P6_EVNTSEL_UNIT_MASK  |	\
-	 P6_EVNTSEL_EDGE_MASK  |	\
-	 P6_EVNTSEL_INV_MASK   |	\
-	 P6_EVNTSEL_REG_MASK)
-
-	return hw_event & P6_EVNTSEL_MASK;
-}
-
 static struct event_constraint p6_event_constraints[] =
 {
 	INTEL_EVENT_CONSTRAINT(0xc1, 0x1),	/* FLOPS */
@@ -114,7 +96,7 @@ static __initconst struct x86_pmu p6_pmu = {
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-	.raw_event		= p6_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
-- 
cgit v1.2.3-70-g09d2


From b4cdc5c264b35c67007800dec3928e9547a9d70b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 30 Mar 2010 17:00:06 +0200
Subject: perf, x86: Fix up the ANY flag stuff

Stephane noticed that the ANY flag was in generic arch code, and Cyrill
reported that it broke the P4 code.

Solve this by merging x86_pmu::raw_event into x86_pmu::hw_config and
provide intel_pmu and amd_pmu specific versions of this callback.

The intel_pmu one deals with the ANY flag, the amd_pmu adds the few extra
event bits AMD64 has.

Reported-by: Stephane Eranian <eranian@google.com>
Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Robert Richter <robert.richter@amd.com>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269968113.5258.442.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 35 ++++++++++-------------------
 arch/x86/kernel/cpu/perf_event_amd.c   | 17 +++++++++++----
 arch/x86/kernel/cpu/perf_event_intel.c | 30 +++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event_p4.c    | 40 +++++++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_p6.c    |  3 +--
 5 files changed, 74 insertions(+), 51 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1dd42c18f1c..65e9c5efb61 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -196,12 +196,11 @@ struct x86_pmu {
 	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
-	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
+	int		(*hw_config)(struct perf_event *event);
 	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
-	u64		(*raw_event)(u64);
 	int		max_events;
 	int		num_counters;
 	int		num_counters_fixed;
@@ -426,28 +425,26 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
-static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+static int x86_pmu_hw_config(struct perf_event *event)
 {
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
 	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
 
 	/*
 	 * Count user and OS events unless requested not to
 	 */
-	if (!attr->exclude_user)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!attr->exclude_kernel)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+	if (!event->attr.exclude_user)
+		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!event->attr.exclude_kernel)
+		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	return 0;
-}
+	if (event->attr.type == PERF_TYPE_RAW)
+		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
-static u64 x86_pmu_raw_event(u64 hw_event)
-{
-	return hw_event & X86_RAW_EVENT_MASK;
+	return 0;
 }
 
 /*
@@ -489,7 +486,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->last_tag = ~0ULL;
 
 	/* Processor specifics */
-	err = x86_pmu.hw_config(attr, hwc);
+	err = x86_pmu.hw_config(event);
 	if (err)
 		return err;
 
@@ -508,16 +505,8 @@ static int __hw_perf_event_init(struct perf_event *event)
 			return -EOPNOTSUPP;
 	}
 
-	/*
-	 * Raw hw_event type provide the config in the hw_event structure
-	 */
-	if (attr->type == PERF_TYPE_RAW) {
-		hwc->config |= x86_pmu.raw_event(attr->config);
-		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
-		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
+	if (attr->type == PERF_TYPE_RAW)
 		return 0;
-	}
 
 	if (attr->type == PERF_TYPE_HW_CACHE)
 		return set_ext_hw_attr(hwc, attr);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 37e9517729d..bbd7339f08a 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -111,9 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
 	return amd_perfmon_event_map[hw_event];
 }
 
-static u64 amd_pmu_raw_event(u64 hw_event)
+static int amd_pmu_hw_config(struct perf_event *event)
 {
-	return hw_event & AMD64_RAW_EVENT_MASK;
+	int ret = x86_pmu_hw_config(event);
+
+	if (ret)
+		return ret;
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+	return 0;
 }
 
 /*
@@ -365,12 +375,11 @@ static __initconst struct x86_pmu amd_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= amd_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
-	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 	.num_counters		= 4,
 	.cntval_bits		= 48,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index dfdd6f90fc8..30bf10c55f1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -758,6 +758,30 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	return x86_get_event_constraints(cpuc, event);
 }
 
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+	int ret = x86_pmu_hw_config(event);
+
+	if (ret)
+		return ret;
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+		return 0;
+
+	if (x86_pmu.version < 3)
+		return -EINVAL;
+
+	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+	return 0;
+}
+
 static __initconst struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
@@ -765,12 +789,11 @@ static __initconst struct x86_pmu core_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= x86_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
@@ -804,12 +827,11 @@ static __initconst struct x86_pmu intel_pmu = {
 	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_event,
 	.disable		= intel_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= intel_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 4139100404e..acd237d29f1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -419,20 +419,7 @@ static u64 p4_pmu_event_map(int hw_event)
 	return config;
 }
 
-/*
- * We don't control raw events so it's up to the caller
- * to pass sane values (and we don't count the thread number
- * on HT machine but allow HT-compatible specifics to be
- * passed on)
- */
-static u64 p4_pmu_raw_event(u64 hw_event)
-{
-	return hw_event &
-		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
-}
-
-static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = raw_smp_processor_id();
 	u32 escr, cccr;
@@ -444,11 +431,29 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	 */
 
 	cccr = p4_default_cccr_conf(cpu);
-	escr = p4_default_escr_conf(cpu, attr->exclude_kernel, attr->exclude_user);
-	hwc->config = p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
+	escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+					 event->attr.exclude_user);
+	event->hw.config = p4_config_pack_escr(escr) |
+			   p4_config_pack_cccr(cccr);
 
 	if (p4_ht_active() && p4_ht_thread(cpu))
-		hwc->config = p4_set_ht_bit(hwc->config);
+		event->hw.config = p4_set_ht_bit(event->hw.config);
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	/*
+	 * We don't control raw events so it's up to the caller
+	 * to pass sane values (and we don't count the thread number
+	 * on HT machine but allow HT-compatible specifics to be
+	 * passed on)
+	 *
+	 * XXX: HT wide things should check perf_paranoid_cpu() &&
+	 *      CAP_SYS_ADMIN
+	 */
+	event->hw.config |= event->attr.config &
+		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
+		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 
 	return 0;
 }
@@ -785,7 +790,6 @@ static __initconst struct x86_pmu p4_pmu = {
 	.eventsel		= MSR_P4_BPU_CCCR0,
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
-	.raw_event		= p4_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p4_general_events),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 03c139a67ba..9123e8ec995 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -91,12 +91,11 @@ static __initconst struct x86_pmu p6_pmu = {
 	.enable_all		= p6_pmu_enable_all,
 	.enable			= p6_pmu_enable_event,
 	.disable		= p6_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= x86_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
-- 
cgit v1.2.3-70-g09d2


From caaa8be3b6707cb9664e573a28b00f845ce9f32e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 29 Mar 2010 13:09:53 +0200
Subject: perf, x86: Fix __initconst vs const

All variables that have __initconst should also be const.

Suggested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_amd.c   |  4 ++--
 arch/x86/kernel/cpu/perf_event_intel.c | 12 ++++++------
 arch/x86/kernel/cpu/perf_event_p4.c    |  4 ++--
 arch/x86/kernel/cpu/perf_event_p6.c    |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index bbd7339f08a..611df11ba15 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
 
 static DEFINE_RAW_SPINLOCK(amd_nb_lock);
 
-static __initconst u64 amd_hw_cache_event_ids
+static __initconst const u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -368,7 +368,7 @@ static void amd_pmu_cpu_dead(int cpu)
 	raw_spin_unlock(&amd_nb_lock);
 }
 
-static __initconst struct x86_pmu amd_pmu = {
+static __initconst const struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 30bf10c55f1..1957e3f14c0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
-static __initconst u64 westmere_hw_cache_event_ids
+static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
  },
 };
 
-static __initconst u64 nehalem_hw_cache_event_ids
+static __initconst const u64 nehalem_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
  },
 };
 
-static __initconst u64 core2_hw_cache_event_ids
+static __initconst const u64 core2_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
  },
 };
 
-static __initconst u64 atom_hw_cache_event_ids
+static __initconst const u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -782,7 +782,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	return 0;
 }
 
-static __initconst struct x86_pmu core_pmu = {
+static __initconst const struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
@@ -820,7 +820,7 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
-static __initconst struct x86_pmu intel_pmu = {
+static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
 	.disable_all		= intel_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index acd237d29f1..15367cce66b 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -287,7 +287,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
 	p4_config_pack_cccr(cache_event					| \
 			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
 
-static __initconst u64 p4_hw_cache_event_ids
+static __initconst const u64 p4_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -780,7 +780,7 @@ done:
 	return num ? -ENOSPC : 0;
 }
 
-static __initconst struct x86_pmu p4_pmu = {
+static __initconst const struct x86_pmu p4_pmu = {
 	.name			= "Netburst P4/Xeon",
 	.handle_irq		= p4_pmu_handle_irq,
 	.disable_all		= p4_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 9123e8ec995..34ba07be2cd 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
 }
 
-static __initconst struct x86_pmu p6_pmu = {
+static __initconst const struct x86_pmu p6_pmu = {
 	.name			= "p6",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= p6_pmu_disable_all,
-- 
cgit v1.2.3-70-g09d2


From 40b91cd10f000b4c4934e48e2e5c0bec66def144 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 29 Mar 2010 16:37:17 +0200
Subject: perf, x86: Add Nehalem programming quirk to Westmere

According to the Xeon-5600 errata the Westmere suffers the same PMU
programming bug as the original Nehalem did.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 1957e3f14c0..f168b4030d4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -488,6 +488,7 @@ static void intel_pmu_enable_all(int added)
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
  *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
  *
  * These chips need to be 'reset' when adding counters by programming
  * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
@@ -980,6 +981,7 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
+		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		pr_cont("Westmere events, ");
 		break;
 
-- 
cgit v1.2.3-70-g09d2


From 6f4dee06fbf0133917f3d76fa3fb50e18b10c1f5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 18 Mar 2010 23:47:01 +0100
Subject: perf: Drop the frame reliablity check

It is useless now that we have a pure stack frame
walker, as given addr are always reliable.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 65e9c5efb61..353a174adb4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1602,8 +1602,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	if (reliable)
-		callchain_store(entry, addr);
+	callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops backtrace_ops = {
-- 
cgit v1.2.3-70-g09d2


From ab285f2b5290d92b7ec1a6f9aad54308dadf6157 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 8 Apr 2010 14:05:50 +0200
Subject: perf: Fix unsafe frame rewinding with hot regs fetching

When we fetch the hot regs and rewind to the nth caller, it
might happen that we dereference a frame pointer outside the
kernel stack boundaries, like in this example:

	perf_trace_sched_switch+0xd5/0x120
        schedule+0x6b5/0x860
        retint_careful+0xd/0x21

Since we directly dereference a userspace frame pointer here while
rewinding behind retint_careful, this may end up in a crash.

Fix this by simply using probe_kernel_address() when we rewind the
frame pointer.

This issue will have a much more proper fix in the next version of the
perf_arch_fetch_caller_regs() API that will only need to rewind to the
first caller.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: David Miller <davem@davemloft.net>
Cc: Archs <linux-arch@vger.kernel.org>
---
 arch/x86/kernel/dumpstack.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index e39e77168a3..e1a93be4fd4 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,6 +14,8 @@
 #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
+#include <linux/uaccess.h>
+
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -42,8 +44,10 @@ static inline unsigned long rewind_frame_pointer(int n)
 	get_bp(frame);
 
 #ifdef CONFIG_FRAME_POINTER
-	while (n--)
-		frame = frame->next_frame;
+	while (n--) {
+		if (probe_kernel_address(&frame->next_frame, frame))
+			break;
+	}
 #endif
 
 	return (unsigned long)frame;
-- 
cgit v1.2.3-70-g09d2


From 5958f1d5d722df7a9e5d129676614a8e5219bacd Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:41 +0200
Subject: x86, cpu: Add AMD core boosting feature flag to /proc/cpuinfo

By semi-popular demand, this adds the Core Performance Boost feature
flag to /proc/cpuinfo. Possible use case for this is userspace tools
like cpufreq-aperf, for example, so that they don't have to jump through
hoops of accessing "/dev/cpu/%d/cpuid" in order to check for CPB hw
support, or call cpuid from userspace.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-2-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/cpufeature.h          | 1 +
 arch/x86/kernel/cpu/addon_cpuid_features.c | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0cd82d06861..630e623f61e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -161,6 +161,7 @@
  */
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
 #define X86_FEATURE_ARAT	(7*32+ 1) /* Always Running APIC Timer */
+#define X86_FEATURE_CPB		(7*32+ 2) /* AMD Core Performance Boost */
 
 /* Virtualization flags: Linux defined */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf68..ead2a1cfa57 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,8 +30,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
 		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
 		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
 		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-- 
cgit v1.2.3-70-g09d2


From 73860c6b2fd159a35637e233d735e36887c266ad Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:42 +0200
Subject: powernow-k8: Add core performance boost support

Starting with F10h, revE, AMD processors add support for a dynamic
core boosting feature called Core Performance Boost. When a specific
condition is present, a subset of the cores on a system are boosted
beyond their P0 operating frequency to speed up the performance of
single-threaded applications.

In the normal case, the system comes out of reset with core boosting
enabled. This patch adds a sysfs knob with which core boosting can be
switched on or off for benchmarking purposes.

While at it, make the CPB code hotplug-aware so that taking cores
offline wouldn't interfere with boosting the remaining online cores.
Furthermore, add cpu_online_mask hotplug protection as suggested by
Andrew.

Finally, cleanup the driver init codepath and update copyrights.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-3-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 161 ++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/cpufreq/powernow-k8.h |   2 -
 2 files changed, 151 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d360b56e982..74ca34b5c00 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
-
 /*
- *   (c) 2003-2006 Advanced Micro Devices, Inc.
+ *   (c) 2003-2010 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -54,6 +53,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 
 static int cpu_family = CPU_OPTERON;
 
+/* core performance boost */
+static bool cpb_capable, cpb_enabled;
+static struct msr __percpu *msrs;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1393,8 +1396,77 @@ out:
 	return khz;
 }
 
+static void _cpb_toggle_msrs(bool t)
+{
+	int cpu;
+
+	get_online_cpus();
+
+	rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	for_each_cpu(cpu, cpu_online_mask) {
+		struct msr *reg = per_cpu_ptr(msrs, cpu);
+		if (t)
+			reg->l &= ~BIT(25);
+		else
+			reg->l |= BIT(25);
+	}
+	wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	put_online_cpus();
+}
+
+/*
+ * Switch on/off core performance boosting.
+ *
+ * 0=disable
+ * 1=enable.
+ */
+static void cpb_toggle(bool t)
+{
+	if (!cpb_capable)
+		return;
+
+	if (t && !cpb_enabled) {
+		cpb_enabled = true;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting enabled.\n");
+	} else if (!t && cpb_enabled) {
+		cpb_enabled = false;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting disabled.\n");
+	}
+}
+
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+				 size_t count)
+{
+	int ret = -EINVAL;
+	unsigned long val = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (!ret && (val == 0 || val == 1) && cpb_capable)
+		cpb_toggle(val);
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
+{
+	return sprintf(buf, "%u\n", cpb_enabled);
+}
+
+#define define_one_rw(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+define_one_rw(cpb);
+
 static struct freq_attr *powernow_k8_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
+	&cpb,
 	NULL,
 };
 
@@ -1410,10 +1482,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
 	.attr		= powernow_k8_attr,
 };
 
+/*
+ * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
+ * cannot block the remaining ones from boosting. On the CPU_UP path we
+ * simply keep the boost-disable flag in sync with the current global
+ * state.
+ */
+static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
+				void *hcpu)
+{
+	unsigned cpu = (long)hcpu;
+	u32 lo, hi;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+
+		if (!cpb_enabled) {
+			rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+			lo |= BIT(25);
+			wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		}
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+		lo &= ~BIT(25);
+		wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpb_nb = {
+	.notifier_call		= cpb_notify,
+};
+
 /* driver entry point for init */
 static int __cpuinit powernowk8_init(void)
 {
-	unsigned int i, supported_cpus = 0;
+	unsigned int i, supported_cpus = 0, cpu;
 
 	for_each_online_cpu(i) {
 		int rc;
@@ -1422,15 +1535,36 @@ static int __cpuinit powernowk8_init(void)
 			supported_cpus++;
 	}
 
-	if (supported_cpus == num_online_cpus()) {
-		printk(KERN_INFO PFX "Found %d %s "
-			"processors (%d cpu cores) (" VERSION ")\n",
-			num_online_nodes(),
-			boot_cpu_data.x86_model_id, supported_cpus);
-		return cpufreq_register_driver(&cpufreq_amd64_driver);
+	if (supported_cpus != num_online_cpus())
+		return -ENODEV;
+
+	printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
+		num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
+
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+
+		cpb_capable = true;
+
+		register_cpu_notifier(&cpb_nb);
+
+		msrs = msrs_alloc();
+		if (!msrs) {
+			printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
+			return -ENOMEM;
+		}
+
+		rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+		for_each_cpu(cpu, cpu_online_mask) {
+			struct msr *reg = per_cpu_ptr(msrs, cpu);
+			cpb_enabled |= !(!!(reg->l & BIT(25)));
+		}
+
+		printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
+			(cpb_enabled ? "on" : "off"));
 	}
 
-	return -ENODEV;
+	return cpufreq_register_driver(&cpufreq_amd64_driver);
 }
 
 /* driver entry point for term */
@@ -1438,6 +1572,13 @@ static void __exit powernowk8_exit(void)
 {
 	dprintk("exit\n");
 
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+		msrs_free(msrs);
+		msrs = NULL;
+
+		unregister_cpu_notifier(&cpb_nb);
+	}
+
 	cpufreq_unregister_driver(&cpufreq_amd64_driver);
 }
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073c..df3529b1c02 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
  *  http://www.gnu.org/licenses/gpl.html
  */
 
-
 enum pstate {
 	HW_PSTATE_INVALID = 0xff,
 	HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
 	struct cpumask *available_cores;
 };
 
-
 /* processor's cpuid instruction support */
 #define CPUID_PROCESSOR_SIGNATURE	1	/* function 1 */
 #define CPUID_XFAM			0x0ff00000	/* extended family */
-- 
cgit v1.2.3-70-g09d2


From d65ad45cd82a0db9544469b8c54f5dc5cafbb2d8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:43 +0200
Subject: x86: Unify APERF/MPERF support

Initialize this CPUID flag feature in common code. It could be made a
standalone function later, maybe, if more functionality is duplicated.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 8 ++++++++
 arch/x86/kernel/cpu/intel.c                | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ead2a1cfa57..fd1fc1902a4 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -54,6 +54,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
+
+	/*
+	 * common AMD/Intel features
+	 */
+	if (c->cpuid_level >= 6) {
+		if (cpuid_ecx(6) & 0x1)
+			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
+	}
 }
 
 /* leaf 0xb SMT level */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af3..3830258a5f5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -352,12 +352,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 	}
 
-	if (c->cpuid_level > 6) {
-		unsigned ecx = cpuid_ecx(6);
-		if (ecx & 0x01)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
-
 	if (cpu_has_xmm2)
 		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 	if (cpu_has_ds) {
-- 
cgit v1.2.3-70-g09d2


From a2fed573f065e526bfd5cbf26e5491973d9e9aaa Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 18 Mar 2010 18:41:46 +0100
Subject: x86, cpufreq: Add APERF/MPERF support for AMD processors

Starting with model 10 of Family 0x10, AMD processors may have
support for APERF/MPERF.  Add support for identifying it and using
it within cpufreq.  Move the APERF/MPERF functions out of the
acpi-cpufreq code and into their own file so they can easily be
shared.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20100401141956.GA1930@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/Makefile       |  4 +--
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 44 ++------------------------
 arch/x86/kernel/cpu/cpufreq/mperf.c        | 51 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/cpufreq/mperf.h        |  9 ++++++
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c  |  8 +++++
 5 files changed, 72 insertions(+), 44 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.c
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170..bd54bf67e6f 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
 # K8 systems. ACPI is preferred to all other hardware-specific drivers.
 # speedstep-* is preferred over p4-clockmod.
 
-obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o
+obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o mperf.o
 obj-$(CONFIG_X86_PCC_CPUFREQ)		+= pcc-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c8..dc68e5c2c07 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -45,6 +45,7 @@
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
+#include "mperf.h"
 
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
 		"acpi-cpufreq", msg)
@@ -70,8 +71,6 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
@@ -239,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
 	return cmd.val;
 }
 
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
-	struct aperfmperf *am = _cur;
-
-	get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
-				      unsigned int cpu)
-{
-	struct aperfmperf perf;
-	unsigned long ratio;
-	unsigned int retval;
-
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
-		return 0;
-
-	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
-	per_cpu(acfreq_old_perf, cpu) = perf;
-
-	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
-	return retval;
-}
-
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
 	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -701,7 +661,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* Check for APERF/MPERF support in hardware */
 	if (cpu_has(c, X86_FEATURE_APERFMPERF))
-		acpi_cpufreq_driver.getavg = get_measured_perf;
+		acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
 
 	dprintk("CPU%u - ACPI performance management activated.\n", cpu);
 	for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 00000000000..911e193018a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+
+#include "mperf.h"
+
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
+{
+	struct aperfmperf *am = _cur;
+
+	get_aperfmperf(am);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu)
+{
+	struct aperfmperf perf;
+	unsigned long ratio;
+	unsigned int retval;
+
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
+		return 0;
+
+	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+	per_cpu(acfreq_old_perf, cpu) = perf;
+
+	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 00000000000..5dbf2950dc2
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ *  (c) 2010 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ */
+
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 74ca34b5c00..52fce638f44 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -45,6 +45,7 @@
 #define PFX "powernow-k8: "
 #define VERSION "version 2.20.00"
 #include "powernow-k8.h"
+#include "mperf.h"
 
 /* serialize freq changes  */
 static DEFINE_MUTEX(fidvid_mutex);
@@ -57,6 +58,8 @@ static int cpu_family = CPU_OPTERON;
 static bool cpb_capable, cpb_enabled;
 static struct msr __percpu *msrs;
 
+static struct cpufreq_driver cpufreq_amd64_driver;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1251,6 +1254,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	struct powernow_k8_data *data;
 	struct init_on_cpu init_on_cpu;
 	int rc;
+	struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
@@ -1325,6 +1329,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 		return -EINVAL;
 	}
 
+	/* Check for APERF/MPERF support in hardware */
+	if (cpu_has(c, X86_FEATURE_APERFMPERF))
+		cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
+
 	cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
 
 	if (cpu_family == CPU_HW_PSTATE)
-- 
cgit v1.2.3-70-g09d2


From 679370641e3675633cad222449262abbe93a4a2a Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Wed, 31 Mar 2010 21:56:45 +0200
Subject: powernow-k8: Fix frequency reporting

With F10, model 10, all valid frequencies are in the ACPI _PST table.

Cc: <stable@kernel.org> # 33.x 32.x
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <1270065406-1814-6-git-send-email-bp@amd64.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 52fce638f44..6f3dc8fbbfd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -935,7 +935,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 		powernow_table[i].index = index;
 
 		/* Frequency may be rounded for these */
-		if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+		if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
+				 || boot_cpu_data.x86 == 0x11) {
 			powernow_table[i].frequency =
 				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
 		} else
-- 
cgit v1.2.3-70-g09d2


From 3f10940e4fb69d312602078f2c5234206797ca31 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@relay.de.ibm.com>
Date: Sat, 10 Apr 2010 16:46:21 +0200
Subject: x86/microcode: Use nonseekable_open()

No need to seek on this file, so prevent it outright so we can
avoid using default_llseek - removes one more BKL usage.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
[drop useless llseek = no_llseek and smp_lock.h inclusion]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnd Bergmann <arnd@relay.de.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
LKML-Reference: <1270910781-8786-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c..2cd8c544e41 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
 	return error;
 }
 
-static int microcode_open(struct inode *unused1, struct file *unused2)
+static int microcode_open(struct inode *inode, struct file *file)
 {
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
 }
 
 static ssize_t microcode_write(struct file *file, const char __user *buf,
-- 
cgit v1.2.3-70-g09d2


From b8f7fb13d2d7ff14818fd1d3edd8b834d38b0217 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 14 Apr 2010 11:35:46 -0500
Subject: x86, UV: Improve BAU performance and error recovery

- increase performance of the interrupt handler

- release timed-out software acknowledge resources

- recover from continuous-busy status due to a hardware issue

- add a 'throttle' to keep a uvhub from sending more than a
  specified number of broadcasts concurrently (work around the hardware issue)

- provide a 'nobau' boot command line option

- rename 'pnode' and 'node' to 'uvhub' (the 'node' terminology
  is ambiguous)

- add some new statistics about the scope of broadcasts, retries, the
  hardware issue and the 'throttle'

- split off new function uv_bau_retry_msg() from
  uv_bau_process_message() per community coding style feedback.

- simplify the argument list to uv_bau_process_message(), per
  community coding style feedback.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: linux-mm@kvack.org
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <E1O25Z4-0004Ur-PB@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  247 +++++---
 arch/x86/kernel/tlb_uv.c         | 1270 +++++++++++++++++++++++++++-----------
 2 files changed, 1075 insertions(+), 442 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index b414d2b401f..aa558ac0306 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -27,13 +27,14 @@
  * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
  *
  * We will use 31 sets, one for sending BAU messages from each of the 32
- * cpu's on the node.
+ * cpu's on the uvhub.
  *
  * TLB shootdown will use the first of the 8 descriptors of each set.
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */
 
 #define UV_ITEMS_PER_DESCRIPTOR		8
+#define MAX_BAU_CONCURRENT		3
 #define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
@@ -45,6 +46,9 @@
 #define UV_PAYLOADQ_PNODE_SHIFT		49
 #define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
 #define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
+#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
 
 /*
  * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
@@ -55,15 +59,29 @@
 #define DESC_STATUS_SOURCE_TIMEOUT	3
 
 /*
- * source side thresholds at which message retries print a warning
+ * source side threshholds at which message retries print a warning
  */
 #define SOURCE_TIMEOUT_LIMIT		20
 #define DESTINATION_TIMEOUT_LIMIT	20
 
+/*
+ * misc. delays, in microseconds
+ */
+#define THROTTLE_DELAY			10
+#define TIMEOUT_DELAY			10
+#define BIOS_TO				1000
+/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
+
+/*
+ * threshholds at which to use IPI to free resources
+ */
+#define PLUGSB4RESET 100
+#define TIMEOUTSB4RESET 100
+
 /*
  * number of entries in the destination side payload queue
  */
-#define DEST_Q_SIZE			17
+#define DEST_Q_SIZE			20
 /*
  * number of destination side software ack resources
  */
@@ -72,9 +90,10 @@
 /*
  * completion statuses for sending a TLB flush message
  */
-#define	FLUSH_RETRY			1
-#define	FLUSH_GIVEUP			2
-#define	FLUSH_COMPLETE			3
+#define FLUSH_RETRY_PLUGGED		1
+#define FLUSH_RETRY_TIMEOUT		2
+#define FLUSH_GIVEUP			3
+#define FLUSH_COMPLETE			4
 
 /*
  * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -86,14 +105,14 @@
  * 'base_dest_nodeid' field of the header corresponds to the
  * destination nodeID associated with that specified bit.
  */
-struct bau_target_nodemask {
-	unsigned long bits[BITS_TO_LONGS(256)];
+struct bau_target_uvhubmask {
+	unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
 };
 
 /*
- * mask of cpu's on a node
+ * mask of cpu's on a uvhub
  * (during initialization we need to check that unsigned long has
- *  enough bits for max. cpu's per node)
+ *  enough bits for max. cpu's per uvhub)
  */
 struct bau_local_cpumask {
 	unsigned long bits;
@@ -135,8 +154,8 @@ struct bau_msg_payload {
 struct bau_msg_header {
 	unsigned int dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
-	unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
-	/* bits 20:6 */			  /* first bit in node_map */
+	unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */
+	/* bits 20:6 */			  /* first bit in uvhub map */
 	unsigned int command:8;	/* message type */
 	/* bits 28:21 */
 				/* 0x38: SN3net EndPoint Message */
@@ -146,26 +165,38 @@ struct bau_msg_header {
 	unsigned int rsvd_2:9;	/* must be zero */
 	/* bits 40:32 */
 				/* Suppl_A is 56-41 */
-	unsigned int payload_2a:8;/* becomes byte 16 of msg */
-	/* bits 48:41 */	/* not currently using */
-	unsigned int payload_2b:8;/* becomes byte 17 of msg */
-	/* bits 56:49 */	/* not currently using */
+	unsigned int sequence:16;/* message sequence number */
+	/* bits 56:41 */	/* becomes bytes 16-17 of msg */
 				/* Address field (96:57) is never used as an
 				   address (these are address bits 42:3) */
+
 	unsigned int rsvd_3:1;	/* must be zero */
 	/* bit 57 */
 				/* address bits 27:4 are payload */
-				/* these 24 bits become bytes 12-14 of msg */
+	/* these next 24  (58-81) bits become bytes 12-14 of msg */
+
+	/* bits 65:58 land in byte 12 */
 	unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
 	/* bit 58 */
-
-	unsigned int payload_1a:5;/* not currently used */
-	/* bits 63:59 */
-	unsigned int payload_1b:8;/* not currently used */
-	/* bits 71:64 */
-	unsigned int payload_1c:8;/* not currently used */
-	/* bits 79:72 */
-	unsigned int payload_1d:2;/* not currently used */
+	unsigned int msg_type:3; /* software type of the message*/
+	/* bits 61:59 */
+	unsigned int canceled:1; /* message canceled, resource to be freed*/
+	/* bit 62 */
+	unsigned int payload_1a:1;/* not currently used */
+	/* bit 63 */
+	unsigned int payload_1b:2;/* not currently used */
+	/* bits 65:64 */
+
+	/* bits 73:66 land in byte 13 */
+	unsigned int payload_1ca:6;/* not currently used */
+	/* bits 71:66 */
+	unsigned int payload_1c:2;/* not currently used */
+	/* bits 73:72 */
+
+	/* bits 81:74 land in byte 14 */
+	unsigned int payload_1d:6;/* not currently used */
+	/* bits 79:74 */
+	unsigned int payload_1e:2;/* not currently used */
 	/* bits 81:80 */
 
 	unsigned int rsvd_4:7;	/* must be zero */
@@ -178,7 +209,7 @@ struct bau_msg_header {
 	/* bits 95:90 */
 	unsigned int rsvd_6:5;	/* must be zero */
 	/* bits 100:96 */
-	unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
+	unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */
 	/* bit 101*/
 	unsigned int fairness:3;/* usually zero */
 	/* bits 104:102 */
@@ -191,13 +222,18 @@ struct bau_msg_header {
 	/* bits 127:107 */
 };
 
+/* see msg_type: */
+#define MSG_NOOP 0
+#define MSG_REGULAR 1
+#define MSG_RETRY 2
+
 /*
  * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct bau_target_nodemask distribution;
+	struct bau_target_uvhubmask distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
@@ -237,19 +273,25 @@ struct bau_payload_queue_entry {
 	unsigned short acknowledge_count; /* filled in by destination */
 	/* 16 bits, bytes 10-11 */
 
-	unsigned short replied_to:1;	/* sent as 0 by the source */
-	/* 1 bit */
-	unsigned short unused1:7;       /* not currently using */
-	/* 7 bits: byte 12) */
+	/* these next 3 bytes come from bits 58-81 of the message header */
+	unsigned short replied_to:1;    /* sent as 0 by the source */
+	unsigned short msg_type:3;      /* software message type */
+	unsigned short canceled:1;      /* sent as 0 by the source */
+	unsigned short unused1:3;       /* not currently using */
+	/* byte 12 */
 
-	unsigned char unused2[2];	/* not currently using */
-	/* bytes 13-14 */
+	unsigned char unused2a;		/* not currently using */
+	/* byte 13 */
+	unsigned char unused2;		/* not currently using */
+	/* byte 14 */
 
 	unsigned char sw_ack_vector;	/* filled in by the hardware */
 	/* byte 15 (bits 127:120) */
 
-	unsigned char unused4[3];	/* not currently using bytes 17-19 */
-	/* bytes 17-19 */
+	unsigned short sequence;	/* message sequence number */
+	/* bytes 16-17 */
+	unsigned char unused4[2];	/* not currently using bytes 18-19 */
+	/* bytes 18-19 */
 
 	int number_of_cpus;		/* filled in at destination */
 	/* 32 bits, bytes 20-23 (aligned) */
@@ -259,63 +301,93 @@ struct bau_payload_queue_entry {
 };
 
 /*
- * one for every slot in the destination payload queue
- */
-struct bau_msg_status {
-	struct bau_local_cpumask seen_by;	/* map of cpu's */
-};
-
-/*
- * one for every slot in the destination software ack resources
- */
-struct bau_sw_ack_status {
-	struct bau_payload_queue_entry *msg;	/* associated message */
-	int watcher;				/* cpu monitoring, or -1 */
-};
-
-/*
- * one on every node and per-cpu; to locate the software tables
+ * one per-cpu; to locate the software tables
  */
 struct bau_control {
 	struct bau_desc *descriptor_base;
-	struct bau_payload_queue_entry *bau_msg_head;
 	struct bau_payload_queue_entry *va_queue_first;
 	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_msg_status *msg_statuses;
-	int *watching; /* pointer to array */
+	struct bau_payload_queue_entry *bau_msg_head;
+	struct bau_control *uvhub_master;
+	struct bau_control *socket_master;
+	unsigned long timeout_interval;
+	atomic_t active_descriptor_count;
+	int max_concurrent;
+	int max_concurrent_constant;
+	int retry_message_scans;
+	int plugged_tries;
+	int timeout_tries;
+	int ipi_attempts;
+	int conseccompletes;
+	short cpu;
+	short uvhub_cpu;
+	short uvhub;
+	short cpus_in_socket;
+	short cpus_in_uvhub;
+	unsigned short message_number;
+	unsigned short uvhub_quiesce;
+	short socket_acknowledge_count[DEST_Q_SIZE];
+	cycles_t send_message;
+	spinlock_t masks_lock;
+	spinlock_t uvhub_lock;
+	spinlock_t queue_lock;
 };
 
 /*
  * This structure is allocated per_cpu for UV TLB shootdown statistics.
  */
 struct ptc_stats {
-	unsigned long ptc_i;	/* number of IPI-style flushes */
-	unsigned long requestor;	/* number of nodes this cpu sent to */
-	unsigned long requestee;	/* times cpu was remotely requested */
-	unsigned long alltlb;	/* times all tlb's on this cpu were flushed */
-	unsigned long onetlb;	/* times just one tlb on this cpu was flushed */
-	unsigned long s_retry;	/* retries on source side timeouts */
-	unsigned long d_retry;	/* retries on destination side timeouts */
-	unsigned long sflush;	/* cycles spent in uv_flush_tlb_others */
-	unsigned long dflush;	/* cycles spent on destination side */
-	unsigned long retriesok; /* successes on retries */
-	unsigned long nomsg;	/* interrupts with no message */
-	unsigned long multmsg;	/* interrupts with multiple messages */
-	unsigned long ntargeted;/* nodes targeted */
+	/* sender statistics */
+	unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
+	unsigned long s_requestor; /* number of shootdown requests */
+	unsigned long s_stimeout; /* source side timeouts */
+	unsigned long s_dtimeout; /* destination side timeouts */
+	unsigned long s_time; /* time spent in sending side */
+	unsigned long s_retriesok; /* successful retries */
+	unsigned long s_ntargcpu; /* number of cpus targeted */
+	unsigned long s_ntarguvhub; /* number of uvhubs targeted */
+	unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */
+	unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */
+	unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */
+	unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */
+	unsigned long s_ntarguvhub1; /* number of times == 1 target hub */
+	unsigned long s_resets_plug; /* ipi-style resets from plug state */
+	unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
+	unsigned long s_busy; /* status stayed busy past s/w timer */
+	unsigned long s_throttles; /* waits in throttle */
+	unsigned long s_retry_messages; /* retry broadcasts */
+	/* destination statistics */
+	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
+	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
+	unsigned long d_multmsg; /* interrupts with multiple messages */
+	unsigned long d_nomsg; /* interrupts with no message */
+	unsigned long d_time; /* time spent on destination side */
+	unsigned long d_requestee; /* number of messages processed */
+	unsigned long d_retries; /* number of retry messages processed */
+	unsigned long d_canceled; /* number of messages canceled by retries */
+	unsigned long d_nocanceled; /* retries that found nothing to cancel */
+	unsigned long d_resets; /* number of ipi-style requests processed */
+	unsigned long d_rcanceled; /* number of messages canceled by resets */
 };
 
-static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
+static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
 {
-	return constant_test_bit(node, &dstp->bits[0]);
+	return constant_test_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
+static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
 {
-	__set_bit(node, &dstp->bits[0]);
+	__set_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
+static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
+				    int nbits)
 {
 	bitmap_zero(&dstp->bits[0], nbits);
 }
+static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp)
+{
+	return bitmap_weight((unsigned long *)&dstp->bits[0],
+				UV_DISTRIBUTION_SIZE);
+}
 
 static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 {
@@ -328,4 +400,35 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 extern void uv_bau_message_intr1(void);
 extern void uv_bau_timeout_intr1(void);
 
+struct atomic_short {
+	short counter;
+};
+
+/**
+ * atomic_read_short - read a short atomic variable
+ * @v: pointer of type atomic_short
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read_short(const struct atomic_short *v)
+{
+	return v->counter;
+}
+
+/**
+ * atomic_add_short_return - add and return a short int
+ * @i: short value to add
+ * @v: pointer of type atomic_short
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_short_return(short i, struct atomic_short *v)
+{
+	short __i = i;
+	asm volatile(LOCK_PREFIX "xaddw %0, %1"
+			: "+r" (i), "+m" (v->counter)
+			: : "memory");
+	return i + __i;
+}
+
 #endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ef68ba48564..414f7c4fe76 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
  *	SGI UltraViolet TLB flush routines.
  *
- *	(c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
  *
  *	This code is released under the GNU General Public License version 2 or
  *	later.
@@ -19,44 +19,67 @@
 #include <asm/idle.h>
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
+#include <asm/timer.h>
+
+struct msg_desc {
+	struct bau_payload_queue_entry *msg;
+	int msg_slot;
+	int sw_ack_slot;
+	struct bau_payload_queue_entry *va_queue_first;
+	struct bau_payload_queue_entry *va_queue_last;
+};
 
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
 
-static struct bau_control	**uv_bau_table_bases __read_mostly;
-static int			uv_bau_retry_limit __read_mostly;
+static int uv_bau_max_concurrent __read_mostly;
 
-/* base pnode in this partition */
-static int			uv_partition_base_pnode __read_mostly;
+static int nobau;
+static int __init setup_nobau(char *arg)
+{
+	nobau = 1;
+	return 0;
+}
+early_param("nobau", setup_nobau);
 
-static unsigned long		uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+static unsigned long uv_mmask __read_mostly;
 
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
+struct reset_args {
+	int sender;
+};
 
 /*
- * Determine the first node on a blade.
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
  */
-static int __init blade_to_first_node(int blade)
+static int __init uvhub_to_first_node(int uvhub)
 {
 	int node, b;
 
 	for_each_online_node(node) {
 		b = uv_node_to_blade_id(node);
-		if (blade == b)
+		if (uvhub == b)
 			return node;
 	}
-	return -1; /* shouldn't happen */
+	return -1;
 }
 
 /*
- * Determine the apicid of the first cpu on a blade.
+ * Determine the apicid of the first cpu on a uvhub.
  */
-static int __init blade_to_first_apicid(int blade)
+static int __init uvhub_to_first_apicid(int uvhub)
 {
 	int cpu;
 
 	for_each_present_cpu(cpu)
-		if (blade == uv_cpu_to_blade_id(cpu))
+		if (uvhub == uv_cpu_to_blade_id(cpu))
 			return per_cpu(x86_cpu_to_apicid, cpu);
 	return -1;
 }
@@ -69,195 +92,459 @@ static int __init blade_to_first_apicid(int blade)
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void uv_reply_to_message(int resource,
-				struct bau_payload_queue_entry *msg,
-				struct bau_msg_status *msp)
+static inline void uv_reply_to_message(struct msg_desc *mdp,
+				       struct bau_control *bcp)
 {
 	unsigned long dw;
+	struct bau_payload_queue_entry *msg;
 
-	dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+	msg = mdp->msg;
+	if (!msg->canceled) {
+		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
+						msg->sw_ack_vector;
+		uv_write_local_mmr(
+				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+	}
 	msg->replied_to = 1;
 	msg->sw_ack_vector = 0;
-	if (msp)
-		msp->seen_by.bits = 0;
-	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
 }
 
 /*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
+ * Process the receipt of a RETRY message
  */
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
-				   int msg_slot, int sw_ack_slot)
+static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
+					    struct bau_control *bcp)
 {
-	unsigned long this_cpu_mask;
-	struct bau_msg_status *msp;
-	int cpu;
+	int i;
+	int cancel_count = 0;
+	int slot2;
+	unsigned long msg_res;
+	unsigned long mmr = 0;
+	struct bau_payload_queue_entry *msg;
+	struct bau_payload_queue_entry *msg2;
+	struct ptc_stats *stat;
 
-	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
-	cpu = uv_blade_processor_id();
-	msg->number_of_cpus =
-		uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
-	this_cpu_mask = 1UL << cpu;
-	if (msp->seen_by.bits & this_cpu_mask)
-		return;
-	atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_retries++;
+	/*
+	 * cancel any message from msg+1 to the retry itself
+	 */
+	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
+		if (msg2 > mdp->va_queue_last)
+			msg2 = mdp->va_queue_first;
+		if (msg2 == msg)
+			break;
+
+		/* same conditions for cancellation as uv_do_reset */
+		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
+			msg->sw_ack_vector) == 0) &&
+		    (msg2->sending_cpu == msg->sending_cpu) &&
+		    (msg2->msg_type != MSG_NOOP)) {
+			slot2 = msg2 - mdp->va_queue_first;
+			mmr = uv_read_local_mmr
+				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg2->sw_ack_vector << 8) |
+				   msg2->sw_ack_vector);
+			/*
+			 * This is a message retry; clear the resources held
+			 * by the previous message only if they timed out.
+			 * If it has not timed out we have an unexpected
+			 * situation to report.
+			 */
+			if (mmr & (msg_res << 8)) {
+				/*
+				 * is the resource timed out?
+				 * make everyone ignore the cancelled message.
+				 */
+				msg2->canceled = 1;
+				stat->d_canceled++;
+				cancel_count++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+					(msg_res << 8) | msg_res);
+			} else
+				printk(KERN_INFO "note bau retry: no effect\n");
+		}
+	}
+	if (!cancel_count)
+		stat->d_nocanceled++;
+}
 
-	if (msg->replied_to == 1)
-		return;
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct msg_desc *mdp,
+				   struct bau_control *bcp)
+{
+	int msg_ack_count;
+	short socket_ack_count = 0;
+	struct ptc_stats *stat;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *smaster = bcp->socket_master;
 
+	/*
+	 * This must be a normal message, or retry of a normal message
+	 */
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
 	if (msg->address == TLB_FLUSH_ALL) {
 		local_flush_tlb();
-		__get_cpu_var(ptcstats).alltlb++;
+		stat->d_alltlb++;
 	} else {
 		__flush_tlb_one(msg->address);
-		__get_cpu_var(ptcstats).onetlb++;
+		stat->d_onetlb++;
 	}
+	stat->d_requestee++;
+
+	/*
+	 * One cpu on each uvhub has the additional job on a RETRY
+	 * of releasing the resource held by the message that is
+	 * being retried.  That message is identified by sending
+	 * cpu number.
+	 */
+	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+		uv_bau_process_retry_msg(mdp, bcp);
 
-	__get_cpu_var(ptcstats).requestee++;
+	/*
+	 * This is a sw_ack message, so we have to reply to it.
+	 * Count each responding cpu on the socket. This avoids
+	 * pinging the count's cache line back and forth between
+	 * the sockets.
+	 */
+	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
+			&smaster->socket_acknowledge_count[mdp->msg_slot]);
+	if (socket_ack_count == bcp->cpus_in_socket) {
+		/*
+		 * Both sockets dump their completed count total into
+		 * the message's count.
+		 */
+		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+		msg_ack_count = atomic_add_short_return(socket_ack_count,
+				(struct atomic_short *)&msg->acknowledge_count);
+
+		if (msg_ack_count == bcp->cpus_in_uvhub) {
+			/*
+			 * All cpus in uvhub saw it; reply
+			 */
+			uv_reply_to_message(mdp, bcp);
+		}
+	}
 
-	atomic_inc_short(&msg->acknowledge_count);
-	if (msg->number_of_cpus == msg->acknowledge_count)
-		uv_reply_to_message(sw_ack_slot, msg, msp);
+	return;
 }
 
 /*
- * Examine the payload queue on one distribution node to see
- * which messages have not been seen, and which cpu(s) have not seen them.
+ * Determine the first cpu on a uvhub.
+ */
+static int uvhub_to_first_cpu(int uvhub)
+{
+	int cpu;
+	for_each_present_cpu(cpu)
+		if (uvhub == uv_cpu_to_blade_id(cpu))
+			return cpu;
+	return -1;
+}
+
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero sw_ack_vector field.
  *
- * Returns the number of cpu's that have not responded.
+ * This is entered for a single cpu on the uvhub.
+ * The sender want's this uvhub to free a specific message's
+ * sw_ack resources.
  */
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+static void
+uv_do_reset(void *ptr)
 {
-	struct bau_payload_queue_entry *msg;
-	struct bau_msg_status *msp;
-	int count = 0;
 	int i;
-	int j;
+	int slot;
+	int count = 0;
+	unsigned long mmr;
+	unsigned long msg_res;
+	struct bau_control *bcp;
+	struct reset_args *rap;
+	struct bau_payload_queue_entry *msg;
+	struct ptc_stats *stat;
 
-	for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
-	     msg++, i++) {
-		if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
-			msp = bau_tablesp->msg_statuses + i;
-			printk(KERN_DEBUG
-			       "blade %d: address:%#lx %d of %d, not cpu(s): ",
-			       i, msg->address, msg->acknowledge_count,
-			       msg->number_of_cpus);
-			for (j = 0; j < msg->number_of_cpus; j++) {
-				if (!((1L << j) & msp->seen_by.bits)) {
-					count++;
-					printk("%d ", j);
-				}
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	rap = (struct reset_args *)ptr;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_resets++;
+
+	/*
+	 * We're looking for the given sender, and
+	 * will free its sw_ack resource.
+	 * If all cpu's finally responded after the timeout, its
+	 * message 'replied_to' was set.
+	 */
+	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+		/* uv_do_reset: same conditions for cancellation as
+		   uv_bau_process_retry_msg() */
+		if ((msg->replied_to == 0) &&
+		    (msg->canceled == 0) &&
+		    (msg->sending_cpu == rap->sender) &&
+		    (msg->sw_ack_vector) &&
+		    (msg->msg_type != MSG_NOOP)) {
+			/*
+			 * make everyone else ignore this message
+			 */
+			msg->canceled = 1;
+			slot = msg - bcp->va_queue_first;
+			count++;
+			/*
+			 * only reset the resource if it is still pending
+			 */
+			mmr = uv_read_local_mmr
+					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg->sw_ack_vector << 8) |
+						   msg->sw_ack_vector);
+			if (mmr & msg_res) {
+				stat->d_rcanceled++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+							msg_res);
 			}
-			printk("\n");
 		}
 	}
-	return count;
+	return;
 }
 
 /*
- * Examine the payload queue on all the distribution nodes to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
+ * Use IPI to get all target uvhubs to release resources held by
+ * a given sending cpu number.
  */
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
+			      int sender)
 {
-	int sender;
-	int i;
-	int count = 0;
+	int uvhub;
+	int cpu;
+	cpumask_t mask;
+	struct reset_args reset_args;
 
-	sender = smp_processor_id();
-	for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
-		if (!bau_node_isset(i, distribution))
+	reset_args.sender = sender;
+
+	cpus_clear(mask);
+	/* find a single cpu for each uvhub in this distribution mask */
+	for (uvhub = 0;
+		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
+		    uvhub++) {
+		if (!bau_uvhub_isset(uvhub, distribution))
 			continue;
-		count += uv_examine_destination(uv_bau_table_bases[i], sender);
+		/* find a cpu for this uvhub */
+		cpu = uvhub_to_first_cpu(uvhub);
+		cpu_set(cpu, mask);
 	}
-	return count;
+	/* IPI all cpus; Preemption is already disabled */
+	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+	return;
+}
+
+static inline unsigned long
+cycles_2_us(unsigned long long cyc)
+{
+	unsigned long long ns;
+	unsigned long us;
+	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id()))
+						>> CYC2NS_SCALE_FACTOR;
+	us = ns / 1000;
+	return us;
 }
 
 /*
- * wait for completion of a broadcast message
- *
- * return COMPLETE, RETRY or GIVEUP
+ * wait for all cpus on this hub to finish their sends and go quiet
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
+ * bau_flush_send_and_wait()
+ */
+static inline void
+quiesce_local_uvhub(struct bau_control *hmaster)
+{
+	atomic_add_short_return(1, (struct atomic_short *)
+		 &hmaster->uvhub_quiesce);
+}
+
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void
+end_uvhub_quiesce(struct bau_control *hmaster)
+{
+	atomic_add_short_return(-1, (struct atomic_short *)
+		&hmaster->uvhub_quiesce);
+}
+
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
  */
 static int uv_wait_completion(struct bau_desc *bau_desc,
-			      unsigned long mmr_offset, int right_shift)
+	unsigned long mmr_offset, int right_shift, int this_cpu,
+	struct bau_control *bcp, struct bau_control *smaster, long try)
 {
-	int exams = 0;
-	long destination_timeouts = 0;
-	long source_timeouts = 0;
+	int relaxes = 0;
 	unsigned long descriptor_status;
+	unsigned long mmr;
+	unsigned long mask;
+	cycles_t ttime;
+	cycles_t timeout_time;
+	struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+	struct bau_control *hmaster;
 
+	hmaster = bcp->uvhub_master;
+	timeout_time = get_cycles() + bcp->timeout_interval;
+
+	/* spin on the status MMR, waiting for it to go idle */
 	while ((descriptor_status = (((unsigned long)
 		uv_read_local_mmr(mmr_offset) >>
 			right_shift) & UV_ACT_STATUS_MASK)) !=
 			DESC_STATUS_IDLE) {
-		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
-			source_timeouts++;
-			if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
-				source_timeouts = 0;
-			__get_cpu_var(ptcstats).s_retry++;
-			return FLUSH_RETRY;
-		}
 		/*
-		 * spin here looking for progress at the destinations
+		 * Our software ack messages may be blocked because there are
+		 * no swack resources available.  As long as none of them
+		 * has timed out hardware will NACK our message and its
+		 * state will stay IDLE.
 		 */
-		if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
-			destination_timeouts++;
-			if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
-				/*
-				 * returns number of cpus not responding
-				 */
-				if (uv_examine_destinations
-				    (&bau_desc->distribution) == 0) {
-					__get_cpu_var(ptcstats).d_retry++;
-					return FLUSH_RETRY;
-				}
-				exams++;
-				if (exams >= uv_bau_retry_limit) {
-					printk(KERN_DEBUG
-					       "uv_flush_tlb_others");
-					printk("giving up on cpu %d\n",
-					       smp_processor_id());
+		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+			stat->s_stimeout++;
+			return FLUSH_GIVEUP;
+		} else if (descriptor_status ==
+					DESC_STATUS_DESTINATION_TIMEOUT) {
+			stat->s_dtimeout++;
+			ttime = get_cycles();
+
+			/*
+			 * Our retries may be blocked by all destination
+			 * swack resources being consumed, and a timeout
+			 * pending.  In that case hardware returns the
+			 * ERROR that looks like a destination timeout.
+			 */
+			if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+				bcp->conseccompletes = 0;
+				return FLUSH_RETRY_PLUGGED;
+			}
+
+			bcp->conseccompletes = 0;
+			return FLUSH_RETRY_TIMEOUT;
+		} else {
+			/*
+			 * descriptor_status is still BUSY
+			 */
+			cpu_relax();
+			relaxes++;
+			if (relaxes >= 10000) {
+				relaxes = 0;
+				if (get_cycles() > timeout_time) {
+					quiesce_local_uvhub(hmaster);
+
+					/* single-thread the register change */
+					spin_lock(&hmaster->masks_lock);
+					mmr = uv_read_local_mmr(mmr_offset);
+					mask = 0UL;
+					mask |= (3UL < right_shift);
+					mask = ~mask;
+					mmr &= mask;
+					uv_write_local_mmr(mmr_offset, mmr);
+					spin_unlock(&hmaster->masks_lock);
+					end_uvhub_quiesce(hmaster);
+					stat->s_busy++;
 					return FLUSH_GIVEUP;
 				}
-				/*
-				 * delays can hang the simulator
-				   udelay(1000);
-				 */
-				destination_timeouts = 0;
 			}
 		}
-		cpu_relax();
 	}
+	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
 }
 
+static inline cycles_t
+sec_2_cycles(unsigned long sec)
+{
+	unsigned long ns;
+	cycles_t cyc;
+
+	ns = sec * 1000000000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'.  atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+	spin_lock(lock);
+	if (atomic_read(v) >= u) {
+		spin_unlock(lock);
+		return 0;
+	}
+	atomic_inc(v);
+	spin_unlock(lock);
+	return 1;
+}
+
 /**
  * uv_flush_send_and_wait
  *
- * Send a broadcast and wait for a broadcast message to complete.
+ * Send a broadcast and wait for it to complete.
  *
- * The flush_mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * cpus that are on the local uvhub.
  *
- * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns NULL if all flushing represented in the mask was done. The mask
+ * is zeroed.
  * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
+ * mask will have some bits still set, representing any cpus on the local
+ * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
  */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
-					     struct bau_desc *bau_desc,
-					     struct cpumask *flush_mask)
+const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
+					     struct cpumask *flush_mask,
+					     struct bau_control *bcp)
 {
-	int completion_status = 0;
 	int right_shift;
-	int tries = 0;
-	int pnode;
+	int uvhub;
 	int bit;
+	int completion_status = 0;
+	int seq_number = 0;
+	long try = 0;
+	int cpu = bcp->uvhub_cpu;
+	int this_cpu = bcp->cpu;
+	int this_uvhub = bcp->uvhub;
 	unsigned long mmr_offset;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
+	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+	struct bau_control *smaster = bcp->socket_master;
+	struct bau_control *hmaster = bcp->uvhub_master;
+
+	/*
+	 * Spin here while there are hmaster->max_concurrent or more active
+	 * descriptors. This is the per-uvhub 'throttle'.
+	 */
+	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent)) {
+		stat->s_throttles++;
+		do {
+			cpu_relax();
+		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent));
+	}
+
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
 
 	if (cpu < UV_CPUS_PER_ACT_STATUS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -269,24 +556,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	}
 	time1 = get_cycles();
 	do {
-		tries++;
+		/*
+		 * Every message from any given cpu gets a unique message
+		 * sequence number. But retries use that same number.
+		 * Our message may have timed out at the destination because
+		 * all sw-ack resources are in use and there is a timeout
+		 * pending there.  In that case, our last send never got
+		 * placed into the queue and we need to persist until it
+		 * does.
+		 *
+		 * Make any retry a type MSG_RETRY so that the destination will
+		 * free any resource held by a previous message from this cpu.
+		 */
+		if (try == 0) {
+			/* use message type set by the caller the first time */
+			seq_number = bcp->message_number++;
+		} else {
+			/* use RETRY type on all the rest; same sequence */
+			bau_desc->header.msg_type = MSG_RETRY;
+			stat->s_retry_messages++;
+		}
+		bau_desc->header.sequence = seq_number;
 		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
-			cpu;
+			bcp->uvhub_cpu;
+		bcp->send_message = get_cycles();
+
 		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+		try++;
 		completion_status = uv_wait_completion(bau_desc, mmr_offset,
-					right_shift);
-	} while (completion_status == FLUSH_RETRY);
+			right_shift, this_cpu, bcp, smaster, try);
+
+		if (completion_status == FLUSH_RETRY_PLUGGED) {
+			/*
+			 * Our retries may be blocked by all destination swack
+			 * resources being consumed, and a timeout pending. In
+			 * that case hardware immediately returns the ERROR
+			 * that looks like a destination timeout.
+			 */
+			udelay(TIMEOUT_DELAY);
+			bcp->plugged_tries++;
+			if (bcp->plugged_tries >= PLUGSB4RESET) {
+				bcp->plugged_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+							this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_plug++;
+			}
+		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
+			hmaster->max_concurrent = 1;
+			bcp->timeout_tries++;
+			udelay(TIMEOUT_DELAY);
+			if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+				bcp->timeout_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+								this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_timeout++;
+			}
+		}
+		if (bcp->ipi_attempts >= 3) {
+			bcp->ipi_attempts = 0;
+			completion_status = FLUSH_GIVEUP;
+			break;
+		}
+		cpu_relax();
+	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
+		 (completion_status == FLUSH_RETRY_TIMEOUT));
 	time2 = get_cycles();
-	__get_cpu_var(ptcstats).sflush += (time2 - time1);
-	if (tries > 1)
-		__get_cpu_var(ptcstats).retriesok++;
 
-	if (completion_status == FLUSH_GIVEUP) {
+	if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
+	    && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
+			hmaster->max_concurrent++;
+
+	/*
+	 * hold any cpu not timing out here; no other cpu currently held by
+	 * the 'throttle' should enter the activation code
+	 */
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
+	atomic_dec(&hmaster->active_descriptor_count);
+
+	/* guard against cycles wrap */
+	if (time2 > time1)
+		stat->s_time += (time2 - time1);
+	else
+		stat->s_requestor--; /* don't count this one */
+	if (completion_status == FLUSH_COMPLETE && try > 1)
+		stat->s_retriesok++;
+	else if (completion_status == FLUSH_GIVEUP) {
 		/*
 		 * Cause the caller to do an IPI-style TLB shootdown on
-		 * the cpu's, all of which are still in the mask.
+		 * the target cpu's, all of which are still in the mask.
 		 */
-		__get_cpu_var(ptcstats).ptc_i++;
+		stat->s_giveup++;
 		return flush_mask;
 	}
 
@@ -295,18 +666,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	 * use the IPI method of shootdown on them.
 	 */
 	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		if (pnode == this_pnode)
+		uvhub = uv_cpu_to_blade_id(bit);
+		if (uvhub == this_uvhub)
 			continue;
 		cpumask_clear_cpu(bit, flush_mask);
 	}
 	if (!cpumask_empty(flush_mask))
 		return flush_mask;
+
 	return NULL;
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
 /**
  * uv_flush_tlb_others - globally purge translation cache of a virtual
  * address or all TLB's
@@ -323,8 +693,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
  * The caller has derived the cpumask from the mm_struct.  This function
  * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
  *
- * The cpumask is converted into a nodemask of the nodes containing
- * the cpus.
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
+ * those cpus.
  *
  * Note that this function should be called with preemption disabled.
  *
@@ -336,52 +706,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 					  struct mm_struct *mm,
 					  unsigned long va, unsigned int cpu)
 {
-	struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
-	int i;
-	int bit;
-	int pnode;
-	int uv_cpu;
-	int this_pnode;
+	int remotes;
+	int tcpu;
+	int uvhub;
 	int locals = 0;
 	struct bau_desc *bau_desc;
+	struct cpumask *flush_mask;
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
-	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (nobau)
+		return cpumask;
 
-	uv_cpu = uv_blade_processor_id();
-	this_pnode = uv_hub_info->pnode;
-	bau_desc = __get_cpu_var(bau_control).descriptor_base;
-	bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
+	bcp = &per_cpu(bau_control, cpu);
+	/*
+	 * Each sending cpu has a per-cpu mask which it fills from the caller's
+	 * cpu mask.  Only remote cpus are converted to uvhubs and copied.
+	 */
+	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+	/*
+	 * copy cpumask to flush_mask, removing current cpu
+	 * (current cpu should already have been flushed by the caller and
+	 *  should never be returned if we return flush_mask)
+	 */
+	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (cpu_isset(cpu, *cpumask))
+		locals++;  /* current cpu was targeted */
 
-	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	bau_desc = bcp->descriptor_base;
+	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
 
-	i = 0;
-	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
-		if (pnode == this_pnode) {
+	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	remotes = 0;
+	for_each_cpu(tcpu, flush_mask) {
+		uvhub = uv_cpu_to_blade_id(tcpu);
+		if (uvhub == bcp->uvhub) {
 			locals++;
 			continue;
 		}
-		bau_node_set(pnode - uv_partition_base_pnode,
-				&bau_desc->distribution);
-		i++;
+		bau_uvhub_set(uvhub, &bau_desc->distribution);
+		remotes++;
 	}
-	if (i == 0) {
+	if (remotes == 0) {
 		/*
-		 * no off_node flushing; return status for local node
+		 * No off_hub flushing; return status for local hub.
+		 * Return the caller's mask if all were local (the current
+		 * cpu may be in that mask).
 		 */
 		if (locals)
-			return flush_mask;
+			return cpumask;
 		else
 			return NULL;
 	}
-	__get_cpu_var(ptcstats).requestor++;
-	__get_cpu_var(ptcstats).ntargeted += i;
+	stat = &per_cpu(ptcstats, cpu);
+	stat->s_requestor++;
+	stat->s_ntargcpu += remotes;
+	remotes = bau_uvhub_weight(&bau_desc->distribution);
+	stat->s_ntarguvhub += remotes;
+	if (remotes >= 16)
+		stat->s_ntarguvhub16++;
+	else if (remotes >= 8)
+		stat->s_ntarguvhub8++;
+	else if (remotes >= 4)
+		stat->s_ntarguvhub4++;
+	else if (remotes >= 2)
+		stat->s_ntarguvhub2++;
+	else
+		stat->s_ntarguvhub1++;
 
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = cpu;
 
-	return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
+	/*
+	 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
+	 * the adjusted flush_mask if any cpu's were not messaged.
+	 */
+	return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
 }
 
 /*
@@ -390,87 +790,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
  *
  * We received a broadcast assist message.
  *
- * Interrupts may have been disabled; this interrupt could represent
+ * Interrupts are disabled; this interrupt could represent
  * the receipt of several messages.
  *
- * All cores/threads on this node get this interrupt.
- * The last one to see it does the s/w ack.
+ * All cores/threads on this hub get this interrupt.
+ * The last one to see it does the software ack.
  * (the resource will not be freed until noninterruptable cpus see this
- *  interrupt; hardware will timeout the s/w ack and reply ERROR)
+ *  interrupt; hardware may timeout the s/w ack and reply ERROR)
  */
 void uv_bau_message_interrupt(struct pt_regs *regs)
 {
-	struct bau_payload_queue_entry *va_queue_first;
-	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_payload_queue_entry *msg;
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	cycles_t time1;
-	cycles_t time2;
-	int msg_slot;
-	int sw_ack_slot;
-	int fw;
 	int count = 0;
-	unsigned long local_pnode;
-
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
-	time1 = get_cycles();
-
-	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
-
-	va_queue_first = __get_cpu_var(bau_control).va_queue_first;
-	va_queue_last = __get_cpu_var(bau_control).va_queue_last;
-
-	msg = __get_cpu_var(bau_control).bau_msg_head;
+	cycles_t time_start;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *bcp;
+	struct ptc_stats *stat;
+	struct msg_desc msgdesc;
+
+	time_start = get_cycles();
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	stat = &per_cpu(ptcstats, smp_processor_id());
+	msgdesc.va_queue_first = bcp->va_queue_first;
+	msgdesc.va_queue_last = bcp->va_queue_last;
+	msg = bcp->bau_msg_head;
 	while (msg->sw_ack_vector) {
 		count++;
-		fw = msg->sw_ack_vector;
-		msg_slot = msg - va_queue_first;
-		sw_ack_slot = ffs(fw) - 1;
-
-		uv_bau_process_message(msg, msg_slot, sw_ack_slot);
-
+		msgdesc.msg_slot = msg - msgdesc.va_queue_first;
+		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+		msgdesc.msg = msg;
+		uv_bau_process_message(&msgdesc, bcp);
 		msg++;
-		if (msg > va_queue_last)
-			msg = va_queue_first;
-		__get_cpu_var(bau_control).bau_msg_head = msg;
+		if (msg > msgdesc.va_queue_last)
+			msg = msgdesc.va_queue_first;
+		bcp->bau_msg_head = msg;
 	}
+	stat->d_time += (get_cycles() - time_start);
 	if (!count)
-		__get_cpu_var(ptcstats).nomsg++;
+		stat->d_nomsg++;
 	else if (count > 1)
-		__get_cpu_var(ptcstats).multmsg++;
-
-	time2 = get_cycles();
-	__get_cpu_var(ptcstats).dflush += (time2 - time1);
-
-	irq_exit();
-	set_irq_regs(old_regs);
+		stat->d_multmsg++;
+	ack_APIC_irq();
 }
 
 /*
  * uv_enable_timeouts
  *
- * Each target blade (i.e. blades that have cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
  * shootdown message timeouts enabled.  The timeout does not cause
  * an interrupt, but causes an error message to be returned to
  * the sender.
  */
 static void uv_enable_timeouts(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int nuvhubs;
 	int pnode;
 	unsigned long mmr_image;
 
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	for (blade = 0; blade < nblades; blade++) {
-		if (!uv_blade_nr_possible_cpus(blade))
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+		if (!uv_blade_nr_possible_cpus(uvhub))
 			continue;
 
-		pnode = uv_blade_to_pnode(blade);
+		pnode = uv_blade_to_pnode(uvhub);
 		mmr_image =
 		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
 		/*
@@ -523,9 +906,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
+static inline unsigned long long
+millisec_2_cycles(unsigned long millisec)
+{
+	unsigned long ns;
+	unsigned long long cyc;
+
+	ns = millisec * 1000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
 /*
- * Display the statistics thru /proc
- * data points to the cpu number
+ * Display the statistics thru /proc.
+ * 'data' points to the cpu number
  */
 static int uv_ptc_seq_show(struct seq_file *file, void *data)
 {
@@ -536,78 +930,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 
 	if (!cpu) {
 		seq_printf(file,
-		"# cpu requestor requestee one all sretry dretry ptc_i ");
+			"# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-		"sw_ack sflush dflush sok dnomsg dmult starget\n");
+			"numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+		seq_printf(file,
+			"retries rok resetp resett giveup sto bz throt ");
+		seq_printf(file,
+			"sw_ack recv rtime all ");
+		seq_printf(file,
+			"one mult none retry canc nocan reset rcan\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
-		seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
-			   cpu, stat->requestor,
-			   stat->requestee, stat->onetlb, stat->alltlb,
-			   stat->s_retry, stat->d_retry, stat->ptc_i);
-		seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+		/* source side statistics */
+		seq_printf(file,
+			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
+			   stat->s_ntarguvhub, stat->s_ntarguvhub16,
+			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
+			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
+			   stat->s_ntargcpu, stat->s_dtimeout);
+		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
+			   stat->s_retry_messages, stat->s_retriesok,
+			   stat->s_resets_plug, stat->s_resets_timeout,
+			   stat->s_giveup, stat->s_stimeout,
+			   stat->s_busy, stat->s_throttles);
+		/* destination side statistics */
+		seq_printf(file,
+			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
 			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
-			   stat->sflush, stat->dflush,
-			   stat->retriesok, stat->nomsg,
-			   stat->multmsg, stat->ntargeted);
+			   stat->d_requestee, cycles_2_us(stat->d_time),
+			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
+			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
+			   stat->d_nocanceled, stat->d_resets,
+			   stat->d_rcanceled);
 	}
 
 	return 0;
 }
 
 /*
+ * -1: resetf the statistics
  *  0: display meaning of the statistics
- * >0: retry limit
+ * >0: maximum concurrent active descriptors per uvhub (throttle)
  */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 				 size_t count, loff_t *data)
 {
-	long newmode;
+	int cpu;
+	long input_arg;
 	char optstr[64];
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
 	if (count == 0 || count > sizeof(optstr))
 		return -EINVAL;
 	if (copy_from_user(optstr, user, count))
 		return -EFAULT;
 	optstr[count - 1] = '\0';
-	if (strict_strtoul(optstr, 10, &newmode) < 0) {
+	if (strict_strtol(optstr, 10, &input_arg) < 0) {
 		printk(KERN_DEBUG "%s is invalid\n", optstr);
 		return -EINVAL;
 	}
 
-	if (newmode == 0) {
+	if (input_arg == 0) {
 		printk(KERN_DEBUG "# cpu:      cpu number\n");
+		printk(KERN_DEBUG "Sender statistics:\n");
+		printk(KERN_DEBUG
+		"sent:     number of shootdown messages sent\n");
+		printk(KERN_DEBUG
+		"stime:    time spent sending messages\n");
+		printk(KERN_DEBUG
+		"numuvhubs: number of hubs targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"numuvhubs16: number times 16 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs8: number times 8 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs4: number times 4 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs2: number times 2 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs1: number times 1 hub targeted\n");
+		printk(KERN_DEBUG
+		"numcpus:  number of cpus targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"dto:      number of destination timeouts\n");
+		printk(KERN_DEBUG
+		"retries:  destination timeout retries sent\n");
+		printk(KERN_DEBUG
+		"rok:   :  destination timeouts successfully retried\n");
+		printk(KERN_DEBUG
+		"resetp:   ipi-style resource resets for plugs\n");
+		printk(KERN_DEBUG
+		"resett:   ipi-style resource resets for timeouts\n");
+		printk(KERN_DEBUG
+		"giveup:   fall-backs to ipi-style shootdowns\n");
+		printk(KERN_DEBUG
+		"sto:      number of source timeouts\n");
+		printk(KERN_DEBUG
+		"bz:       number of stay-busy's\n");
+		printk(KERN_DEBUG
+		"throt:    number times spun in throttle\n");
+		printk(KERN_DEBUG "Destination side statistics:\n");
 		printk(KERN_DEBUG
-		"requestor:  times this cpu was the flush requestor\n");
+		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
 		printk(KERN_DEBUG
-		"requestee:  times this cpu was requested to flush its TLBs\n");
+		"recv:     shootdown messages received\n");
 		printk(KERN_DEBUG
-		"one:        times requested to flush a single address\n");
+		"rtime:    time spent processing messages\n");
 		printk(KERN_DEBUG
-		"all:        times requested to flush all TLB's\n");
+		"all:      shootdown all-tlb messages\n");
 		printk(KERN_DEBUG
-		"sretry:     number of retries of source-side timeouts\n");
+		"one:      shootdown one-tlb messages\n");
 		printk(KERN_DEBUG
-		"dretry:     number of retries of destination-side timeouts\n");
+		"mult:     interrupts that found multiple messages\n");
 		printk(KERN_DEBUG
-		"ptc_i:      times UV fell through to IPI-style flushes\n");
+		"none:     interrupts that found no messages\n");
 		printk(KERN_DEBUG
-		"sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+		"retry:    number of retry messages processed\n");
 		printk(KERN_DEBUG
-		"sflush_us:  cycles spent in uv_flush_tlb_others()\n");
+		"canc:     number messages canceled by retries\n");
 		printk(KERN_DEBUG
-		"dflush_us:  cycles spent in handling flush requests\n");
-		printk(KERN_DEBUG "sok:        successes on retry\n");
-		printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
+		"nocan:    number retries that found nothing to cancel\n");
 		printk(KERN_DEBUG
-		"dmult:      interrupts with multiple messages\n");
-		printk(KERN_DEBUG "starget:    nodes targeted\n");
+		"reset:    number of ipi-style reset requests processed\n");
+		printk(KERN_DEBUG
+		"rcan:     number messages canceled by reset requests\n");
+	} else if (input_arg == -1) {
+		for_each_present_cpu(cpu) {
+			stat = &per_cpu(ptcstats, cpu);
+			memset(stat, 0, sizeof(struct ptc_stats));
+		}
 	} else {
-		uv_bau_retry_limit = newmode;
-		printk(KERN_DEBUG "timeout retry limit:%d\n",
-		       uv_bau_retry_limit);
+		uv_bau_max_concurrent = input_arg;
+		bcp = &per_cpu(bau_control, smp_processor_id());
+		if (uv_bau_max_concurrent < 1 ||
+		    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
+			printk(KERN_DEBUG
+				"Error: BAU max concurrent %d; %d is invalid\n",
+				bcp->max_concurrent, uv_bau_max_concurrent);
+			return -EINVAL;
+		}
+		printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
+		       uv_bau_max_concurrent);
+		for_each_present_cpu(cpu) {
+			bcp = &per_cpu(bau_control, cpu);
+			bcp->max_concurrent = uv_bau_max_concurrent;
+		}
 	}
 
 	return count;
@@ -650,80 +1121,31 @@ static int __init uv_ptc_init(void)
 	return 0;
 }
 
-/*
- * begin the initialization of the per-blade control structures
- */
-static struct bau_control * __init uv_table_bases_init(int blade, int node)
-{
-	int i;
-	struct bau_msg_status *msp;
-	struct bau_control *bau_tabp;
-
-	bau_tabp =
-	    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
-	BUG_ON(!bau_tabp);
-
-	bau_tabp->msg_statuses =
-	    kmalloc_node(sizeof(struct bau_msg_status) *
-			 DEST_Q_SIZE, GFP_KERNEL, node);
-	BUG_ON(!bau_tabp->msg_statuses);
-
-	for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
-		bau_cpubits_clear(&msp->seen_by, (int)
-				  uv_blade_nr_possible_cpus(blade));
-
-	uv_bau_table_bases[blade] = bau_tabp;
-
-	return bau_tabp;
-}
-
-/*
- * finish the initialization of the per-blade control structures
- */
-static void __init
-uv_table_bases_finish(int blade,
-		      struct bau_control *bau_tablesp,
-		      struct bau_desc *adp)
-{
-	struct bau_control *bcp;
-	int cpu;
-
-	for_each_present_cpu(cpu) {
-		if (blade != uv_cpu_to_blade_id(cpu))
-			continue;
-
-		bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
-		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
-		bcp->va_queue_first	= bau_tablesp->va_queue_first;
-		bcp->va_queue_last	= bau_tablesp->va_queue_last;
-		bcp->msg_statuses	= bau_tablesp->msg_statuses;
-		bcp->descriptor_base	= adp;
-	}
-}
-
 /*
  * initialize the sending side's sending buffers
  */
-static struct bau_desc * __init
+static void
 uv_activation_descriptor_init(int node, int pnode)
 {
 	int i;
+	int cpu;
 	unsigned long pa;
 	unsigned long m;
 	unsigned long n;
-	struct bau_desc *adp;
-	struct bau_desc *ad2;
+	struct bau_desc *bau_desc;
+	struct bau_desc *bd2;
+	struct bau_control *bcp;
 
 	/*
 	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
 	 */
-	adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+	bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
 		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
-	BUG_ON(!adp);
+	BUG_ON(!bau_desc);
 
-	pa = uv_gpa(adp); /* need the real nasid*/
-	n = uv_gpa_to_pnode(pa);
+	pa = uv_gpa(bau_desc); /* need the real nasid*/
+	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
 
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -732,96 +1154,188 @@ uv_activation_descriptor_init(int node, int pnode)
 	/*
 	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
 	 * cpu even though we only use the first one; one descriptor can
-	 * describe a broadcast to 256 nodes.
+	 * describe a broadcast to 256 uv hubs.
 	 */
-	for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
-		i++, ad2++) {
-		memset(ad2, 0, sizeof(struct bau_desc));
-		ad2->header.sw_ack_flag = 1;
+	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+		i++, bd2++) {
+		memset(bd2, 0, sizeof(struct bau_desc));
+		bd2->header.sw_ack_flag = 1;
 		/*
-		 * base_dest_nodeid is the first node in the partition, so
-		 * the bit map will indicate partition-relative node numbers.
-		 * note that base_dest_nodeid is actually a nasid.
+		 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
+		 * in the partition. The bit map will indicate uvhub numbers,
+		 * which are 0-N in a partition. Pnodes are unique system-wide.
 		 */
-		ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
-		ad2->header.dest_subnodeid = 0x10; /* the LB */
-		ad2->header.command = UV_NET_ENDPOINT_INTD;
-		ad2->header.int_both = 1;
+		bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+		bd2->header.dest_subnodeid = 0x10; /* the LB */
+		bd2->header.command = UV_NET_ENDPOINT_INTD;
+		bd2->header.int_both = 1;
 		/*
 		 * all others need to be set to zero:
 		 *   fairness chaining multilevel count replied_to
 		 */
 	}
-	return adp;
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
+			continue;
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->descriptor_base = bau_desc;
+	}
 }
 
 /*
  * initialize the destination side's receiving buffers
+ * entered for each uvhub in the partition
+ * - node is first node (kernel memory notion) on the uvhub
+ * - pnode is the uvhub's physical identifier
  */
-static struct bau_payload_queue_entry * __init
-uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
+static void
+uv_payload_queue_init(int node, int pnode)
 {
-	struct bau_payload_queue_entry *pqp;
-	unsigned long pa;
 	int pn;
+	int cpu;
 	char *cp;
+	unsigned long pa;
+	struct bau_payload_queue_entry *pqp;
+	struct bau_payload_queue_entry *pqp_malloc;
+	struct bau_control *bcp;
 
 	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
 		(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
 		GFP_KERNEL, node);
 	BUG_ON(!pqp);
+	pqp_malloc = pqp;
 
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
-	bau_tablesp->va_queue_first = pqp;
+
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_cpu_to_pnode(cpu))
+			continue;
+		/* for every cpu on this pnode: */
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->va_queue_first = pqp;
+		bcp->bau_msg_head = pqp;
+		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+	}
 	/*
 	 * need the pnode of where the memory was really allocated
 	 */
 	pa = uv_gpa(pqp);
-	pn = uv_gpa_to_pnode(pa);
+	pn = pa >> uv_nshift;
 	uv_write_global_mmr64(pnode,
 			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
 			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
 			      uv_physnodeaddr(pqp));
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
 			      uv_physnodeaddr(pqp));
-	bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
 			      (unsigned long)
-			      uv_physnodeaddr(bau_tablesp->va_queue_last));
+			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+	/* in effect, all msg_type's are set to MSG_NOOP */
 	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-
-	return pqp;
 }
 
 /*
- * Initialization of each UV blade's structures
+ * Initialization of each UV hub's structures
  */
-static int __init uv_init_blade(int blade)
+static void __init uv_init_uvhub(int uvhub, int vector)
 {
 	int node;
 	int pnode;
-	unsigned long pa;
 	unsigned long apicid;
-	struct bau_desc *adp;
-	struct bau_payload_queue_entry *pqp;
-	struct bau_control *bau_tablesp;
-
-	node = blade_to_first_node(blade);
-	bau_tablesp = uv_table_bases_init(blade, node);
-	pnode = uv_blade_to_pnode(blade);
-	adp = uv_activation_descriptor_init(node, pnode);
-	pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-	uv_table_bases_finish(blade, bau_tablesp, adp);
+
+	node = uvhub_to_first_node(uvhub);
+	pnode = uv_blade_to_pnode(uvhub);
+	uv_activation_descriptor_init(node, pnode);
+	uv_payload_queue_init(node, pnode);
 	/*
 	 * the below initialization can't be in firmware because the
 	 * messaging IRQ will be determined by the OS
 	 */
-	apicid = blade_to_first_apicid(blade);
-	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+	apicid = uvhub_to_first_apicid(uvhub);
 	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
-				      ((apicid << 32) | UV_BAU_MESSAGE));
-	return 0;
+				      ((apicid << 32) | vector));
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static void uv_init_per_cpu(int nuvhubs)
+{
+	int i, j, k;
+	int cpu;
+	int pnode;
+	int uvhub;
+	short socket = 0;
+	struct bau_control *bcp;
+	struct uvhub_desc *bdp;
+	struct socket_desc *sdp;
+	struct bau_control *hmaster = NULL;
+	struct bau_control *smaster = NULL;
+	struct socket_desc {
+		short num_cpus;
+		short cpu_number[16];
+	};
+	struct uvhub_desc {
+		short num_sockets;
+		short num_cpus;
+		short uvhub;
+		short pnode;
+		struct socket_desc socket[2];
+	};
+	struct uvhub_desc *uvhub_descs;
+
+	uvhub_descs = (struct uvhub_desc *)
+		kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		memset(bcp, 0, sizeof(struct bau_control));
+		spin_lock_init(&bcp->masks_lock);
+		bcp->max_concurrent = uv_bau_max_concurrent;
+		pnode = uv_cpu_hub_info(cpu)->pnode;
+		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+		bdp = &uvhub_descs[uvhub];
+		bdp->num_cpus++;
+		bdp->uvhub = uvhub;
+		bdp->pnode = pnode;
+		/* time interval to catch a hardware stay-busy bug */
+		bcp->timeout_interval = millisec_2_cycles(3);
+		/* kludge: assume uv_hub.h is constant */
+		socket = (cpu_physical_id(cpu)>>5)&1;
+		if (socket >= bdp->num_sockets)
+			bdp->num_sockets = socket+1;
+		sdp = &bdp->socket[socket];
+		sdp->cpu_number[sdp->num_cpus] = cpu;
+		sdp->num_cpus++;
+	}
+	socket = 0;
+	for_each_possible_blade(uvhub) {
+		bdp = &uvhub_descs[uvhub];
+		for (i = 0; i < bdp->num_sockets; i++) {
+			sdp = &bdp->socket[i];
+			for (j = 0; j < sdp->num_cpus; j++) {
+				cpu = sdp->cpu_number[j];
+				bcp = &per_cpu(bau_control, cpu);
+				bcp->cpu = cpu;
+				if (j == 0) {
+					smaster = bcp;
+					if (i == 0)
+						hmaster = bcp;
+				}
+				bcp->cpus_in_uvhub = bdp->num_cpus;
+				bcp->cpus_in_socket = sdp->num_cpus;
+				bcp->socket_master = smaster;
+				bcp->uvhub_master = hmaster;
+				for (k = 0; k < DEST_Q_SIZE; k++)
+					bcp->socket_acknowledge_count[k] = 0;
+				bcp->uvhub_cpu =
+				  uv_cpu_hub_info(cpu)->blade_processor_id;
+			}
+			socket++;
+		}
+	}
+	kfree(uvhub_descs);
 }
 
 /*
@@ -829,38 +1343,54 @@ static int __init uv_init_blade(int blade)
  */
 static int __init uv_bau_init(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int pnode;
+	int nuvhubs;
 	int cur_cpu;
+	int vector;
+	unsigned long mmr;
 
 	if (!is_uv_system())
 		return 0;
 
+	if (nobau)
+		return 0;
+
 	for_each_possible_cpu(cur_cpu)
 		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				       GFP_KERNEL, cpu_to_node(cur_cpu));
 
-	uv_bau_retry_limit = 1;
+	uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	uv_bau_table_bases = (struct bau_control **)
-	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
-	BUG_ON(!uv_bau_table_bases);
+	uv_init_per_cpu(nuvhubs);
 
 	uv_partition_base_pnode = 0x7fffffff;
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade) &&
-			(uv_blade_to_pnode(blade) < uv_partition_base_pnode))
-			uv_partition_base_pnode = uv_blade_to_pnode(blade);
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade))
-			uv_init_blade(blade);
-
-	alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+		if (uv_blade_nr_possible_cpus(uvhub) &&
+			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
+			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+
+	vector = UV_BAU_MESSAGE;
+	for_each_possible_blade(uvhub)
+		if (uv_blade_nr_possible_cpus(uvhub))
+			uv_init_uvhub(uvhub, vector);
+
 	uv_enable_timeouts();
+	alloc_intr_gate(vector, uv_bau_message_intr1);
+
+	for_each_possible_blade(uvhub) {
+		pnode = uv_blade_to_pnode(uvhub);
+		/* INIT the bau */
+		uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+				      ((unsigned long)1 << 63));
+		mmr = 1; /* should be 1 to broadcast to both sockets */
+		uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+	}
 
 	return 0;
 }
-__initcall(uv_bau_init);
-__initcall(uv_ptc_init);
+core_initcall(uv_bau_init);
+core_initcall(uv_ptc_init);
-- 
cgit v1.2.3-70-g09d2


From a289cc7c70da784a2d370b91885cab4f966dcb0f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 16 Apr 2010 17:51:42 -0700
Subject: x86, UV: uv_irq.c: Fix all sparse warnings

Fix all sparse warnings in building uv_irq.c.

 arch/x86/kernel/uv_irq.c:46:17: warning: symbol 'uv_irq_chip' was not declared. Should it be static?
 arch/x86/kernel/uv_irq.c:143:50: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:162:13: error: typename in expression
 arch/x86/kernel/uv_irq.c:162:13: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:250:44: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:260:17: error: typename in expression
 arch/x86/kernel/uv_irq.c:260:17: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:233:50: warning: incorrect type in argument 3 (different signedness)
 arch/x86/kernel/uv_irq.c:233:50:    expected int *pnode
 arch/x86/kernel/uv_irq.c:233:50:    got unsigned int *<noident>
 arch/x86/include/asm/uv/uv_hub.h:318:44: warning: incorrect type in argument 2 (different address spaces)
 arch/x86/include/asm/uv/uv_hub.h:318:44:    expected void volatile [noderef] <asn:2>*addr
 arch/x86/include/asm/uv/uv_hub.h:318:44:    got unsigned long *

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Cliff Wickman <cpw@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100416175142.f4b59683.randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_hub.h |  2 +-
 arch/x86/kernel/uv_irq.c         | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 14cc74ba5d2..bf6b88ef8ee 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -307,7 +307,7 @@ static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset
  * Access Global MMR space using the MMR space located at the top of physical
  * memory.
  */
-static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
+static inline volatile void __iomem *uv_global_mmr64_address(int pnode, unsigned long offset)
 {
 	return __va(UV_GLOBAL_MMR64_BASE |
 		    UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8e324..1a9f55a3348 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -43,7 +43,7 @@ static void uv_ack_apic(unsigned int irq)
 	ack_APIC_irq();
 }
 
-struct irq_chip uv_irq_chip = {
+static struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
@@ -140,7 +140,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
  */
 static int
 arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset, int restrict)
+		       unsigned long mmr_offset, int limit)
 {
 	const struct cpumask *eligible_cpu = cpumask_of(cpu);
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -159,7 +159,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	if (err != 0)
 		return err;
 
-	if (restrict == UV_AFFINITY_CPU)
+	if (limit == UV_AFFINITY_CPU)
 		desc->status |= IRQ_NO_BALANCING;
 	else
 		desc->status |= IRQ_MOVE_PCNTXT;
@@ -213,7 +213,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 	unsigned long mmr_offset;
-	unsigned mmr_pnode;
+	int mmr_pnode;
 
 	if (set_desc_affinity(desc, mask, &dest))
 		return -1;
@@ -247,7 +247,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
  * interrupt is raised.
  */
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
-		 unsigned long mmr_offset, int restrict)
+		 unsigned long mmr_offset, int limit)
 {
 	int irq, ret;
 
@@ -257,7 +257,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
 		return -EBUSY;
 
 	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-		restrict);
+		limit);
 	if (ret == irq)
 		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
 	else
-- 
cgit v1.2.3-70-g09d2


From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Mon, 19 Apr 2010 13:32:41 +0800
Subject: perf: Enhance perf to allow for guest statistic collection from host

Below patch introduces perf_guest_info_callbacks and related
register/unregister functions. Add more PERF_RECORD_MISC_XXX bits
meaning guest kernel and guest user space.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/perf_event.h | 15 ++++-----------
 arch/x86/kernel/cpu/perf_event.c  | 31 +++++++++++++++++++++++++++++++
 include/linux/perf_event.h        | 21 ++++++++++++++++++++-
 kernel/perf_event.c               | 23 ++++++++++++++++++++++-
 4 files changed, 77 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f6d43dbfd8e..254883d0c7e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -135,17 +135,10 @@ extern void perf_events_lapic_init(void);
  */
 #define PERF_EFLAGS_EXACT	(1UL << 3)
 
-#define perf_misc_flags(regs)				\
-({	int misc = 0;					\
-	if (user_mode(regs))				\
-		misc |= PERF_RECORD_MISC_USER;		\
-	else						\
-		misc |= PERF_RECORD_MISC_KERNEL;	\
-	if (regs->flags & PERF_EFLAGS_EXACT)		\
-		misc |= PERF_RECORD_MISC_EXACT;		\
-	misc; })
-
-#define perf_instruction_pointer(regs)	((regs)->ip)
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
 
 #else
 static inline void init_hw_perf_events(void)		{ }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 626154a9f53..2ea78abf69d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1720,6 +1720,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 {
 	struct perf_callchain_entry *entry;
 
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* TODO: We don't support guest os callchain now */
+		return NULL;
+	}
+
 	if (in_nmi())
 		entry = &__get_cpu_var(pmc_nmi_entry);
 	else
@@ -1743,3 +1748,29 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	regs->cs = __KERNEL_CS;
 	local_save_flags(regs->flags);
 }
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long ip;
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+		ip = perf_guest_cbs->get_guest_ip();
+	else
+		ip = instruction_pointer(regs);
+	return ip;
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	int misc = 0;
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		misc |= perf_guest_cbs->is_user_mode() ?
+			PERF_RECORD_MISC_GUEST_USER :
+			PERF_RECORD_MISC_GUEST_KERNEL;
+	} else
+		misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
+			PERF_RECORD_MISC_KERNEL;
+	if (regs->flags & PERF_EFLAGS_EXACT)
+		misc |= PERF_RECORD_MISC_EXACT;
+
+	return misc;
+}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index bf896d0b2e9..24de5f181a4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -288,11 +288,13 @@ struct perf_event_mmap_page {
 	__u64	data_tail;		/* user-space written tail */
 };
 
-#define PERF_RECORD_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
 #define PERF_RECORD_MISC_KERNEL			(1 << 0)
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
+#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
+#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
 #define PERF_RECORD_MISC_EXACT			(1 << 14)
 /*
@@ -446,6 +448,12 @@ enum perf_callchain_context {
 # include <asm/perf_event.h>
 #endif
 
+struct perf_guest_info_callbacks {
+	int (*is_in_guest) (void);
+	int (*is_user_mode) (void);
+	unsigned long (*get_guest_ip) (void);
+};
+
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 #include <asm/hw_breakpoint.h>
 #endif
@@ -932,6 +940,12 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 		__perf_event_mmap(vma);
 }
 
+extern struct perf_guest_info_callbacks *perf_guest_cbs;
+extern int perf_register_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+extern int perf_unregister_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
 
@@ -1001,6 +1015,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi,
 static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
+static inline int perf_register_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+static inline int perf_unregister_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 07b7a435bf0..9dbe8cdaf14 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2797,6 +2797,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 }
 
 
+/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = cbs;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
 /*
  * Output
  */
@@ -3749,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 		.event_id  = {
 			.header = {
 				.type = PERF_RECORD_MMAP,
-				.misc = 0,
+				.misc = PERF_RECORD_MISC_USER,
 				/* .size */
 			},
 			/* .pid */
-- 
cgit v1.2.3-70-g09d2


From dcf46b9443ad48a227a61713adea001228925adf Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Tue, 20 Apr 2010 10:13:58 +0800
Subject: perf & kvm: Clean up some of the guest profiling callback API details

Fix some build bug and programming style issues:

 - use valid C
 - fix up various style details

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sheng Yang <sheng@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: oerg Roedel <joro@8bytes.org>
Cc: Jes Sorensen <Jes.Sorensen@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Zachary Amsden <zamsden@redhat.com>
Cc: zhiteng.huang@intel.com
Cc: tim.c.chen@intel.com
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <1271729638.2078.624.camel@ymzhang.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++------
 arch/x86/kvm/x86.c               |  4 ++++
 include/linux/perf_event.h       | 10 ++++------
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2ea78abf69d..7de70613e6c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1752,23 +1752,31 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long ip;
+
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
 		ip = perf_guest_cbs->get_guest_ip();
 	else
 		ip = instruction_pointer(regs);
+
 	return ip;
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
 	int misc = 0;
+
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		misc |= perf_guest_cbs->is_user_mode() ?
-			PERF_RECORD_MISC_GUEST_USER :
-			PERF_RECORD_MISC_GUEST_KERNEL;
-	} else
-		misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
-			PERF_RECORD_MISC_KERNEL;
+		if (perf_guest_cbs->is_user_mode())
+			misc |= PERF_RECORD_MISC_GUEST_USER;
+		else
+			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+	} else {
+		if (user_mode(regs))
+			misc |= PERF_RECORD_MISC_USER;
+		else
+			misc |= PERF_RECORD_MISC_KERNEL;
+	}
+
 	if (regs->flags & PERF_EFLAGS_EXACT)
 		misc |= PERF_RECORD_MISC_EXACT;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c3a33b2bb16..21b9b6aa3e8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3776,16 +3776,20 @@ static int kvm_is_in_guest(void)
 static int kvm_is_user_mode(void)
 {
 	int user_mode = 3;
+
 	if (percpu_read(current_vcpu))
 		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+
 	return user_mode != 0;
 }
 
 static unsigned long kvm_get_guest_ip(void)
 {
 	unsigned long ip = 0;
+
 	if (percpu_read(current_vcpu))
 		ip = kvm_rip_read(percpu_read(current_vcpu));
+
 	return ip;
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24de5f181a4..ace31fbac51 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -941,10 +941,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 }
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
-extern int perf_register_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
-extern int perf_unregister_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
+extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
+extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
@@ -1016,9 +1014,9 @@ static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
 static inline int perf_register_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
 static inline int perf_unregister_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
-- 
cgit v1.2.3-70-g09d2


From aa2110cb1a7510f9b834adfb39b05d4843a35d35 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Thu, 8 Apr 2010 14:34:27 +0800
Subject: ACPI: add boot option acpi=copy_dsdt to fix corrupt DSDT

Some BIOS on Toshiba machines corrupt the DSDT, so add a new
boot option acpi=copy_dsdt to workaround it.
Add warning message to ask users to use this option if corrupt DSDT detected.

Also build a DMI blacklist to check it and automatically copy DSDT.

https://bugzilla.kernel.org/show_bug.cgi?id=14679

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/kernel-parameters.txt |  1 +
 arch/x86/kernel/acpi/boot.c         |  4 ++++
 drivers/acpi/acpica/tbutils.c       |  4 ++++
 drivers/acpi/bus.c                  | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 46 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e2202e93b14..93f113a1b32 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -151,6 +151,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			strict -- Be less tolerant of platforms that are not
 				strictly ACPI specification compliant.
 			rsdt -- prefer RSDT over (default) XSDT
+			copy_dsdt -- copy DSDT to memory
 
 			See also Documentation/power/pm.txt, pci=noacpi
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa9..b9629384263 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1576,6 +1576,10 @@ static int __init parse_acpi(char *arg)
 	/* "acpi=noirq" disables ACPI interrupt routing */
 	else if (strcmp(arg, "noirq") == 0) {
 		acpi_noirq_set();
+	}
+	/* "acpi=copy_dsdt" copys DSDT */
+	else if (strcmp(arg, "copy_dsdt") == 0) {
+		acpi_gbl_copy_dsdt_locally = 1;
 	} else {
 		/* Core will printk when we return error. */
 		return -EINVAL;
diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
index 54a8712bae6..a9b105fc2e5 100644
--- a/drivers/acpi/acpica/tbutils.c
+++ b/drivers/acpi/acpica/tbutils.c
@@ -373,6 +373,10 @@ void acpi_tb_check_dsdt_header(void)
 		acpi_tb_print_table_header(0, &acpi_gbl_original_dsdt_header);
 		acpi_tb_print_table_header(0, acpi_gbl_DSDT);
 
+		ACPI_ERROR((AE_INFO,
+			    "Please send DMI info to linux-acpi@vger.kernel.org\n"
+			    "If system does not work as expected, please boot with acpi=copy_dsdt"));
+
 		/* Disable further error messages */
 
 		acpi_gbl_original_dsdt_header.length = acpi_gbl_DSDT->length;
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 37132dc2da0..49af19bb8c9 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -69,6 +69,37 @@ static struct dmi_system_id __cpuinitdata power_nocheck_dmi_table[] = {
 };
 
 
+static int set_copy_dsdt(const struct dmi_system_id *id)
+{
+	printk(KERN_NOTICE "%s detected - "
+		"force copy of DSDT to local memory\n", id->ident);
+	acpi_gbl_copy_dsdt_locally = 1;
+	return 0;
+}
+
+static struct dmi_system_id dsdt_dmi_table[] __initdata = {
+	/*
+	 * Insyde BIOS on some TOSHIBA machines corrupt the DSDT.
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=14679
+	 */
+	{
+	 .callback = set_copy_dsdt,
+	 .ident = "TOSHIBA Satellite A505",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Satellite A505"),
+		},
+	},
+	{
+	 .callback = set_copy_dsdt,
+	 .ident = "TOSHIBA Satellite L505D",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Satellite L505D"),
+		},
+	}
+};
+
 /* --------------------------------------------------------------------------
                                 Device Management
    -------------------------------------------------------------------------- */
@@ -813,6 +844,12 @@ void __init acpi_early_init(void)
 
 	acpi_gbl_permanent_mmap = 1;
 
+	/*
+	 * If the machine falls into the DMI check table,
+	 * DSDT will be copied to memory
+	 */
+	dmi_check_system(dsdt_dmi_table);
+
 	status = acpi_reallocate_root_table();
 	if (ACPI_FAILURE(status)) {
 		printk(KERN_ERR PREFIX
-- 
cgit v1.2.3-70-g09d2


From ae7c9b70dcb4313ea3dbcc9a2f240dae6c2b50c0 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 19 Apr 2010 11:23:43 -0700
Subject: x86, mrst: Conditionally register cpu hotplug notifier for apbt

APB timer is used on Moorestown platforms but not on a standard PC.
If APB timer code is compiled in but not initialized at run-time due
to lack of FW reported SFI table, kernel would panic when the non-boot
CPUs are offlined and notifier is called.

https://bugzilla.kernel.org/show_bug.cgi?id=15786

This patch ensures CPU hotplug notifier for APB timer is only registered
when the APBT timer block is initialized.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1271701423-1162-1-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apb_timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index ff469e47005..a35347501d3 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -429,7 +429,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-	if (disable_apbt_percpu)
+	if (disable_apbt_percpu || !apb_timer_block_enabled)
 		return 0;
 	/* This notifier should be called after workqueue is ready */
 	hotcpu_notifier(apbt_cpuhp_notify, -20);
-- 
cgit v1.2.3-70-g09d2


From b1ab1b4d9ab9812c77843abec79030292ef0a544 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:06:58 +0200
Subject: x86, cacheinfo: Unify AMD L3 cache index disable checking

All F10h CPUs starting with model 8 resp. 9, stepping 1, support L3
cache index disable. Concentrate the family, model, stepping checking at
one place and enable the feature implicitly on upcoming Fam10h models.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-2-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a5..acfb0838390 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -328,18 +328,22 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
-	if (index < 3)
+	if (boot_cpu_data.x86 != 0x10)
 		return;
 
-	if (boot_cpu_data.x86 == 0x11)
+	if (index < 3)
 		return;
 
 	/* see errata #382 and #388 */
-	if ((boot_cpu_data.x86 == 0x10) &&
-	    ((boot_cpu_data.x86_model < 0x8) ||
-	     (boot_cpu_data.x86_mask  < 0x1)))
+	if (boot_cpu_data.x86_model < 0x8)
 		return;
 
+	if ((boot_cpu_data.x86_model == 0x8 ||
+	     boot_cpu_data.x86_model == 0x9)
+		&&
+	     boot_cpu_data.x86_mask < 0x1)
+			return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
@@ -443,8 +447,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		if (boot_cpu_data.x86 >= 0x10)
-			amd_check_l3_disable(index, this_leaf);
+		amd_check_l3_disable(index, this_leaf);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
-- 
cgit v1.2.3-70-g09d2


From f2b20e41407fccfcfacf927ff91ec888832a37af Mon Sep 17 00:00:00 2001
From: Frank Arnold <frank.arnold@amd.com>
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index acfb0838390..5ab14c86c6e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -344,6 +344,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     boot_cpu_data.x86_mask < 0x1)
 			return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.2.3-70-g09d2


From 9350f982e4fe539e83a2d4a13e9b53ad8253c4a8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:00 +0200
Subject: x86, cacheinfo: Reorganize AMD L3 cache structure

Add a struct representing L3 cache attributes (subcache sizes and
indices count) and move the respective members out of _cpuid4_info.
Also, stash the struct pci_dev ptr into the struct simplifying the code
even more.

There should be no functionality change resulting from this patch except
slightly slimming the _cpuid4_info per-cpu vars.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-4-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 53 +++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 5ab14c86c6e..ff663ca63fd 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
 	u32 full;
 };
 
+struct amd_l3_cache {
+	struct	 pci_dev *dev;
+	bool	 can_disable;
+	unsigned indices;
+	u8	 subcaches[4];
+};
+
 struct _cpuid4_info {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
 };
 
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 };
 
 unsigned short			num_cache_leaves;
@@ -302,7 +307,7 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
-static unsigned int __cpuinit amd_calc_l3_indices(void)
+static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
 	/*
 	 * We're called over smp_call_function_single() and therefore
@@ -317,12 +322,14 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 	pci_read_config_dword(dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
-	sc0 = !(val & BIT(0));
-	sc1 = !(val & BIT(4));
-	sc2 = !(val & BIT(8))  + !(val & BIT(9));
-	sc3 = !(val & BIT(12)) + !(val & BIT(13));
+	l3->subcaches[0] = sc0 = !(val & BIT(0));
+	l3->subcaches[1] = sc1 = !(val & BIT(4));
+	l3->subcaches[2] = sc2 = !(val & BIT(8))  + !(val & BIT(9));
+	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
 
-	return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+	l3->dev = dev;
 }
 
 static void __cpuinit
@@ -348,19 +355,23 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->can_disable = true;
-	this_leaf->l3_indices  = amd_calc_l3_indices();
+	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!this_leaf->l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return;
+	}
+
+	this_leaf->l3->can_disable = true;
+	amd_calc_l3_indices(this_leaf->l3);
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 				  unsigned int index)
 {
-	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!dev)
@@ -382,15 +393,14 @@ SHOW_CACHE_DISABLE(1)
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	const char *buf, size_t count, unsigned int index)
 {
+	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned long val = 0;
 
 #define SUBCACHE_MASK	(3UL << 20)
 #define SUBCACHE_INDEX	0xfff
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -404,7 +414,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 
 	/* do not allow writes outside of allowed bits */
 	if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-	    ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
 	val |= BIT(30);
@@ -708,6 +718,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
 	for (i = 0; i < num_cache_leaves; i++)
 		cache_remove_shared_cpu_map(cpu, i);
 
+	kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
 	kfree(per_cpu(ici_cpuid4_info, cpu));
 	per_cpu(ici_cpuid4_info, cpu) = NULL;
 }
@@ -992,7 +1003,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 		this_leaf = CPUID4_INFO_IDX(cpu, i);
 
-		if (this_leaf->can_disable)
+		if (this_leaf->l3 && this_leaf->l3->can_disable)
 			ktype_cache.default_attrs = default_l3_attrs;
 		else
 			ktype_cache.default_attrs = default_attrs;
-- 
cgit v1.2.3-70-g09d2


From ba06edb63f5ef2913aad37070eaec3c9d8ac73b8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:01 +0200
Subject: x86, cacheinfo: Make L3 cache info per node

Currently, we're allocating L3 cache info and calculating indices for
each online cpu which is clearly superfluous. Instead, we need to do
this per-node as is each L3 cache.

No functional change, only per-cpu memory savings.

-v2: Allocate L3 cache descriptors array dynamically.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-5-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 59 ++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ff663ca63fd..1346e9c23fc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -307,19 +307,18 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
+
+/*
+ * L3 cache descriptors
+ */
+static struct amd_l3_cache **__cpuinitdata l3_caches;
+
 static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
-	/*
-	 * We're called over smp_call_function_single() and therefore
-	 * are on the correct cpu.
-	 */
-	int cpu = smp_processor_id();
-	int node = cpu_to_node(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned int sc0, sc1, sc2, sc3;
 	u32 val = 0;
 
-	pci_read_config_dword(dev, 0x1C4, &val);
+	pci_read_config_dword(l3->dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
 	l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -328,13 +327,31 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
 
 	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
+static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
+{
+	struct amd_l3_cache *l3;
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+
+	l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return NULL;
+	}
 
 	l3->dev = dev;
+
+	amd_calc_l3_indices(l3);
+
+	return l3;
 }
 
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
+	int node;
+
 	if (boot_cpu_data.x86 != 0x10)
 		return;
 
@@ -355,14 +372,28 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-	if (!this_leaf->l3) {
-		printk(KERN_WARNING "Error allocating L3 struct\n");
-		return;
+	/*
+	 * Strictly speaking, the amount in @size below is leaked since it is
+	 * never freed but this is done only on shutdown so it doesn't matter.
+	 */
+	if (!l3_caches) {
+		int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+
+		l3_caches = kzalloc(size, GFP_ATOMIC);
+		if (!l3_caches)
+			return;
 	}
 
-	this_leaf->l3->can_disable = true;
-	amd_calc_l3_indices(this_leaf->l3);
+	node = amd_get_nb_id(smp_processor_id());
+
+	if (!l3_caches[node]) {
+		l3_caches[node] = amd_init_l3_cache(node);
+		l3_caches[node]->can_disable = true;
+	}
+
+	WARN_ON(!l3_caches[node]);
+
+	this_leaf->l3 = l3_caches[node];
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-- 
cgit v1.2.3-70-g09d2


From 59d3b388741cf1a5eb7ad27fd4e9ed72643164ae Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:02 +0200
Subject: x86, cacheinfo: Disable index in all four subcaches

When disabling an L3 cache index, make sure we disable that index in
all four subcaches of the L3. Clarify nomenclature while at it, wrt to
disable slots versus disable index and rename accordingly.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-6-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 60 +++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1346e9c23fc..33eae2062cf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -397,7 +397,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-				  unsigned int index)
+				  unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
@@ -408,21 +408,53 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 	if (!dev)
 		return -EINVAL;
 
-	pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+	pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
 	return sprintf(buf, "0x%08x\n", reg);
 }
 
-#define SHOW_CACHE_DISABLE(index)					\
+#define SHOW_CACHE_DISABLE(slot)					\
 static ssize_t								\
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)	\
+show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf)	\
 {									\
-	return show_cache_disable(this_leaf, buf, index);		\
+	return show_cache_disable(this_leaf, buf, slot);		\
 }
 SHOW_CACHE_DISABLE(0)
 SHOW_CACHE_DISABLE(1)
 
+static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
+				 unsigned slot, unsigned long idx)
+{
+	int i;
+
+	idx |= BIT(30);
+
+	/*
+	 *  disable index in all 4 subcaches
+	 */
+	for (i = 0; i < 4; i++) {
+		u32 reg = idx | (i << 20);
+
+		if (!l3->subcaches[i])
+			continue;
+
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+
+		/*
+		 * We need to WBINVD on a core on the node containing the L3
+		 * cache which indices we disable therefore a simple wbinvd()
+		 * is not sufficient.
+		 */
+		wbinvd_on_cpu(cpu);
+
+		reg |= BIT(31);
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+	}
+}
+
+
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-	const char *buf, size_t count, unsigned int index)
+				   const char *buf, size_t count,
+				   unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -448,23 +480,17 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
-	val |= BIT(30);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val);
-	/*
-	 * We need to WBINVD on a core on the node containing the L3 cache which
-	 * indices we disable therefore a simple wbinvd() is not sufficient.
-	 */
-	wbinvd_on_cpu(cpu);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+	amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
+
 	return count;
 }
 
-#define STORE_CACHE_DISABLE(index)					\
+#define STORE_CACHE_DISABLE(slot)					\
 static ssize_t								\
-store_cache_disable_##index(struct _cpuid4_info *this_leaf,		\
+store_cache_disable_##slot(struct _cpuid4_info *this_leaf,		\
 			    const char *buf, size_t count)		\
 {									\
-	return store_cache_disable(this_leaf, buf, count, index);	\
+	return store_cache_disable(this_leaf, buf, count, slot);	\
 }
 STORE_CACHE_DISABLE(0)
 STORE_CACHE_DISABLE(1)
-- 
cgit v1.2.3-70-g09d2


From 7ce5a2b9bb2e92902230e3121d8c3047fab9cb47 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 23 Apr 2010 16:17:40 -0700
Subject: x86-64: Clear a 64-bit FS/GS base on fork if selector is nonzero

When we do a thread switch, we clear the outgoing FS/GS base if the
corresponding selector is nonzero.  This is taken by __switch_to() as
an entry invariant; it does not verify that it is true on entry.
However, copy_thread() doesn't enforce this constraint, which can
result in inconsistent results after fork().

Make copy_thread() match the behavior of __switch_to().

Reported-and-tested-by: Samuel Thibault <samuel.thibault@inria.fr>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <4BD1E061.8030605@zytor.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/process_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index dc9690b4c4c..17cb3295cbf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -276,12 +276,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	set_tsk_thread_flag(p, TIF_FORK);
 
-	p->thread.fs = me->thread.fs;
-	p->thread.gs = me->thread.gs;
 	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
+	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
 	savesegment(fs, p->thread.fsindex);
+	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
-- 
cgit v1.2.3-70-g09d2


From 7a0fc404ae663776e96db43879a0fa24fec1fa3a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 13 Apr 2010 14:40:54 -0700
Subject: x86: Disable large pages on CPUs with Atom erratum AAE44

Atom erratum AAE44/AAF40/AAG38/AAH41:

"If software clears the PS (page size) bit in a present PDE (page
directory entry), that will cause linear addresses mapped through this
PDE to use 4-KByte pages instead of using a large page after old TLB
entries are invalidated. Due to this erratum, if a code fetch uses
this PDE before the TLB entry for the large page is invalidated then
it may fetch from a different physical address than specified by
either the old large page translation or the new 4-KByte page
translation. This erratum may also cause speculative code fetches from
incorrect addresses."

[http://download.intel.com/design/processor/specupdt/319536.pdf]

Where as commit 211b3d03c7400f48a781977a50104c9d12f4e229 seems to
workaround errata AAH41 (mixed 4K TLBs) it reduces the window of
opportunity for the bug to occur and does not totally remove it.  This
patch disables mixed 4K/4MB page tables totally avoiding the page
splitting and not tripping this processor issue.

This is based on an original patch by Colin King.

Originally-by: Colin Ian King <colin.king@canonical.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <1269271251-19775-1-git-send-email-colin.king@canonical.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/cpu/intel.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af3..1366c7cfd48 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -47,6 +47,27 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		(c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 
+	/*
+	 * Atom erratum AAE44/AAF40/AAG38/AAH41:
+	 *
+	 * A race condition between speculative fetches and invalidating
+	 * a large page.  This is worked around in microcode, but we
+	 * need the microcode to have already been loaded... so if it is
+	 * not, recommend a BIOS update and disable large pages.
+	 */
+	if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
+		u32 ucode, junk;
+
+		wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+		sync_core();
+		rdmsr(MSR_IA32_UCODE_REV, junk, ucode);
+
+		if (ucode < 0x20e) {
+			printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
+			clear_cpu_cap(c, X86_FEATURE_PSE);
+		}
+	}
+
 #ifdef CONFIG_X86_64
 	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
 #else
-- 
cgit v1.2.3-70-g09d2


From 453dc65931915abc61f92e12bba1fc4747ff5542 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor@vmware.com>
Date: Fri, 23 Apr 2010 13:18:08 -0400
Subject: VMware Balloon driver

This is a standalone version of VMware Balloon driver.  Ballooning is a
technique that allows hypervisor dynamically limit the amount of memory
available to the guest (with guest cooperation).  In the overcommit
scenario, when hypervisor set detects that it needs to shuffle some
memory, it instructs the driver to allocate certain number of pages, and
the underlying memory gets returned to the hypervisor.  Later hypervisor
may return memory to the guest by reattaching memory to the pageframes and
instructing the driver to "deflate" balloon.

We are submitting a standalone driver because KVM maintainer (Avi Kivity)
expressed opinion (rightly) that our transport does not fit well into
virtqueue paradigm and thus it does not make much sense to integrate with
virtio.

There were also some concerns whether current ballooning technique is the
right thing.  If there appears a better framework to achieve this we are
prepared to evaluate and switch to using it, but in the meantime we'd like
to get this driver upstream.

We want to get the driver accepted in distributions so that users do not
have to deal with an out-of-tree module and many distributions have
"upstream first" requirement.

The driver has been shipping for a number of years and users running on
VMware platform will have it installed as part of VMware Tools even if it
will not come from a distribution, thus there should not be additional
risk in pulling the driver into mainline.  The driver will only activate
if host is VMware so everyone else should not be affected at all.

Signed-off-by: Dmitry Torokhov <dtor@vmware.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/vmware.c  |   2 +
 drivers/misc/Kconfig          |  16 +
 drivers/misc/Makefile         |   1 +
 drivers/misc/vmware_balloon.c | 832 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 851 insertions(+)
 create mode 100644 drivers/misc/vmware_balloon.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59c..dfdb4dba232 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/dmi.h>
+#include <linux/module.h>
 #include <asm/div64.h>
 #include <asm/vmware.h>
 #include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)
 
 	return 0;
 }
+EXPORT_SYMBOL(vmware_platform);
 
 /*
  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 2191c8d896a..0d0d625fece 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -311,6 +311,22 @@ config TI_DAC7512
 	  This driver can also be built as a module. If so, the module
 	  will be calles ti_dac7512.
 
+config VMWARE_BALLOON
+	tristate "VMware Balloon Driver"
+	depends on X86
+	help
+	  This is VMware physical memory management driver which acts
+	  like a "balloon" that can be inflated to reclaim physical pages
+	  by reserving them in the guest and invalidating them in the
+	  monitor, freeing up the underlying machine pages so they can
+	  be allocated to other guests. The balloon can also be deflated
+	  to allow the guest to use more physical memory.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called vmware_balloon.
+
 source "drivers/misc/c2port/Kconfig"
 source "drivers/misc/eeprom/Kconfig"
 source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 27c48435541..7b6f7eefdf8 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_C2PORT)		+= c2port/
 obj-$(CONFIG_IWMC3200TOP)      += iwmc3200top/
 obj-y				+= eeprom/
 obj-y				+= cb710/
+obj-$(CONFIG_VMWARE_BALLOON)	+= vmware_balloon.o
diff --git a/drivers/misc/vmware_balloon.c b/drivers/misc/vmware_balloon.c
new file mode 100644
index 00000000000..e7161c4e379
--- /dev/null
+++ b/drivers/misc/vmware_balloon.c
@@ -0,0 +1,832 @@
+/*
+ * VMware Balloon driver.
+ *
+ * Copyright (C) 2000-2010, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Maintained by: Dmitry Torokhov <dtor@vmware.com>
+ */
+
+/*
+ * This is VMware physical memory management driver for Linux. The driver
+ * acts like a "balloon" that can be inflated to reclaim physical pages by
+ * reserving them in the guest and invalidating them in the monitor,
+ * freeing up the underlying machine pages so they can be allocated to
+ * other guests.  The balloon can also be deflated to allow the guest to
+ * use more physical memory. Higher level policies can control the sizes
+ * of balloons in VMs in order to manage physical memory resources.
+ */
+
+//#define DEBUG
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <asm/vmware.h>
+
+MODULE_AUTHOR("VMware, Inc.");
+MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
+MODULE_VERSION("1.2.1.0-K");
+MODULE_ALIAS("dmi:*:svnVMware*:*");
+MODULE_ALIAS("vmware_vmmemctl");
+MODULE_LICENSE("GPL");
+
+/*
+ * Various constants controlling rate of inflaint/deflating balloon,
+ * measured in pages.
+ */
+
+/*
+ * Rate of allocating memory when there is no memory pressure
+ * (driver performs non-sleeping allocations).
+ */
+#define VMW_BALLOON_NOSLEEP_ALLOC_MAX	16384U
+
+/*
+ * Rates of memory allocaton when guest experiences memory pressure
+ * (driver performs sleeping allocations).
+ */
+#define VMW_BALLOON_RATE_ALLOC_MIN	512U
+#define VMW_BALLOON_RATE_ALLOC_MAX	2048U
+#define VMW_BALLOON_RATE_ALLOC_INC	16U
+
+/*
+ * Rates for releasing pages while deflating balloon.
+ */
+#define VMW_BALLOON_RATE_FREE_MIN	512U
+#define VMW_BALLOON_RATE_FREE_MAX	16384U
+#define VMW_BALLOON_RATE_FREE_INC	16U
+
+/*
+ * When guest is under memory pressure, use a reduced page allocation
+ * rate for next several cycles.
+ */
+#define VMW_BALLOON_SLOW_CYCLES		4
+
+/*
+ * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
+ * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use
+ * __GFP_NOWARN, to suppress page allocation failure warnings.
+ */
+#define VMW_PAGE_ALLOC_NOSLEEP		(__GFP_HIGHMEM|__GFP_NOWARN)
+
+/*
+ * Use GFP_HIGHUSER when executing in a separate kernel thread
+ * context and allocation can sleep.  This is less stressful to
+ * the guest memory system, since it allows the thread to block
+ * while memory is reclaimed, and won't take pages from emergency
+ * low-memory pools.
+ */
+#define VMW_PAGE_ALLOC_CANSLEEP		(GFP_HIGHUSER)
+
+/* Maximum number of page allocations without yielding processor */
+#define VMW_BALLOON_YIELD_THRESHOLD	1024
+
+
+/*
+ * Hypervisor communication port definitions.
+ */
+#define VMW_BALLOON_HV_PORT		0x5670
+#define VMW_BALLOON_HV_MAGIC		0x456c6d6f
+#define VMW_BALLOON_PROTOCOL_VERSION	2
+#define VMW_BALLOON_GUEST_ID		1	/* Linux */
+
+#define VMW_BALLOON_CMD_START		0
+#define VMW_BALLOON_CMD_GET_TARGET	1
+#define VMW_BALLOON_CMD_LOCK		2
+#define VMW_BALLOON_CMD_UNLOCK		3
+#define VMW_BALLOON_CMD_GUEST_ID	4
+
+/* error codes */
+#define VMW_BALLOON_SUCCESS		0
+#define VMW_BALLOON_FAILURE		-1
+#define VMW_BALLOON_ERROR_CMD_INVALID	1
+#define VMW_BALLOON_ERROR_PPN_INVALID	2
+#define VMW_BALLOON_ERROR_PPN_LOCKED	3
+#define VMW_BALLOON_ERROR_PPN_UNLOCKED	4
+#define VMW_BALLOON_ERROR_PPN_PINNED	5
+#define VMW_BALLOON_ERROR_PPN_NOTNEEDED	6
+#define VMW_BALLOON_ERROR_RESET		7
+#define VMW_BALLOON_ERROR_BUSY		8
+
+#define VMWARE_BALLOON_CMD(cmd, data, result)		\
+({							\
+	unsigned long __stat, __dummy1, __dummy2;	\
+	__asm__ __volatile__ ("inl (%%dx)" :		\
+		"=a"(__stat),				\
+		"=c"(__dummy1),				\
+		"=d"(__dummy2),				\
+		"=b"(result) :				\
+		"0"(VMW_BALLOON_HV_MAGIC),		\
+		"1"(VMW_BALLOON_CMD_##cmd),		\
+		"2"(VMW_BALLOON_HV_PORT),		\
+		"3"(data) :				\
+		"memory");				\
+	result &= -1UL;					\
+	__stat & -1UL;					\
+})
+
+#ifdef CONFIG_DEBUG_FS
+struct vmballoon_stats {
+	unsigned int timer;
+
+	/* allocation statustics */
+	unsigned int alloc;
+	unsigned int alloc_fail;
+	unsigned int sleep_alloc;
+	unsigned int sleep_alloc_fail;
+	unsigned int refused_alloc;
+	unsigned int refused_free;
+	unsigned int free;
+
+	/* monitor operations */
+	unsigned int lock;
+	unsigned int lock_fail;
+	unsigned int unlock;
+	unsigned int unlock_fail;
+	unsigned int target;
+	unsigned int target_fail;
+	unsigned int start;
+	unsigned int start_fail;
+	unsigned int guest_type;
+	unsigned int guest_type_fail;
+};
+
+#define STATS_INC(stat) (stat)++
+#else
+#define STATS_INC(stat)
+#endif
+
+struct vmballoon {
+
+	/* list of reserved physical pages */
+	struct list_head pages;
+
+	/* transient list of non-balloonable pages */
+	struct list_head refused_pages;
+
+	/* balloon size in pages */
+	unsigned int size;
+	unsigned int target;
+
+	/* reset flag */
+	bool reset_required;
+
+	/* adjustment rates (pages per second) */
+	unsigned int rate_alloc;
+	unsigned int rate_free;
+
+	/* slowdown page allocations for next few cycles */
+	unsigned int slow_allocation_cycles;
+
+#ifdef CONFIG_DEBUG_FS
+	/* statistics */
+	struct vmballoon_stats stats;
+
+	/* debugfs file exporting statistics */
+	struct dentry *dbg_entry;
+#endif
+
+	struct sysinfo sysinfo;
+
+	struct delayed_work dwork;
+};
+
+static struct vmballoon balloon;
+static struct workqueue_struct *vmballoon_wq;
+
+/*
+ * Send "start" command to the host, communicating supported version
+ * of the protocol.
+ */
+static bool vmballoon_send_start(struct vmballoon *b)
+{
+	unsigned long status, dummy;
+
+	STATS_INC(b->stats.start);
+
+	status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_PROTOCOL_VERSION, dummy);
+	if (status == VMW_BALLOON_SUCCESS)
+		return true;
+
+	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+	STATS_INC(b->stats.start_fail);
+	return false;
+}
+
+static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
+{
+	switch (status) {
+	case VMW_BALLOON_SUCCESS:
+		return true;
+
+	case VMW_BALLOON_ERROR_RESET:
+		b->reset_required = true;
+		/* fall through */
+
+	default:
+		return false;
+	}
+}
+
+/*
+ * Communicate guest type to the host so that it can adjust ballooning
+ * algorithm to the one most appropriate for the guest. This command
+ * is normally issued after sending "start" command and is part of
+ * standard reset sequence.
+ */
+static bool vmballoon_send_guest_id(struct vmballoon *b)
+{
+	unsigned long status, dummy;
+
+	status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
+
+	STATS_INC(b->stats.guest_type);
+
+	if (vmballoon_check_status(b, status))
+		return true;
+
+	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+	STATS_INC(b->stats.guest_type_fail);
+	return false;
+}
+
+/*
+ * Retrieve desired balloon size from the host.
+ */
+static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
+{
+	unsigned long status;
+	unsigned long target;
+	unsigned long limit;
+	u32 limit32;
+
+	/*
+	 * si_meminfo() is cheap. Moreover, we want to provide dynamic
+	 * max balloon size later. So let us call si_meminfo() every
+	 * iteration.
+	 */
+	si_meminfo(&b->sysinfo);
+	limit = b->sysinfo.totalram;
+
+	/* Ensure limit fits in 32-bits */
+	limit32 = (u32)limit;
+	if (limit != limit32)
+		return false;
+
+	/* update stats */
+	STATS_INC(b->stats.target);
+
+	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, target);
+	if (vmballoon_check_status(b, status)) {
+		*new_target = target;
+		return true;
+	}
+
+	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+	STATS_INC(b->stats.target_fail);
+	return false;
+}
+
+/*
+ * Notify the host about allocated page so that host can use it without
+ * fear that guest will need it. Host may reject some pages, we need to
+ * check the return value and maybe submit a different page.
+ */
+static bool vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn)
+{
+	unsigned long status, dummy;
+	u32 pfn32;
+
+	pfn32 = (u32)pfn;
+	if (pfn32 != pfn)
+		return false;
+
+	STATS_INC(b->stats.lock);
+
+	status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy);
+	if (vmballoon_check_status(b, status))
+		return true;
+
+	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
+	STATS_INC(b->stats.lock_fail);
+	return false;
+}
+
+/*
+ * Notify the host that guest intends to release given page back into
+ * the pool of available (to the guest) pages.
+ */
+static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn)
+{
+	unsigned long status, dummy;
+	u32 pfn32;
+
+	pfn32 = (u32)pfn;
+	if (pfn32 != pfn)
+		return false;
+
+	STATS_INC(b->stats.unlock);
+
+	status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy);
+	if (vmballoon_check_status(b, status))
+		return true;
+
+	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
+	STATS_INC(b->stats.unlock_fail);
+	return false;
+}
+
+/*
+ * Quickly release all pages allocated for the balloon. This function is
+ * called when host decides to "reset" balloon for one reason or another.
+ * Unlike normal "deflate" we do not (shall not) notify host of the pages
+ * being released.
+ */
+static void vmballoon_pop(struct vmballoon *b)
+{
+	struct page *page, *next;
+	unsigned int count = 0;
+
+	list_for_each_entry_safe(page, next, &b->pages, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+		STATS_INC(b->stats.free);
+		b->size--;
+
+		if (++count >= b->rate_free) {
+			count = 0;
+			cond_resched();
+		}
+	}
+}
+
+/*
+ * Perform standard reset sequence by popping the balloon (in case it
+ * is not  empty) and then restarting protocol. This operation normally
+ * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
+ */
+static void vmballoon_reset(struct vmballoon *b)
+{
+	/* free all pages, skipping monitor unlock */
+	vmballoon_pop(b);
+
+	if (vmballoon_send_start(b)) {
+		b->reset_required = false;
+		if (!vmballoon_send_guest_id(b))
+			pr_err("failed to send guest ID to the host\n");
+	}
+}
+
+/*
+ * Allocate (or reserve) a page for the balloon and notify the host.  If host
+ * refuses the page put it on "refuse" list and allocate another one until host
+ * is satisfied. "Refused" pages are released at the end of inflation cycle
+ * (when we allocate b->rate_alloc pages).
+ */
+static int vmballoon_reserve_page(struct vmballoon *b, bool can_sleep)
+{
+	struct page *page;
+	gfp_t flags;
+	bool locked = false;
+
+	do {
+		if (!can_sleep)
+			STATS_INC(b->stats.alloc);
+		else
+			STATS_INC(b->stats.sleep_alloc);
+
+		flags = can_sleep ? VMW_PAGE_ALLOC_CANSLEEP : VMW_PAGE_ALLOC_NOSLEEP;
+		page = alloc_page(flags);
+		if (!page) {
+			if (!can_sleep)
+				STATS_INC(b->stats.alloc_fail);
+			else
+				STATS_INC(b->stats.sleep_alloc_fail);
+			return -ENOMEM;
+		}
+
+		/* inform monitor */
+		locked = vmballoon_send_lock_page(b, page_to_pfn(page));
+		if (!locked) {
+			if (b->reset_required) {
+				__free_page(page);
+				return -EIO;
+			}
+
+			/* place on list of non-balloonable pages, retry allocation */
+			list_add(&page->lru, &b->refused_pages);
+			STATS_INC(b->stats.refused_alloc);
+		}
+	} while (!locked);
+
+	/* track allocated page */
+	list_add(&page->lru, &b->pages);
+
+	/* update balloon size */
+	b->size++;
+
+	return 0;
+}
+
+/*
+ * Release the page allocated for the balloon. Note that we first notify
+ * the host so it can make sure the page will be available for the guest
+ * to use, if needed.
+ */
+static int vmballoon_release_page(struct vmballoon *b, struct page *page)
+{
+	if (!vmballoon_send_unlock_page(b, page_to_pfn(page)))
+		return -EIO;
+
+	list_del(&page->lru);
+
+	/* deallocate page */
+	__free_page(page);
+	STATS_INC(b->stats.free);
+
+	/* update balloon size */
+	b->size--;
+
+	return 0;
+}
+
+/*
+ * Release pages that were allocated while attempting to inflate the
+ * balloon but were refused by the host for one reason or another.
+ */
+static void vmballoon_release_refused_pages(struct vmballoon *b)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, &b->refused_pages, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+		STATS_INC(b->stats.refused_free);
+	}
+}
+
+/*
+ * Inflate the balloon towards its target size. Note that we try to limit
+ * the rate of allocation to make sure we are not choking the rest of the
+ * system.
+ */
+static void vmballoon_inflate(struct vmballoon *b)
+{
+	unsigned int goal;
+	unsigned int rate;
+	unsigned int i;
+	unsigned int allocations = 0;
+	int error = 0;
+	bool alloc_can_sleep = false;
+
+	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
+
+	/*
+	 * First try NOSLEEP page allocations to inflate balloon.
+	 *
+	 * If we do not throttle nosleep allocations, we can drain all
+	 * free pages in the guest quickly (if the balloon target is high).
+	 * As a side-effect, draining free pages helps to inform (force)
+	 * the guest to start swapping if balloon target is not met yet,
+	 * which is a desired behavior. However, balloon driver can consume
+	 * all available CPU cycles if too many pages are allocated in a
+	 * second. Therefore, we throttle nosleep allocations even when
+	 * the guest is not under memory pressure. OTOH, if we have already
+	 * predicted that the guest is under memory pressure, then we
+	 * slowdown page allocations considerably.
+	 */
+
+	goal = b->target - b->size;
+	/*
+	 * Start with no sleep allocation rate which may be higher
+	 * than sleeping allocation rate.
+	 */
+	rate = b->slow_allocation_cycles ?
+			b->rate_alloc : VMW_BALLOON_NOSLEEP_ALLOC_MAX;
+
+	pr_debug("%s - goal: %d, no-sleep rate: %d, sleep rate: %d\n",
+		 __func__, goal, rate, b->rate_alloc);
+
+	for (i = 0; i < goal; i++) {
+
+		error = vmballoon_reserve_page(b, alloc_can_sleep);
+		if (error) {
+			if (error != -ENOMEM) {
+				/*
+				 * Not a page allocation failure, stop this
+				 * cycle. Maybe we'll get new target from
+				 * the host soon.
+				 */
+				break;
+			}
+
+			if (alloc_can_sleep) {
+				/*
+				 * CANSLEEP page allocation failed, so guest
+				 * is under severe memory pressure. Quickly
+				 * decrease allocation rate.
+				 */
+				b->rate_alloc = max(b->rate_alloc / 2,
+						    VMW_BALLOON_RATE_ALLOC_MIN);
+				break;
+			}
+
+			/*
+			 * NOSLEEP page allocation failed, so the guest is
+			 * under memory pressure. Let us slow down page
+			 * allocations for next few cycles so that the guest
+			 * gets out of memory pressure. Also, if we already
+			 * allocated b->rate_alloc pages, let's pause,
+			 * otherwise switch to sleeping allocations.
+			 */
+			b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
+
+			if (i >= b->rate_alloc)
+				break;
+
+			alloc_can_sleep = true;
+			/* Lower rate for sleeping allocations. */
+			rate = b->rate_alloc;
+		}
+
+		if (++allocations > VMW_BALLOON_YIELD_THRESHOLD) {
+			cond_resched();
+			allocations = 0;
+		}
+
+		if (i >= rate) {
+			/* We allocated enough pages, let's take a break. */
+			break;
+		}
+	}
+
+	/*
+	 * We reached our goal without failures so try increasing
+	 * allocation rate.
+	 */
+	if (error == 0 && i >= b->rate_alloc) {
+		unsigned int mult = i / b->rate_alloc;
+
+		b->rate_alloc =
+			min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
+			    VMW_BALLOON_RATE_ALLOC_MAX);
+	}
+
+	vmballoon_release_refused_pages(b);
+}
+
+/*
+ * Decrease the size of the balloon allowing guest to use more memory.
+ */
+static void vmballoon_deflate(struct vmballoon *b)
+{
+	struct page *page, *next;
+	unsigned int i = 0;
+	unsigned int goal;
+	int error;
+
+	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
+
+	/* limit deallocation rate */
+	goal = min(b->size - b->target, b->rate_free);
+
+	pr_debug("%s - goal: %d, rate: %d\n", __func__, goal, b->rate_free);
+
+	/* free pages to reach target */
+	list_for_each_entry_safe(page, next, &b->pages, lru) {
+		error = vmballoon_release_page(b, page);
+		if (error) {
+			/* quickly decrease rate in case of error */
+			b->rate_free = max(b->rate_free / 2,
+					   VMW_BALLOON_RATE_FREE_MIN);
+			return;
+		}
+
+		if (++i >= goal)
+			break;
+	}
+
+	/* slowly increase rate if there were no errors */
+	b->rate_free = min(b->rate_free + VMW_BALLOON_RATE_FREE_INC,
+			   VMW_BALLOON_RATE_FREE_MAX);
+}
+
+/*
+ * Balloon work function: reset protocol, if needed, get the new size and
+ * adjust balloon as needed. Repeat in 1 sec.
+ */
+static void vmballoon_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
+	unsigned int target;
+
+	STATS_INC(b->stats.timer);
+
+	if (b->reset_required)
+		vmballoon_reset(b);
+
+	if (b->slow_allocation_cycles > 0)
+		b->slow_allocation_cycles--;
+
+	if (vmballoon_send_get_target(b, &target)) {
+		/* update target, adjust size */
+		b->target = target;
+
+		if (b->size < target)
+			vmballoon_inflate(b);
+		else if (b->size > target)
+			vmballoon_deflate(b);
+	}
+
+	queue_delayed_work(vmballoon_wq, dwork, round_jiffies_relative(HZ));
+}
+
+/*
+ * DEBUGFS Interface
+ */
+#ifdef CONFIG_DEBUG_FS
+
+static int vmballoon_debug_show(struct seq_file *f, void *offset)
+{
+	struct vmballoon *b = f->private;
+	struct vmballoon_stats *stats = &b->stats;
+
+	/* format size info */
+	seq_printf(f,
+		   "target:             %8d pages\n"
+		   "current:            %8d pages\n",
+		   b->target, b->size);
+
+	/* format rate info */
+	seq_printf(f,
+		   "rateNoSleepAlloc:   %8d pages/sec\n"
+		   "rateSleepAlloc:     %8d pages/sec\n"
+		   "rateFree:           %8d pages/sec\n",
+		   VMW_BALLOON_NOSLEEP_ALLOC_MAX,
+		   b->rate_alloc, b->rate_free);
+
+	seq_printf(f,
+		   "\n"
+		   "timer:              %8u\n"
+		   "start:              %8u (%4u failed)\n"
+		   "guestType:          %8u (%4u failed)\n"
+		   "lock:               %8u (%4u failed)\n"
+		   "unlock:             %8u (%4u failed)\n"
+		   "target:             %8u (%4u failed)\n"
+		   "primNoSleepAlloc:   %8u (%4u failed)\n"
+		   "primCanSleepAlloc:  %8u (%4u failed)\n"
+		   "primFree:           %8u\n"
+		   "errAlloc:           %8u\n"
+		   "errFree:            %8u\n",
+		   stats->timer,
+		   stats->start, stats->start_fail,
+		   stats->guest_type, stats->guest_type_fail,
+		   stats->lock,  stats->lock_fail,
+		   stats->unlock, stats->unlock_fail,
+		   stats->target, stats->target_fail,
+		   stats->alloc, stats->alloc_fail,
+		   stats->sleep_alloc, stats->sleep_alloc_fail,
+		   stats->free,
+		   stats->refused_alloc, stats->refused_free);
+
+	return 0;
+}
+
+static int vmballoon_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, vmballoon_debug_show, inode->i_private);
+}
+
+static const struct file_operations vmballoon_debug_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vmballoon_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init vmballoon_debugfs_init(struct vmballoon *b)
+{
+	int error;
+
+	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
+					   &vmballoon_debug_fops);
+	if (IS_ERR(b->dbg_entry)) {
+		error = PTR_ERR(b->dbg_entry);
+		pr_err("failed to create debugfs entry, error: %d\n", error);
+		return error;
+	}
+
+	return 0;
+}
+
+static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
+{
+	debugfs_remove(b->dbg_entry);
+}
+
+#else
+
+static inline int vmballoon_debugfs_init(struct vmballoon *b)
+{
+	return 0;
+}
+
+static inline void vmballoon_debugfs_exit(struct vmballoon *b)
+{
+}
+
+#endif	/* CONFIG_DEBUG_FS */
+
+static int __init vmballoon_init(void)
+{
+	int error;
+
+	/*
+	 * Check if we are running on VMware's hypervisor and bail out
+	 * if we are not.
+	 */
+	if (!vmware_platform())
+		return -ENODEV;
+
+	vmballoon_wq = create_freezeable_workqueue("vmmemctl");
+	if (!vmballoon_wq) {
+		pr_err("failed to create workqueue\n");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&balloon.pages);
+	INIT_LIST_HEAD(&balloon.refused_pages);
+
+	/* initialize rates */
+	balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
+	balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
+
+	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
+
+	/*
+	 * Start balloon.
+	 */
+	if (!vmballoon_send_start(&balloon)) {
+		pr_err("failed to send start command to the host\n");
+		error = -EIO;
+		goto fail;
+	}
+
+	if (!vmballoon_send_guest_id(&balloon)) {
+		pr_err("failed to send guest ID to the host\n");
+		error = -EIO;
+		goto fail;
+	}
+
+	error = vmballoon_debugfs_init(&balloon);
+	if (error)
+		goto fail;
+
+	queue_delayed_work(vmballoon_wq, &balloon.dwork, 0);
+
+	return 0;
+
+fail:
+	destroy_workqueue(vmballoon_wq);
+	return error;
+}
+module_init(vmballoon_init);
+
+static void __exit vmballoon_exit(void)
+{
+	cancel_delayed_work_sync(&balloon.dwork);
+	destroy_workqueue(vmballoon_wq);
+
+	vmballoon_debugfs_exit(&balloon);
+
+	/*
+	 * Deallocate all reserved memory, and reset connection with monitor.
+	 * Reset connection before deallocating memory to avoid potential for
+	 * additional spurious resets from guest touching deallocated pages.
+	 */
+	vmballoon_send_start(&balloon);
+	vmballoon_pop(&balloon);
+}
+module_exit(vmballoon_exit);
-- 
cgit v1.2.3-70-g09d2


From 402af0d7c692ddcfa2333e93d3f275ebd0487926 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 15:21:51 +0100
Subject: x86, asm: Introduce and use percpu_inc()

... generating slightly smaller code.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF261F020000780003B33C@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hardirq.h   |  2 +-
 arch/x86/include/asm/percpu.h    | 24 ++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c |  4 ++--
 3 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f8576427cf..aeab29aee61 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
 #define __ARCH_IRQ_STAT
 
-#define inc_irq_stat(member)	percpu_add(irq_stat.member, 1)
+#define inc_irq_stat(member)	percpu_inc(irq_stat.member)
 
 #define local_softirq_pending()	percpu_read(irq_stat.__softirq_pending)
 
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272dfd8b..0ec6d12d84e 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -190,6 +190,29 @@ do {									\
 	pfo_ret__;					\
 })
 
+#define percpu_unary_op(op, var)			\
+({							\
+	switch (sizeof(var)) {				\
+	case 1:						\
+		asm(op "b "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 2:						\
+		asm(op "w "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 4:						\
+		asm(op "l "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 8:						\
+		asm(op "q "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	default: __bad_percpu_size();			\
+	}						\
+})
+
 /*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do {									\
 #define percpu_and(var, val)		percpu_to_op("and", var, val)
 #define percpu_or(var, val)		percpu_to_op("or", var, val)
 #define percpu_xor(var, val)		percpu_to_op("xor", var, val)
+#define percpu_inc(var)		percpu_unary_op("inc", var)
 
 #define __this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define __this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767..7a355ddcc64 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	struct mce m;
 	int i;
 
-	__get_cpu_var(mce_poll_count)++;
+	percpu_inc(mce_poll_count);
 
 	mce_setup(&m);
 
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	atomic_inc(&mce_entry);
 
-	__get_cpu_var(mce_exception_count)++;
+	percpu_inc(mce_exception_count);
 
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-- 
cgit v1.2.3-70-g09d2


From 5967ed87ade85a421ef814296c3c7f182b08c225 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 16:08:14 +0100
Subject: x86-64: Reduce SMP locks table size

Reduce the SMP locks table size by using relative pointers instead of
absolute ones, thus cutting the table size by half.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF30FE020000780003B3B6@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/alternative-asm.h |  4 +--
 arch/x86/include/asm/alternative.h     |  4 +--
 arch/x86/kernel/alternative.c          | 45 +++++++++++++++++++---------------
 3 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index b97f786a48d..a63a68be1cc 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -6,8 +6,8 @@
 	.macro LOCK_PREFIX
 1:	lock
 	.section .smp_locks,"a"
-	_ASM_ALIGN
-	_ASM_PTR 1b
+	.balign 4
+	.long 1b - .
 	.previous
 	.endm
 #else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b..714bf241728 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -30,8 +30,8 @@
 #ifdef CONFIG_SMP
 #define LOCK_PREFIX \
 		".section .smp_locks,\"a\"\n"	\
-		_ASM_ALIGN "\n"			\
-		_ASM_PTR "661f\n" /* address */	\
+		".balign 4\n"			\
+		".long 661f - .\n" /* offset */	\
 		".previous\n"			\
 		"661:\n\tlock; "
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d..93673842722 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,39 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+				  u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
 
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+				    u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	if (noreplace_smp)
 		return;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -276,8 +278,8 @@ struct smp_alt_module {
 	char		*name;
 
 	/* ptrs to lock prefixes */
-	u8		**locks;
-	u8		**locks_end;
+	const s32	*locks;
+	const s32	*locks_end;
 
 	/* .text segment, needed to avoid patching init code ;) */
 	u8		*text;
@@ -398,16 +400,19 @@ void alternatives_smp_switch(int smp)
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
-	u8 **ptr;
+	const s32 *poff;
 	u8 *text_start = start;
 	u8 *text_end = end;
 
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
-		for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
-			if (text_start <= *ptr && text_end >= *ptr)
+		for (poff = mod->locks; poff < mod->locks_end; poff++) {
+			const u8 *ptr = (const u8 *)poff + *poff;
+
+			if (text_start <= ptr && text_end > ptr)
 				return 1;
+		}
 	}
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 47f9fe26299ae022ac1e3fa12e7e73def62b7898 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 16:19:57 +0100
Subject: x86-64: Don't export init_level4_pgt

It's not used by any module, and i386 (as well as some other arches)
also doesn't export its equivalent (swapper_pg_dir).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF33BD020000780003B3E4@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/x8664_ksyms_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b2249..1b950d151e5 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
 
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
 #ifndef CONFIG_PARAVIRT
 EXPORT_SYMBOL(native_load_gs_index);
 #endif
-- 
cgit v1.2.3-70-g09d2


From 30a564be9d9554c168a654eddc2165869cc0d7bf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 13 Apr 2010 15:31:36 +0200
Subject: x86, hpet: Restrict read back to affected ATI chipsets

After programming the HPET, we do a readback as a workaround for
ATI/SBx00 chipsets as a synchronization.  Unfortunately this triggers
an erratum in newer ICH chipsets (ICH9+) where reading the comparator
immediately after the write returns the old value.  Furthermore, as
always, I/O reads are bad for performance.

Therefore, restrict the readback to the chipsets that need it, or, for
debugging purposes, when we are running with hpet=verbose.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Venkatesh Pallipadi <venki@google.com>
LKML-Reference: <20100225185348.GA9674@linux-os.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hpet.h |  1 +
 arch/x86/kernel/hpet.c      | 29 +++++++++++++++++------------
 arch/x86/kernel/quirks.c    |  5 +++++
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1d5c08a1bdf..004e6e25e91 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address;
 extern u8 hpet_blockid;
 extern int hpet_force_user;
 extern u8 hpet_msi_disable;
+extern u8 hpet_readback_cmp;
 extern int is_hpet_enabled(void);
 extern int hpet_enable(void);
 extern void hpet_disable(void);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 23b4ecdffa9..a198b7c87a1 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -36,6 +36,7 @@
 unsigned long				hpet_address;
 u8					hpet_blockid; /* OS timer block num */
 u8					hpet_msi_disable;
+u8					hpet_readback_cmp;
 
 #ifdef CONFIG_PCI_MSI
 static unsigned long			hpet_num_timers;
@@ -395,19 +396,23 @@ static int hpet_next_event(unsigned long delta,
 	 * at that point and we would wait for the next hpet interrupt
 	 * forever. We found out that reading the CMP register back
 	 * forces the transfer so we can rely on the comparison with
-	 * the counter register below. If the read back from the
-	 * compare register does not match the value we programmed
-	 * then we might have a real hardware problem. We can not do
-	 * much about it here, but at least alert the user/admin with
-	 * a prominent warning.
-	 * An erratum on some chipsets (ICH9,..), results in comparator read
-	 * immediately following a write returning old value. Workaround
-	 * for this is to read this value second time, when first
-	 * read returns old value.
+	 * the counter register below.
+	 *
+	 * That works fine on those ATI chipsets, but on newer Intel
+	 * chipsets (ICH9...) this triggers due to an erratum: Reading
+	 * the comparator immediately following a write is returning
+	 * the old value.
+	 *
+	 * We restrict the read back to the affected ATI chipsets (set
+	 * by quirks) and also run it with hpet=verbose for debugging
+	 * purposes.
 	 */
-	if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
-		WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
-		  KERN_WARNING "hpet: compare register read back failed.\n");
+	if (hpet_readback_cmp || hpet_verbose) {
+		u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
+
+		if (cmp != cnt)
+			printk_once(KERN_WARNING
+			    "hpet: compare register read back failed.\n");
 	}
 
 	return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 12e9feaa2f7..cd2c336c8d0 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -495,10 +495,15 @@ void force_hpet_resume(void)
 /*
  * HPET MSI on some boards (ATI SB700/SB800) has side effect on
  * floppy DMA. Disable HPET MSI on such platforms.
+ *
+ * Also force the read back of the CMP register in hpet_next_event()
+ * to work around the problem that the CMP register write seems to be
+ * delayed. See hpet_next_event() for details.
  */
 static void force_disable_hpet_msi(struct pci_dev *unused)
 {
 	hpet_msi_disable = 1;
+	hpet_readback_cmp = 1;
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
-- 
cgit v1.2.3-70-g09d2


From bbd391a15d82e14efe9d69ba64cadb855b061dba Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Tue, 27 Apr 2010 11:24:42 -0400
Subject: x86: Fix NULL pointer access in irq_force_complete_move() for Xen
 guests

Upstream PV guests fail to boot because of a NULL pointer in
irq_force_complete_move().  It is possible that xen guests have
irq_desc->chip_data = NULL.

Test for NULL chip_data pointer before attempting to complete an irq move.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
LKML-Reference: <20100427152434.16193.49104.sendpatchset@prarit.bos.redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@kernel.org> [2.6.33]
---
 arch/x86/kernel/apic/io_apic.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 127b8718abf..eb2789c3f72 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2545,6 +2545,9 @@ void irq_force_complete_move(int irq)
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg = desc->chip_data;
 
+	if (!cfg)
+		return;
+
 	__irq_complete_move(&desc, cfg->vector);
 }
 #else
-- 
cgit v1.2.3-70-g09d2


From 73266fc1df2f94cf72b3beba3eee3b88ed0b0664 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 22 Apr 2010 05:05:45 +0200
Subject: hw-breakpoints: Tag ptrace breakpoint as exclude_kernel

Tag ptrace breakpoints with the exclude_kernel attribute set. This
will make it easier to set generic policies on breakpoints, when it
comes to ensure nobody unpriviliged try to breakpoint on the kernel.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sh/kernel/ptrace_32.c    | 2 +-
 arch/x86/kernel/ptrace.c      | 2 +-
 include/linux/hw_breakpoint.h | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 7759a9a9321..d4104ce9fe5 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -85,7 +85,7 @@ static int set_single_step(struct task_struct *tsk, unsigned long addr)
 
 	bp = thread->ptrace_bps[0];
 	if (!bp) {
-		hw_breakpoint_init(&attr);
+		ptrace_breakpoint_init(&attr);
 
 		attr.bp_addr = addr;
 		attr.bp_len = HW_BREAKPOINT_LEN_2;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 055be0afd33..70c4872cd8a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -688,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 	struct perf_event_attr attr;
 
 	if (!t->ptrace_bps[nr]) {
-		hw_breakpoint_init(&attr);
+		ptrace_breakpoint_init(&attr);
 		/*
 		 * Put stub len and type to register (reserve) an inactive but
 		 * correct bp
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index c70d27af03f..a0aa5a9cfb0 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -34,6 +34,12 @@ static inline void hw_breakpoint_init(struct perf_event_attr *attr)
 	attr->sample_period = 1;
 }
 
+static inline void ptrace_breakpoint_init(struct perf_event_attr *attr)
+{
+	hw_breakpoint_init(attr);
+	attr->exclude_kernel = 1;
+}
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
-- 
cgit v1.2.3-70-g09d2


From b2812d031dea86926e9c10f7714af33ac2f6b43d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 18 Apr 2010 18:11:53 +0200
Subject: hw-breakpoints: Change/Enforce some breakpoints policies

The current policies of breakpoints in x86 and SH are the following:

- task bound breakpoints can only break on userspace addresses
- cpu wide breakpoints can only break on kernel addresses

The former rule prevents ptrace breakpoints to be set to trigger on
kernel addresses, which is good. But as a side effect, we can't
breakpoint on kernel addresses for task bound breakpoints.

The latter rule simply makes no sense, there is no reason why we
can't set breakpoints on userspace while performing cpu bound
profiles.

We want the following new policies:

- task bound breakpoint can set userspace address breakpoints, with
no particular privilege required.
- task bound breakpoints can set kernelspace address breakpoints but
must be privileged to do that.
- cpu bound breakpoints can do what they want as they are privileged
already.

To implement these new policies, this patch checks if we are dealing
with a kernel address breakpoint, if so and if the exclude_kernel
parameter is set, we tell the user that the breakpoint is invalid,
which makes a good generic ptrace protection.
If we don't have exclude_kernel, ensure the user has the right
privileges as kernel breakpoints are quite sensitive (risk of
trap recursion attacks and global performance impacts).

[ Paul Mundt: keep addr space check for sh signal delivery and fix
  double function declaration]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/hw_breakpoint.h  |  5 ++---
 arch/sh/kernel/hw_breakpoint.c       | 34 ++++++------------------------
 arch/x86/include/asm/hw_breakpoint.h |  5 ++---
 arch/x86/kernel/hw_breakpoint.c      | 41 ++++++------------------------------
 kernel/hw_breakpoint.c               | 26 +++++++++++++++++++++--
 5 files changed, 41 insertions(+), 70 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
index 965dd780d51..382bad937dc 100644
--- a/arch/sh/include/asm/hw_breakpoint.h
+++ b/arch/sh/include/asm/hw_breakpoint.h
@@ -47,9 +47,8 @@ struct pmu;
 #define HBP_NUM		2
 
 /* arch/sh/kernel/hw_breakpoint.c */
-extern int arch_check_va_in_userspace(unsigned long va, u16 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
-					 struct task_struct *tsk);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
 					   unsigned long val, void *data);
 
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index 675eea7785d..1f2cf622986 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -119,26 +119,17 @@ static int get_hbp_len(u16 hbp_len)
 	return len_in_bytes;
 }
 
-/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u16 hbp_len)
-{
-	unsigned int len;
-
-	len = get_hbp_len(hbp_len);
-
-	return (va <= TASK_SIZE - len);
-}
-
 /*
  * Check for virtual address in kernel space.
  */
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
 {
 	unsigned int len;
+	unsigned long va;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 
-	len = get_hbp_len(hbp_len);
+	va = info->address;
+	len = get_hbp_len(info->len);
 
 	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 }
@@ -226,8 +217,7 @@ static int arch_build_bp_info(struct perf_event *bp)
 /*
  * Validate the arch-specific HW Breakpoint register settings
  */
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
-				  struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	unsigned int align;
@@ -270,15 +260,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 	if (info->address & align)
 		return -EINVAL;
 
-	/* Check that the virtual address is in the proper range */
-	if (tsk) {
-		if (!arch_check_va_in_userspace(info->address, info->len))
-			return -EFAULT;
-	} else {
-		if (!arch_check_va_in_kernelspace(info->address, info->len))
-			return -EFAULT;
-	}
-
 	return 0;
 }
 
@@ -363,8 +344,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 		perf_bp_event(bp, args->regs);
 
 		/* Deliver the signal to userspace */
-		if (arch_check_va_in_userspace(bp->attr.bp_addr,
-					       bp->attr.bp_len)) {
+		if (!arch_check_bp_in_kernelspace(bp)) {
 			siginfo_t info;
 
 			info.si_signo = args->signr;
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 2a1bd8f4f23..c77a5a6fab9 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -44,9 +44,8 @@ struct arch_hw_breakpoint {
 struct perf_event;
 struct pmu;
 
-extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
-					 struct task_struct *tsk);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
 					   unsigned long val, void *data);
 
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519..a8f1b803d2f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -188,26 +188,17 @@ static int get_hbp_len(u8 hbp_len)
 	return len_in_bytes;
 }
 
-/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
-{
-	unsigned int len;
-
-	len = get_hbp_len(hbp_len);
-
-	return (va <= TASK_SIZE - len);
-}
-
 /*
  * Check for virtual address in kernel space.
  */
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
 {
 	unsigned int len;
+	unsigned long va;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 
-	len = get_hbp_len(hbp_len);
+	va = info->address;
+	len = get_hbp_len(info->len);
 
 	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 }
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
 /*
  * Validate the arch-specific HW Breakpoint register settings
  */
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
-				  struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 
 	ret = -EINVAL;
 
-	if (info->type == X86_BREAKPOINT_EXECUTE)
-		/*
-		 * Ptrace-refactoring code
-		 * For now, we'll allow instruction breakpoint only for user-space
-		 * addresses
-		 */
-		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
-			info->len != X86_BREAKPOINT_EXECUTE)
-			return ret;
-
 	switch (info->len) {
 	case X86_BREAKPOINT_LEN_1:
 		align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 	if (info->address & align)
 		return -EINVAL;
 
-	/* Check that the virtual address is in the proper range */
-	if (tsk) {
-		if (!arch_check_va_in_userspace(info->address, info->len))
-			return -EFAULT;
-	} else {
-		if (!arch_check_va_in_kernelspace(info->address, info->len))
-			return -EFAULT;
-	}
-
 	return 0;
 }
 
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 9ed9ae3a48b..89e8a050c43 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -308,6 +308,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
 	return 0;
 }
 
+static int validate_hw_breakpoint(struct perf_event *bp)
+{
+	int ret;
+
+	ret = arch_validate_hwbkpt_settings(bp);
+	if (ret)
+		return ret;
+
+	if (arch_check_bp_in_kernelspace(bp)) {
+		if (bp->attr.exclude_kernel)
+			return -EINVAL;
+		/*
+		 * Don't let unprivileged users set a breakpoint in the trap
+		 * path to avoid trap recursion attacks.
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+	}
+
+	return 0;
+}
+
 int register_perf_hw_breakpoint(struct perf_event *bp)
 {
 	int ret;
@@ -316,7 +338,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 	if (ret)
 		return ret;
 
-	ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+	ret = validate_hw_breakpoint(bp);
 
 	/* if arch_validate_hwbkpt_settings() fails then release bp slot */
 	if (ret)
@@ -363,7 +385,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	if (attr->disabled)
 		goto end;
 
-	err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+	err = validate_hw_breakpoint(bp);
 	if (!err)
 		perf_event_enable(bp);
 
-- 
cgit v1.2.3-70-g09d2


From b810e94c9d8e3fff6741b66cd5a6f099a7887871 Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Wed, 31 Mar 2010 21:56:45 +0200
Subject: powernow-k8: Fix frequency reporting

With F10, model 10, all valid frequencies are in the ACPI _PST table.

Cc: <stable@kernel.org> # 33.x 32.x
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <1270065406-1814-6-git-send-email-bp@amd64.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d360b56e982..b6215b9798e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -929,7 +929,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 		powernow_table[i].index = index;
 
 		/* Frequency may be rounded for these */
-		if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+		if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
+				 || boot_cpu_data.x86 == 0x11) {
 			powernow_table[i].frequency =
 				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
 		} else
-- 
cgit v1.2.3-70-g09d2


From 40d2e76315da38993129090dc5d56377e573c312 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:43 -0400
Subject: x86-32: Rework cache flush denied handler

The cache flush denied error is an erratum on some AMD 486 clones.  If an invd
instruction is executed in userspace, the processor calls exception 19 (13 hex)
instead of #GP (13 decimal).  On cpus where XMM is not supported, redirect
exception 19 to do_general_protection().  Also, remove die_if_kernel(), since
this was the last user.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-2-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig.cpu       |  4 ++++
 arch/x86/kernel/entry_32.S | 19 +++++++++++++++++++
 arch/x86/kernel/traps.c    | 31 +++----------------------------
 3 files changed, 26 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6..6f6792c5601 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -338,6 +338,10 @@ config X86_F00F_BUG
 	def_bool y
 	depends on M586MMX || M586TSC || M586 || M486 || M386
 
+config X86_INVD_BUG
+	def_bool y
+	depends on M486 || M386
+
 config X86_WP_WORKS_OK
 	def_bool y
 	depends on !M386
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc673..cd49141cf15 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
 	RING0_INT_FRAME
 	pushl $0
 	CFI_ADJUST_CFA_OFFSET 4
+#ifdef CONFIG_X86_INVD_BUG
+	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+661:	pushl $do_general_protection
+662:
+.section .altinstructions,"a"
+	.balign 4
+	.long 661b
+	.long 663f
+	.byte X86_FEATURE_XMM
+	.byte 662b-661b
+	.byte 664f-663f
+.previous
+.section .altinstr_replacement,"ax"
+663:	pushl $do_simd_coprocessor_error
+664:
+.previous
+#else
 	pushl $do_simd_coprocessor_error
+#endif
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e445418..a16c9dfe6b7 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,15 +108,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-#ifdef CONFIG_X86_32
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
-	if (!user_mode_vm(regs))
-		die(str, regs, err);
-}
-#endif
-
 static void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
@@ -729,30 +720,14 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 	conditional_sti(regs);
 
 #ifdef CONFIG_X86_32
-	if (cpu_has_xmm) {
-		/* Handle SIMD FPU exceptions on PIII+ processors. */
-		ignore_fpu_irq = 1;
-		simd_math_error((void __user *)regs->ip);
-		return;
-	}
-	/*
-	 * Handle strange cache flush from user space exception
-	 * in all other cases.  This is undocumented behaviour.
-	 */
-	if (regs->flags & X86_VM_MASK) {
-		handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
-		return;
-	}
-	current->thread.trap_no = 19;
-	current->thread.error_code = error_code;
-	die_if_kernel("cache flush denied", regs, error_code);
-	force_sig(SIGSEGV, current);
+	ignore_fpu_irq = 1;
 #else
 	if (!user_mode(regs) &&
 			kernel_math_error(regs, "kernel simd math error", 19))
 		return;
-	simd_math_error((void __user *)regs->ip);
 #endif
+
+	simd_math_error((void __user *)regs->ip);
 }
 
 dotraplinkage void
-- 
cgit v1.2.3-70-g09d2


From 9b6dba9e0798325dab427b9d60c61630ccc39b28 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:44 -0400
Subject: x86: Merge simd_math_error() into math_error()

The only difference between FPU and SIMD exceptions is where the
status bits are read from (cwd/swd vs. mxcsr).  This also fixes
the discrepency introduced by commit adf77bac, which fixed FPU
but not SIMD.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-3-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/traps.h |   2 +-
 arch/x86/kernel/irqinit.c    |   2 +-
 arch/x86/kernel/traps.c      | 100 ++++++++++++++-----------------------------
 3 files changed, 34 insertions(+), 70 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4da91ad69e0..f66cda56781 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -79,7 +79,7 @@ static inline int get_si_code(unsigned long condition)
 
 extern int panic_on_unrecovered_nmi;
 
-void math_error(void __user *);
+void math_error(struct pt_regs *, int, int);
 void math_emulate(struct math_emu_info *);
 #ifndef CONFIG_X86_32
 asmlinkage void smp_thermal_interrupt(void);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd4..990ae7cfc57 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
 	outb(0, 0xF0);
 	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
 		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
+	math_error(get_irq_regs(), 0, 16);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a16c9dfe6b7..a472992cecd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -595,36 +595,48 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  * the correct behaviour even in the presence of the asynchronous
  * IRQ13 behaviour
  */
-void math_error(void __user *ip)
+void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
 	struct task_struct *task;
 	siginfo_t info;
-	unsigned short cwd, swd, err;
+	unsigned short err;
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
 	task = current;
 	save_init_fpu(task);
-	task->thread.trap_no = 16;
-	task->thread.error_code = 0;
+	task->thread.trap_no = trapnr;
+	task->thread.error_code = error_code;
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
-	info.si_addr = ip;
-	/*
-	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-	 * status.  0x3f is the exception bits in these regs, 0x200 is the
-	 * C1 reg you need in case of a stack fault, 0x040 is the stack
-	 * fault bit.  We should only be taking one exception at a time,
-	 * so if this combination doesn't produce any single exception,
-	 * then we have a bad program that isn't synchronizing its FPU usage
-	 * and it will suffer the consequences since we won't be able to
-	 * fully reproduce the context of the exception
-	 */
-	cwd = get_fpu_cwd(task);
-	swd = get_fpu_swd(task);
+	info.si_addr = (void __user *)regs->ip;
+	if (trapnr == 16) {
+		unsigned short cwd, swd;
+		/*
+		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+		 * status.  0x3f is the exception bits in these regs, 0x200 is the
+		 * C1 reg you need in case of a stack fault, 0x040 is the stack
+		 * fault bit.  We should only be taking one exception at a time,
+		 * so if this combination doesn't produce any single exception,
+		 * then we have a bad program that isn't synchronizing its FPU usage
+		 * and it will suffer the consequences since we won't be able to
+		 * fully reproduce the context of the exception
+		 */
+		cwd = get_fpu_cwd(task);
+		swd = get_fpu_swd(task);
 
-	err = swd & ~cwd;
+		err = swd & ~cwd;
+	} else {
+		/*
+		 * The SIMD FPU exceptions are handled a little differently, as there
+		 * is only a single status/control register.  Thus, to determine which
+		 * unmasked exception was caught we must mask the exception mask bits
+		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+		 */
+		unsigned short mxcsr = get_fpu_mxcsr(task);
+		err = ~(mxcsr >> 7) & mxcsr;
+	}
 
 	if (err & 0x001) {	/* Invalid op */
 		/*
@@ -663,55 +675,7 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	math_error((void __user *)regs->ip);
-}
-
-static void simd_math_error(void __user *ip)
-{
-	struct task_struct *task;
-	siginfo_t info;
-	unsigned short mxcsr;
-
-	/*
-	 * Save the info for the exception handler and clear the error.
-	 */
-	task = current;
-	save_init_fpu(task);
-	task->thread.trap_no = 19;
-	task->thread.error_code = 0;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT;
-	info.si_addr = ip;
-	/*
-	 * The SIMD FPU exceptions are handled a little differently, as there
-	 * is only a single status/control register.  Thus, to determine which
-	 * unmasked exception was caught we must mask the exception mask bits
-	 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-	 */
-	mxcsr = get_fpu_mxcsr(task);
-	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-	case 0x000:
-	default:
-		break;
-	case 0x001: /* Invalid Op */
-		info.si_code = FPE_FLTINV;
-		break;
-	case 0x002: /* Denormalize */
-	case 0x010: /* Underflow */
-		info.si_code = FPE_FLTUND;
-		break;
-	case 0x004: /* Zero Divide */
-		info.si_code = FPE_FLTDIV;
-		break;
-	case 0x008: /* Overflow */
-		info.si_code = FPE_FLTOVF;
-		break;
-	case 0x020: /* Precision */
-		info.si_code = FPE_FLTRES;
-		break;
-	}
-	force_sig_info(SIGFPE, &info, task);
+	math_error(regs, error_code, 16);
 }
 
 dotraplinkage void
@@ -727,7 +691,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	simd_math_error((void __user *)regs->ip);
+	math_error(regs, error_code, 19);
 }
 
 dotraplinkage void
-- 
cgit v1.2.3-70-g09d2


From e2e75c915de045f0785387dc32f55e92fab0614c Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:45 -0400
Subject: x86: Merge kernel_math_error() into math_error()

Clean up the kernel exception handling and make it more similar to
the other traps.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-4-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a472992cecd..53ba86fc5dd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -576,20 +576,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	return;
 }
 
-#ifdef CONFIG_X86_64
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
-	if (fixup_exception(regs))
-		return 1;
-
-	notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
-	/* Illegal floating point operation in the kernel */
-	current->thread.trap_no = trapnr;
-	die(str, regs, 0);
-	return 0;
-}
-#endif
-
 /*
  * Note that we play around with the 'TS' bit in an attempt to get
  * the correct behaviour even in the presence of the asynchronous
@@ -597,14 +583,28 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  */
 void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
-	struct task_struct *task;
+	struct task_struct *task = current;
 	siginfo_t info;
 	unsigned short err;
+	char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
+		return;
+	conditional_sti(regs);
+
+	if (!user_mode_vm(regs))
+	{
+		if (!fixup_exception(regs)) {
+			task->thread.error_code = error_code;
+			task->thread.trap_no = trapnr;
+			die(str, regs, error_code);
+		}
+		return;
+	}
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	task = current;
 	save_init_fpu(task);
 	task->thread.trap_no = trapnr;
 	task->thread.error_code = error_code;
@@ -665,14 +665,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-	    kernel_math_error(regs, "kernel x87 math error", 16))
-		return;
 #endif
 
 	math_error(regs, error_code, 16);
@@ -681,14 +675,8 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-			kernel_math_error(regs, "kernel simd math error", 19))
-		return;
 #endif
 
 	math_error(regs, error_code, 19);
-- 
cgit v1.2.3-70-g09d2


From 250825008f1f94887bc039e9227a8adfb5ba366e Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:46 -0400
Subject: x86-32: Don't set ignore_fpu_irq in simd exception

Any processor that supports simd will have an internal fpu, and the
irq13 handler will not be enabled.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-5-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 53ba86fc5dd..00516e1de55 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -675,10 +675,6 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-#ifdef CONFIG_X86_32
-	ignore_fpu_irq = 1;
-#endif
-
 	math_error(regs, error_code, 19);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 097c1bd5673edaf2a162724636858b71f658fdd2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 3 May 2010 15:49:31 -0700
Subject: x86, cpu: Make APERF/MPERF a normal table-driven flag

APERF/MPERF can be handled via the table like all the other scattered
CPU flags.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index fd1fc1902a4..10fa5684a66 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,13 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
-		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
-		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
-		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
-		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-		{ X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
+		{ X86_FEATURE_IDA,   		CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  		CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006 },
+		{ X86_FEATURE_CPB,   		CR_EDX, 9, 0x80000007 },
+		{ X86_FEATURE_NPT,   		CR_EDX, 0, 0x8000000a },
+		{ X86_FEATURE_LBRV,  		CR_EDX, 1, 0x8000000a },
+		{ X86_FEATURE_SVML,  		CR_EDX, 2, 0x8000000a },
+		{ X86_FEATURE_NRIPS, 		CR_EDX, 3, 0x8000000a },
 		{ 0, 0, 0, 0 }
 	};
 
@@ -54,14 +55,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
-
-	/*
-	 * common AMD/Intel features
-	 */
-	if (c->cpuid_level >= 6) {
-		if (cpuid_ecx(6) & 0x1)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
 }
 
 /* leaf 0xb SMT level */
-- 
cgit v1.2.3-70-g09d2


From 2c2df8418ac7908eec4558407b83f16739006c54 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:02 -0700
Subject: x86, acpi/irq: Introduce apci_isa_irq_to_gsi

There are a number of cases where the current code makes the assumption
that isa irqs identity map to the first 16 acpi global system intereupts.
In most instances that assumption is correct as that is the required
behaviour in dual i8259 mode and the default behavior in ioapic mode.

However there are some systems out there that take advantage of acpis
interrupt remapping  for the isa irqs to have a completely different
mapping of isa_irq to gsi.

Introduce acpi_isa_irq_to_gsi to perform this mapping explicitly in the
code that needs it.  Initially this will be just the current assumed
identity mapping to ensure it's introduction does not cause regressions.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-1-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/ia64/kernel/acpi.c     | 8 ++++++++
 arch/x86/kernel/acpi/boot.c | 8 ++++++++
 include/linux/acpi.h        | 1 +
 3 files changed, 17 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 4d1a7e9314c..c6c90f39f4d 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -785,6 +785,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  *  ACPI based hotplug CPU support
  */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa9..da718d67259 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -458,6 +458,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  * success: return IRQ number (>=0)
  * failure: return < 0
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b926afe8c03..7a937dabcc4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -116,6 +116,7 @@ extern unsigned long acpi_realmode_flags;
 
 int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
+int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity);
-- 
cgit v1.2.3-70-g09d2


From 9a0a91bb56d2915cdb8585717de38376ad20fef9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:03 -0700
Subject: x86, acpi/irq: Teach acpi_get_override_irq to take a gsi not an
 isa_irq

In perverse acpi implementations the isa irqs are not identity mapped
to the first 16 gsi.  Furthermore at least the extended interrupt
resource capability may return gsi's and not isa irqs.  So since
what we get from acpi is a gsi teach acpi_get_overrride_irq to
operate on a gsi instead of an isa_irq.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-2-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 23 ++++++++++++++---------
 include/linux/acpi.h           |  4 ++--
 2 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 127b8718abf..73ec92838d8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4082,22 +4082,27 @@ int __init io_apic_get_version(int ioapic)
 	return reg_01.bits.version;
 }
 
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
 {
-	int i;
+	int ioapic, pin, idx;
 
 	if (skip_ioapic_setup)
 		return -1;
 
-	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].irqtype == mp_INT &&
-		    mp_irqs[i].srcbusirq == bus_irq)
-			break;
-	if (i >= mp_irq_entries)
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return -1;
+
+	pin = mp_find_ioapic_pin(ioapic, gsi);
+	if (pin < 0)
+		return -1;
+
+	idx = find_irq_entry(ioapic, pin, mp_INT);
+	if (idx < 0)
 		return -1;
 
-	*trigger = irq_trigger(i);
-	*polarity = irq_polarity(i);
+	*trigger = irq_trigger(idx);
+	*polarity = irq_polarity(idx);
 	return 0;
 }
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7a937dabcc4..3da73f5f0ae 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -119,9 +119,9 @@ int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 #ifdef CONFIG_X86_IO_APIC
-extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity);
+extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #else
-#define acpi_get_override_irq(bus, trigger, polarity) (-1)
+#define acpi_get_override_irq(gsi, trigger, polarity) (-1)
 #endif
 /*
  * This function undoes the effect of one call to acpi_register_gsi().
-- 
cgit v1.2.3-70-g09d2


From 9d2062b879495649bb525cf7979126da2e45d288 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:05 -0700
Subject: x86, acpi/irq: Fix acpi_sci_ioapic_setup so it has both bus_irq and
 gsi

Currently acpi_sci_ioapic_setup calls mp_override_legacy_irq with
bus_irq == gsi, which is wrong if we are comming from an override
Instead pass the bus_irq into acpi_sci_ioapic_setup.

This fix was inspired by a similar fix from:
Yinghai Lu <yinghai@kernel.org>

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-4-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index da718d67259..0a036dc6f9f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -313,7 +313,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 /*
  * Parse Interrupt Source Override for the ACPI SCI
  */
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
 {
 	if (trigger == 0)	/* compatible SCI trigger is level */
 		trigger = 3;
@@ -333,7 +333,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 	 * If GSI is < 16, this will update its flags,
 	 * else it will create a new mp_irqs[] entry.
 	 */
-	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
 	/*
 	 * stash over-ride to indicate we've been here
@@ -357,9 +357,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 	acpi_table_print_madt_entry(header);
 
 	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
-		acpi_sci_ioapic_setup(intsrc->global_irq,
+		acpi_sci_ioapic_setup(intsrc->source_irq,
 				      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
-				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				      intsrc->global_irq);
 		return 0;
 	}
 
@@ -1162,7 +1163,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+				      acpi_gbl_FADT.sci_interrupt);
 
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
-- 
cgit v1.2.3-70-g09d2


From 0fd52670fb6400be0996ac492b5ed77f3d83d69a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:06 -0700
Subject: x86, acpi/irq: Generalize mp_config_acpi_legacy_irqs

Remove the assumption that there is not an override for isa irq 0.
Instead lookup the gsi and from that lookup the ioapic and pin of each
isa irq indivdually.

In general this should not have any behavioural affect but in
perverse cases this gets all of the details correct, instead of
doing something weird.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-5-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0a036dc6f9f..3ee92f28a4b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -961,8 +961,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 void __init mp_config_acpi_legacy_irqs(void)
 {
 	int i;
-	int ioapic;
-	unsigned int dstapic;
 	struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -982,20 +980,28 @@ void __init mp_config_acpi_legacy_irqs(void)
 		return;
 #endif
 
-	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
-	dstapic = mp_ioapics[ioapic].apicid;
-
 	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
 	 */
 	for (i = 0; i < 16; i++) {
+		int ioapic, pin;
+		unsigned int dstapic;
 		int idx;
+		u32 gsi;
+
+		/* Locate the gsi that irq i maps to. */
+		if (acpi_isa_irq_to_gsi(i, &gsi))
+			continue;
+
+		/*
+		 * Locate the IOAPIC that manages the ISA IRQ.
+		 */
+		ioapic = mp_find_ioapic(gsi);
+		if (ioapic < 0)
+			continue;
+		pin = mp_find_ioapic_pin(ioapic, gsi);
+		dstapic = mp_ioapics[ioapic].apicid;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1005,7 +1011,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if (irq->dstapic == dstapic && irq->dstirq == i)
+			if (irq->dstapic == dstapic && irq->dstirq == pin)
 				break;
 		}
 
@@ -1020,7 +1026,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		mp_irq.dstapic = dstapic;
 		mp_irq.irqtype = mp_INT;
 		mp_irq.srcbusirq = i; /* Identity mapped */
-		mp_irq.dstirq = i;
+		mp_irq.dstirq = pin;
 
 		save_mp_irq(&mp_irq);
 	}
-- 
cgit v1.2.3-70-g09d2


From 4b6b19a1c7302477653d799a53d48063dd53d555 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:08 -0700
Subject: x86, ioapic: Fix io_apic_redir_entries to return the number of
 entries.

io_apic_redir_entries has a huge conceptual bug.  It returns the maximum
redirection entry not the number of redirection entries.  Which simply
does not match what the name of the function.  This just caught me
and it caught  Feng Tang, and  Len Brown when they wrote sfi_parse_ioapic.

Modify io_apic_redir_entries to actually return the number of redirection
entries, and fix the callers so that they properly handle receiving the
number of the number of redirection table entries, instead of the
number of redirection table entries less one.

While the usage in sfi.c does not show up in this patch it is fixed
by virtue of the fact that io_apic_redir_entries now has the semantics
sfi_parse_ioapic most reasonably expects.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-7-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 73ec92838d8..0a053e61b3e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3855,7 +3855,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 	reg_01.raw = io_apic_read(ioapic, 1);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	return reg_01.bits.entries;
+	/* The register returns the maximum index redir index
+	 * supported, which is one less than the total number of redir
+	 * entries.
+	 */
+	return reg_01.bits.entries + 1;
 }
 
 void __init probe_nr_irqs_gsi(void)
@@ -3871,7 +3875,7 @@ void __init probe_nr_irqs_gsi(void)
 
 		nr = 0;
 		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx) + 1;
+			nr += io_apic_get_redir_entries(idx);
 
 		if (nr > nr_irqs_gsi)
 			nr_irqs_gsi = nr;
@@ -4306,7 +4310,7 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 */
 	mp_gsi_routing[idx].gsi_base = gsi_base;
 	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx);
+	    io_apic_get_redir_entries(idx) - 1;
 
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
-- 
cgit v1.2.3-70-g09d2


From eddb0c55a14074d6bac8c2ef169aefd7e2c6f139 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:09 -0700
Subject: x86, ioapic: Fix the types of gsi values

This patches fixes the types of gsi_base and gsi_end values in
struct mp_ioapic_gsi, and the gsi parameter of mp_find_ioapic
and mp_find_ioapic_pin

A gsi is cannonically a u32, not an int.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-8-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h | 10 +++++-----
 arch/x86/kernel/apic/io_apic.c |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 35832a03a51..feeaf0d9246 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -180,12 +180,12 @@ extern void ioapic_write_entry(int apic, int pin,
 extern void setup_ioapic_ids_from_mpc(void);
 
 struct mp_ioapic_gsi{
-	int gsi_base;
-	int gsi_end;
+	u32 gsi_base;
+	u32 gsi_end;
 };
 extern struct mp_ioapic_gsi  mp_gsi_routing[];
-int mp_find_ioapic(int gsi);
-int mp_find_ioapic_pin(int ioapic, int gsi);
+int mp_find_ioapic(u32 gsi);
+int mp_find_ioapic_pin(int ioapic, u32 gsi);
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
 extern void __init pre_init_apic_IRQ0(void);
 
@@ -197,7 +197,7 @@ static const int timer_through_8259 = 0;
 static inline void ioapic_init_mappings(void)	{ }
 static inline void ioapic_insert_resources(void) { }
 static inline void probe_nr_irqs_gsi(void)	{ }
-static inline int mp_find_ioapic(int gsi) { return 0; }
+static inline int mp_find_ioapic(u32 gsi) { return 0; }
 
 struct io_apic_irq_attr;
 static inline int io_apic_set_pci_routing(struct device *dev, int irq,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0a053e61b3e..9ab97622b8e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4247,7 +4247,7 @@ void __init ioapic_insert_resources(void)
 	}
 }
 
-int mp_find_ioapic(int gsi)
+int mp_find_ioapic(u32 gsi)
 {
 	int i = 0;
 
@@ -4262,7 +4262,7 @@ int mp_find_ioapic(int gsi)
 	return -1;
 }
 
-int mp_find_ioapic_pin(int ioapic, int gsi)
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
 	if (WARN_ON(ioapic == -1))
 		return -1;
-- 
cgit v1.2.3-70-g09d2


From 5777372af5c929b8f3c706ed7b295b7279537c88 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:10 -0700
Subject: x86, ioapic: Teach mp_register_ioapic to compute a global gsi_end

Add the global variable gsi_end and teach mp_register_ioapic
to keep it uptodate as we add more ioapics into the system.

ioapics can only be added early in boot so the code that
runs later can treat gsi_end as a constant.

Remove the have hacks in sfi.c to second guess mp_register_ioapic
by keeping t's own running total of how many gsi's have been seen,
and instead use the gsi_end.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-9-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h | 1 +
 arch/x86/kernel/apic/io_apic.c | 6 ++++++
 arch/x86/kernel/sfi.c          | 4 +---
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index feeaf0d9246..37b0f2bb503 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -184,6 +184,7 @@ struct mp_ioapic_gsi{
 	u32 gsi_end;
 };
 extern struct mp_ioapic_gsi  mp_gsi_routing[];
+extern u32 gsi_end;
 int mp_find_ioapic(u32 gsi);
 int mp_find_ioapic_pin(int ioapic, u32 gsi);
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9ab97622b8e..f8072557157 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
 /* IO APIC gsi routing info */
 struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
 
+/* The last gsi number used */
+u32 gsi_end;
+
 /* MP IRQ source entries */
 struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
@@ -4312,6 +4315,9 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	mp_gsi_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx) - 1;
 
+	if (mp_gsi_routing[idx].gsi_end > gsi_end)
+		gsi_end = mp_gsi_routing[idx].gsi_end;
+
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
 	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e09938265..7ded57896c0 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-static u32 gsi_base;
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 	pentry = (struct sfi_apic_table_entry *)sb->pentry;
 
 	for (i = 0; i < num; i++) {
-		mp_register_ioapic(i, pentry->phys_addr, gsi_base);
-		gsi_base += io_apic_get_redir_entries(i);
+		mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
 		pentry++;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From cf7500c0ea133d66f8449d86392d83f840102632 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:11 -0700
Subject: x86, ioapic: In mpparse use mp_register_ioapic

Long ago MP_ioapic_info was the primary way of setting up our
ioapic data structures and mp_register_ioapic was a compatibility
shim for acpi code.  Now the situation is reversed and
and mp_register_ioapic is the primary way of setting up our
ioapic data structures.

Keep the setting up of ioapic data structures uniform by
having mp_register_ioapic call mp_register_ioapic.

This changes a few fields:

- type: is now hardset to MP_IOAPIC but type had to
  bey MP_IOAPIC or MP_ioapic_info would not have been called.

- flags: is now hard coded to MPC_APIC_USABLE.
  We require flags to contain at least MPC_APIC_USEBLE in
  MP_ioapic_info and we don't ever examine flags so dropping
  a few flags that might possibly exist that we have never
  used is harmless.

- apicaddr: Unchanged

- apicver: Read from the ioapic instead of using the cached
  hardware value in the MP table.  The real hardware value
  will be more accurate.

- apicid: Now verified to be unique and changed if it is not.
  If the BIOS got this right this is a noop.  If the BIOS did
  not fixing things appears to be the better solution.

This adds gsi_base and gsi_end values to our ioapics defined with
the mpatable, which will make our lives simpler later since
we can always assume gsi_base and gsi_end are valid.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-10-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/mpparse.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8..5ae5d2426ed 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
 
-static int bad_ioapic(unsigned long address)
-{
-	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-	}
-	if (!address) {
-		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-		       " found in table, skipping!\n");
-		return 1;
-	}
-	return 0;
-}
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
 	printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
 	       m->apicid, m->apicver, m->apicaddr);
 
-	if (bad_ioapic(m->apicaddr))
-		return;
-
-	mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
-	mp_ioapics[nr_ioapics].apicid = m->apicid;
-	mp_ioapics[nr_ioapics].type = m->type;
-	mp_ioapics[nr_ioapics].apicver = m->apicver;
-	mp_ioapics[nr_ioapics].flags = m->flags;
-	nr_ioapics++;
+	mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
 }
 
 static void print_MP_intsrc_info(struct mpc_intsrc *m)
-- 
cgit v1.2.3-70-g09d2


From 7716a5c4ff5f1f3dc5e9edcab125cbf7fceef0af Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:12 -0700
Subject: x86, ioapic: Move nr_ioapic_registers calculation to
 mp_register_ioapic.

Now that all ioapic registration happens in mp_register_ioapic we can
move the calculation of nr_ioapic_registers there from enable_IO_APIC.
The number of ioapic registers is already calucated in mp_register_ioapic
so all that really needs to be done is to save the caluclated value
in nr_ioapic_registers.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-11-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f8072557157..dae9240bd28 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1953,20 +1953,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 void __init enable_IO_APIC(void)
 {
-	union IO_APIC_reg_01 reg_01;
 	int i8259_apic, i8259_pin;
 	int apic;
-	unsigned long flags;
-
-	/*
-	 * The number of IO-APIC IRQ registers (== #pins):
-	 */
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		reg_01.raw = io_apic_read(apic, 1);
-		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
-	}
 
 	if (!legacy_pic->nr_legacy_irqs)
 		return;
@@ -4293,6 +4281,7 @@ static int bad_ioapic(unsigned long address)
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
+	int entries;
 
 	if (bad_ioapic(address))
 		return;
@@ -4311,9 +4300,14 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
+	entries = io_apic_get_redir_entries(idx);
 	mp_gsi_routing[idx].gsi_base = gsi_base;
-	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx) - 1;
+	mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	nr_ioapic_registers[idx] = entries;
 
 	if (mp_gsi_routing[idx].gsi_end > gsi_end)
 		gsi_end = mp_gsi_routing[idx].gsi_end;
-- 
cgit v1.2.3-70-g09d2


From d464207c4fdd70c2a0febd4f9c58206fa915bb36 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:13 -0700
Subject: x86, ioapic: Optimize pin_2_irq

Now that all ioapics have valid gsi_base values use this to
accellerate pin_2_irq.  In the case of acpi this also ensures
that pin_2_irq will compute the same irq value for an ioapic
pin as acpi will.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-12-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dae9240bd28..0d35f46929d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1019,7 +1019,7 @@ static inline int irq_trigger(int idx)
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
-	int irq, i;
+	int irq;
 	int bus = mp_irqs[idx].srcbus;
 
 	/*
@@ -1031,18 +1031,13 @@ static int pin_2_irq(int idx, int apic, int pin)
 	if (test_bit(bus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
-		/*
-		 * PCI IRQs are mapped in order
-		 */
-		i = irq = 0;
-		while (i < apic)
-			irq += nr_ioapic_registers[i++];
-		irq += pin;
+		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
 		/*
                  * For MPS mode, so far only needed by ES7000 platform
                  */
 		if (ioapic_renumber_irq)
-			irq = ioapic_renumber_irq(apic, irq);
+			gsi = ioapic_renumber_irq(apic, gsi);
+		irq = gsi;
 	}
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3-70-g09d2


From 4afc51a835d3aeba11c35090f524e05c84586d27 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:14 -0700
Subject: x86, ioapic: Simplify probe_nr_irqs_gsi.

Use the global gsi_end value now that all ioapics have
valid gsi numbers instead of a combination of acpi_probe_gsi
and walking all of the ioapics and couting their number of
entries by hand if acpi_probe_gsi gave us an answer we did
not like.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-13-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mpspec.h  |  6 ------
 arch/x86/kernel/acpi/boot.c    | 23 -----------------------
 arch/x86/kernel/apic/io_apic.c | 17 +++--------------
 3 files changed, 3 insertions(+), 43 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 29994f06c7e..c82868e9f90 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -105,12 +105,6 @@ extern void mp_config_acpi_legacy_irqs(void);
 struct device;
 extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				 int active_high_low);
-extern int acpi_probe_gsi(void);
-#else /* !CONFIG_ACPI: */
-static inline int acpi_probe_gsi(void)
-{
-	return 0;
-}
 #endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3ee92f28a4b..07a63ce5811 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -876,29 +876,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-int __init acpi_probe_gsi(void)
-{
-	int idx;
-	int gsi;
-	int max_gsi = 0;
-
-	if (acpi_disabled)
-		return 0;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	max_gsi = 0;
-	for (idx = 0; idx < nr_ioapics; idx++) {
-		gsi = mp_gsi_routing[idx].gsi_end;
-
-		if (gsi > max_gsi)
-			max_gsi = gsi;
-	}
-
-	return max_gsi + 1;
-}
-
 static void assign_to_mp_irq(struct mpc_intsrc *m,
 				    struct mpc_intsrc *mp_irq)
 {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0d35f46929d..9f3f6ca86da 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3850,22 +3850,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 
 void __init probe_nr_irqs_gsi(void)
 {
-	int nr = 0;
+	int nr;
 
-	nr = acpi_probe_gsi();
-	if (nr > nr_irqs_gsi) {
+	nr = gsi_end + 1;
+	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
-	} else {
-		/* for acpi=off or acpi is not compiled in */
-		int idx;
-
-		nr = 0;
-		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx);
-
-		if (nr > nr_irqs_gsi)
-			nr_irqs_gsi = nr;
-	}
 
 	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
 }
-- 
cgit v1.2.3-70-g09d2


From 988856ee1623bd37e384105f7bb2b7fe44c009f6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:15 -0700
Subject: x86, acpi/irq: Handle isa irqs that are not identity mapped to gsi's.

ACPI irq source overrides are allowed for the 16 isa irqs and are
allowed to map any gsi to any isa irq.  A few motherboards have been
seen to take advantage of this and put the isa irqs on the 2nd or
3rd ioapic.  This causes some problems, most notably the fact
that we can not use any gsi < 16.

To correct this move the gsis that are not isa irqs and have
a gsi number < 16 into the linux irq space just past gsi_end.
This is what the es7000 platform is doing today.  Moving only the
low 16 gsis above the rest of the gsi's only penalizes weird
platforms, leaving sane acpi implementations with a 1-1 mapping
of gsis and irqs.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-14-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c    | 57 +++++++++++++++++++++++++++++++++++++++---
 arch/x86/kernel/apic/io_apic.c |  8 ++++--
 2 files changed, 59 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 07a63ce5811..325fbbab7f8 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -93,6 +93,53 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
 
+/*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+	unsigned int irq = gsi + NR_IRQS_LEGACY;
+	unsigned int i;
+
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		if (isa_irq_to_gsi[i] == gsi) {
+			return i;
+		}
+	}
+
+	/* Provide an identity mapping of gsi == irq
+	 * except on truly weird platforms that have
+	 * non isa irqs in the first 16 gsis.
+	 */
+	if (gsi >= NR_IRQS_LEGACY)
+		irq = gsi;
+	else
+		irq = gsi_end + 1 + gsi;
+
+	return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+	unsigned int gsi;
+
+	if (irq < NR_IRQS_LEGACY)
+		gsi = isa_irq_to_gsi[irq];
+	else if (irq <= gsi_end)
+		gsi = irq;
+	else if (irq <= (gsi_end + NR_IRQS_LEGACY))
+		gsi = irq - gsi_end;
+	else
+		gsi = 0xffffffff;
+
+	return gsi;
+}
+
 /*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
  * to map the target physical address. The problem is that set_fixmap()
@@ -449,7 +496,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	*irq = gsi;
+	*irq = gsi_to_irq(gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -463,7 +510,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 {
 	if (isa_irq >= 16)
 		return -1;
-	*gsi = isa_irq;
+	*gsi = irq_to_gsi(isa_irq);
 	return 0;
 }
 
@@ -491,7 +538,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
-	irq = plat_gsi;
+	irq = gsi_to_irq(plat_gsi);
 
 	return irq;
 }
@@ -933,6 +980,8 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.dstirq = pin;	/* INTIN# */
 
 	save_mp_irq(&mp_irq);
+
+	isa_irq_to_gsi[bus_irq] = gsi;
 }
 
 void __init mp_config_acpi_legacy_irqs(void)
@@ -1086,7 +1135,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
 			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	io_apic_set_pci_routing(dev, gsi, &irq_attr);
+	io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
 
 	return gsi;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9f3f6ca86da..594827c3c61 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1037,7 +1037,11 @@ static int pin_2_irq(int idx, int apic, int pin)
                  */
 		if (ioapic_renumber_irq)
 			gsi = ioapic_renumber_irq(apic, gsi);
-		irq = gsi;
+
+		if (gsi >= NR_IRQS_LEGACY)
+			irq = gsi;
+		else
+			irq = gsi_end + 1 + gsi;
 	}
 
 #ifdef CONFIG_X86_32
@@ -3852,7 +3856,7 @@ void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
-	nr = gsi_end + 1;
+	nr = gsi_end + 1 + NR_IRQS_LEGACY;
 	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
 
-- 
cgit v1.2.3-70-g09d2


From 7b20bd5fb902088579af4e70f7f802b93181a628 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:16 -0700
Subject: x86, irq: Kill io_apic_renumber_irq

Now that the generic irq layer is performing the exact same remapping as
io_apic_renumber_irq we can kill this weird  es7000 specific function.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-15-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h   |  1 -
 arch/x86/kernel/acpi/boot.c      |  5 -----
 arch/x86/kernel/apic/es7000_32.c | 19 -------------------
 arch/x86/kernel/apic/io_apic.c   |  6 ------
 4 files changed, 31 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 37b0f2bb503..9da192a17f0 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,6 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 extern void ioapic_insert_resources(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 325fbbab7f8..9a5ed58f09d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1117,11 +1117,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapics[ioapic].apicid,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5..425e53a87fe 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int					es7000_plat;
 
 static unsigned int			base;
 
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
-	if (es7000_plat == ES7000_ZORRO)
-		return gsi;
-
-	if (!base) {
-		int i;
-		for (i = 0; i < nr_ioapics; i++)
-			base += nr_ioapic_registers[i];
-	}
-
-	if (!ioapic && (gsi < 16))
-		gsi += base;
-
-	return gsi;
-}
-
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
 	unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
 		es7000_plat = ES7000_ZORRO;
 	else
 		es7000_plat = ES7000_CLASSIC;
-	ioapic_renumber_irq = es7000_rename_gsi;
 }
 
 /*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 594827c3c61..d174d886654 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1016,7 +1016,6 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
-int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq;
@@ -1032,11 +1031,6 @@ static int pin_2_irq(int idx, int apic, int pin)
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
 		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
-		/*
-                 * For MPS mode, so far only needed by ES7000 platform
-                 */
-		if (ioapic_renumber_irq)
-			gsi = ioapic_renumber_irq(apic, gsi);
 
 		if (gsi >= NR_IRQS_LEGACY)
 			irq = gsi;
-- 
cgit v1.2.3-70-g09d2


From a2a47c6c3d1a7c01da4464b3b7be93b924f874c1 Mon Sep 17 00:00:00 2001
From: Ky Srinivasan <ksrinivasan@novell.com>
Date: Thu, 6 May 2010 12:08:41 -0700
Subject: x86: Detect running on a Microsoft HyperV system

This patch integrates HyperV detection within the framework currently
used by VmWare. With this patch, we can avoid having to replicate the
HyperV detection code in each of the Microsoft HyperV drivers.

Reworked and tweaked by Greg K-H to build properly.

Signed-off-by: K. Y. Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <20100506190841.GA1605@kroah.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Vadim Rozenfeld <vrozenfe@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hyperv.h    |  6 ++++
 arch/x86/include/asm/mshyperv.h  |  7 +++++
 arch/x86/include/asm/processor.h |  3 ++
 arch/x86/kernel/cpu/Makefile     |  2 +-
 arch/x86/kernel/cpu/hypervisor.c | 10 ++++--
 arch/x86/kernel/cpu/mshyperv.c   | 67 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/include/asm/mshyperv.h
 create mode 100644 arch/x86/kernel/cpu/mshyperv.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index e153a2b3889..46040473e12 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -14,6 +14,9 @@
 #define HYPERV_CPUID_ENLIGHTMENT_INFO		0x40000004
 #define HYPERV_CPUID_IMPLEMENT_LIMITS		0x40000005
 
+#define HYPERV_HYPERVISOR_PRESENT_BIT		0x80000000
+#define HYPERV_CPUID_MIN			0x40000005
+
 /*
  * Feature identification. EAX indicates which features are available
  * to the partition based upon the current partition privileges.
@@ -129,6 +132,9 @@
 /* MSR used to provide vcpu index */
 #define HV_X64_MSR_VP_INDEX			0x40000002
 
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT		0x40000020
+
 /* Define the virtual APIC registers */
 #define HV_X64_MSR_EOI				0x40000070
 #define HV_X64_MSR_ICR				0x40000071
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 00000000000..6cd8101d134
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,7 @@
+#ifndef ASM_X86__MSHYPER_H
+#define ASM_X86__MSHYPER_H
+
+int ms_hyperv_platform(void);
+void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703..597c041bd12 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -114,6 +114,8 @@ struct cpuinfo_x86 {
 	u16			cpu_index;
 #endif
 	unsigned int		x86_hyper_vendor;
+	/* The layout of this field is hypervisor specific */
+	unsigned int		x86_hyper_features;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
@@ -129,6 +131,7 @@ struct cpuinfo_x86 {
 
 #define X86_HYPER_VENDOR_NONE  0
 #define X86_HYPER_VENDOR_VMWARE 1
+#define X86_HYPER_VENDOR_MSFT	2
 
 /*
  * capabilities of CPUs
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f367..3a785da34b6 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o sched.o
+obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33..de3f4e0ce8e 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -23,6 +23,7 @@
 
 #include <asm/processor.h>
 #include <asm/vmware.h>
+#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
 static inline void __cpuinit
@@ -30,6 +31,8 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 {
 	if (vmware_platform())
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
+	else if (ms_hyperv_platform())
+		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
 	else
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
 }
@@ -37,10 +40,11 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 static inline void __cpuinit
 hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
+	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
 		vmware_set_feature_bits(c);
-		return;
-	}
+	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
+		ms_hyperv_set_feature_bits(c);
+	return;
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 00000000000..2443b61cdb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,67 @@
+/*
+ * HyperV  Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/hyperv.h>
+#include <asm/mshyperv.h>
+
+
+int ms_hyperv_platform(void)
+{
+	u32 eax, ebx, ecx, edx;
+	char hyp_signature[13];
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
+		return 0;
+
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
+	*(u32 *)(hyp_signature + 0) = ebx;
+	*(u32 *)(hyp_signature + 4) = ecx;
+	*(u32 *)(hyp_signature + 8) = edx;
+
+	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
+		return 0;
+	return 1;
+}
+
+void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+{
+	u32 eax, ebx, ecx, edx;
+
+	c->x86_hyper_features = 0;
+	/*
+	 * Extract the features, recommendations etc.
+	 * The first 9 bits will be used to track hypervisor features.
+	 * The next 6 bits will be used to track the hypervisor
+	 * recommendations.
+	 */
+	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= (eax & 0x1ff);
+
+	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= ((eax & 0x3f) << 9);
+	printk(KERN_INFO "Detected HyperV with features: %x\n",
+		c->x86_hyper_features);
+}
-- 
cgit v1.2.3-70-g09d2


From 4261e0e0efd9e04b6c69e0773c3cf4d6f337c416 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:10 +0200
Subject: perf, x86: Move perfctr init code to x86_setup_perfctr()

Split __hw_perf_event_init() to configure pmu events other than
perfctrs. Perfctr code is moved to a separate function
x86_setup_perfctr(). This and the following patches refactor the code.

Split in multiple patches for better review.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7de70613e6c..801441a5424 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,6 +426,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
+static int x86_setup_perfctr(struct perf_event *event);
+
 static int x86_pmu_hw_config(struct perf_event *event)
 {
 	/*
@@ -453,9 +455,6 @@ static int x86_pmu_hw_config(struct perf_event *event)
  */
 static int __hw_perf_event_init(struct perf_event *event)
 {
-	struct perf_event_attr *attr = &event->attr;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
 	int err;
 
 	if (!x86_pmu_initialized())
@@ -482,15 +481,24 @@ static int __hw_perf_event_init(struct perf_event *event)
 
 	event->destroy = hw_perf_event_destroy;
 
-	hwc->idx = -1;
-	hwc->last_cpu = -1;
-	hwc->last_tag = ~0ULL;
+	event->hw.idx = -1;
+	event->hw.last_cpu = -1;
+	event->hw.last_tag = ~0ULL;
 
 	/* Processor specifics */
 	err = x86_pmu.hw_config(event);
 	if (err)
 		return err;
 
+	return x86_setup_perfctr(event);
+}
+
+static int x86_setup_perfctr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
-- 
cgit v1.2.3-70-g09d2


From c1726f343b3bfc2ee037e191907c632a31903021 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:11 +0200
Subject: perf, x86: Move x86_setup_perfctr()

Move x86_setup_perfctr(), no other changes made.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-3-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 120 +++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 61 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 801441a5424..3d3bceb9e83 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,7 +426,65 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
-static int x86_setup_perfctr(struct perf_event *event);
+static int x86_setup_perfctr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	} else {
+		/*
+		 * If we have a PMU initialized but no APIC
+		 * interrupts, we cannot sample hardware
+		 * events (user-space has to fall back and
+		 * sample via a hrtimer based software event):
+		 */
+		if (!x86_pmu.apic)
+			return -EOPNOTSUPP;
+	}
+
+	if (attr->type == PERF_TYPE_RAW)
+		return 0;
+
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+
+	/*
+	 * The generic map:
+	 */
+	config = x86_pmu.event_map(attr->config);
+
+	if (config == 0)
+		return -ENOENT;
+
+	if (config == -1LL)
+		return -EINVAL;
+
+	/*
+	 * Branch tracing:
+	 */
+	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+	    (hwc->sample_period == 1)) {
+		/* BTS is not supported by this architecture. */
+		if (!x86_pmu.bts)
+			return -EOPNOTSUPP;
+
+		/* BTS is currently only allowed for user-mode. */
+		if (!attr->exclude_kernel)
+			return -EOPNOTSUPP;
+	}
+
+	hwc->config |= config;
+
+	return 0;
+}
 
 static int x86_pmu_hw_config(struct perf_event *event)
 {
@@ -493,66 +551,6 @@ static int __hw_perf_event_init(struct perf_event *event)
 	return x86_setup_perfctr(event);
 }
 
-static int x86_setup_perfctr(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
-
-	if (!hwc->sample_period) {
-		hwc->sample_period = x86_pmu.max_period;
-		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
-	} else {
-		/*
-		 * If we have a PMU initialized but no APIC
-		 * interrupts, we cannot sample hardware
-		 * events (user-space has to fall back and
-		 * sample via a hrtimer based software event):
-		 */
-		if (!x86_pmu.apic)
-			return -EOPNOTSUPP;
-	}
-
-	if (attr->type == PERF_TYPE_RAW)
-		return 0;
-
-	if (attr->type == PERF_TYPE_HW_CACHE)
-		return set_ext_hw_attr(hwc, attr);
-
-	if (attr->config >= x86_pmu.max_events)
-		return -EINVAL;
-
-	/*
-	 * The generic map:
-	 */
-	config = x86_pmu.event_map(attr->config);
-
-	if (config == 0)
-		return -ENOENT;
-
-	if (config == -1LL)
-		return -EINVAL;
-
-	/*
-	 * Branch tracing:
-	 */
-	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
-	    (hwc->sample_period == 1)) {
-		/* BTS is not supported by this architecture. */
-		if (!x86_pmu.bts)
-			return -EOPNOTSUPP;
-
-		/* BTS is currently only allowed for user-mode. */
-		if (!attr->exclude_kernel)
-			return -EOPNOTSUPP;
-	}
-
-	hwc->config |= config;
-
-	return 0;
-}
-
 static void x86_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-- 
cgit v1.2.3-70-g09d2


From 9d0fcba67e47ff398a6fa86476d4884d472dc98a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:12 +0200
Subject: perf, x86: Call x86_setup_perfctr() from .hw_config()

The perfctr setup calls are in the corresponding .hw_config()
functions now. This makes it possible to introduce config functions
for other pmu events that are not perfctr specific.

Also, all of a sudden the code looks much nicer.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c    | 9 ++-------
 arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3d3bceb9e83..c2c1e10f7b0 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -505,7 +505,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
-	return 0;
+	return x86_setup_perfctr(event);
 }
 
 /*
@@ -543,12 +543,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	event->hw.last_cpu = -1;
 	event->hw.last_tag = ~0ULL;
 
-	/* Processor specifics */
-	err = x86_pmu.hw_config(event);
-	if (err)
-		return err;
-
-	return x86_setup_perfctr(event);
+	return x86_pmu.hw_config(event);
 }
 
 static void x86_pmu_disable_all(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 15367cce66b..9e002054cb5 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -455,7 +455,7 @@ static int p4_hw_config(struct perf_event *event)
 		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
 		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 
-	return 0;
+	return x86_setup_perfctr(event);
 }
 
 static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-- 
cgit v1.2.3-70-g09d2


From 31fa58af57c41d2912debf62d47d5811062411f1 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:14 +0200
Subject: perf, x86: Pass enable bit mask to __x86_pmu_enable_event()

To reuse this function for events with different enable bit masks,
this mask is part of the function's argument list now.

The function will be used later to control ibs events too.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-6-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 9 +++++----
 arch/x86/kernel/cpu/perf_event_intel.c | 5 +++--
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c2c1e10f7b0..4e218d7ac85 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -844,10 +844,10 @@ void hw_perf_enable(void)
 	x86_pmu.enable_all(added);
 }
 
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+					  u64 enable_mask)
 {
-	wrmsrl(hwc->config_base + hwc->idx,
-			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
+	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
 }
 
 static inline void x86_pmu_disable_event(struct perf_event *event)
@@ -919,7 +919,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	if (cpuc->enabled)
-		__x86_pmu_enable_event(&event->hw);
+		__x86_pmu_enable_event(&event->hw,
+				       ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a099df96f91..a4b56ac425c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -513,7 +513,8 @@ static void intel_pmu_nhm_enable_all(int added)
 			if (!event)
 				continue;
 
-			__x86_pmu_enable_event(&event->hw);
+			__x86_pmu_enable_event(&event->hw,
+					       ARCH_PERFMON_EVENTSEL_ENABLE);
 		}
 	}
 	intel_pmu_enable_all(added);
@@ -617,7 +618,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	if (unlikely(event->attr.precise))
 		intel_pmu_pebs_enable(event);
 
-	__x86_pmu_enable_event(hwc);
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From a1f2b70a942b8d858a0ab02567da3999b60a99b2 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:15 +0200
Subject: perf, x86: Use weight instead of cmask in for_each_event_constraint()

There may exist constraints with a cmask set to zero. In this case
for_each_event_constraint() will not work properly. Now weight is used
instead of the cmask for loop exit detection. Weight is always a value
other than zero since the default contains the HWEIGHT from the
counter mask and in other cases a value of zero does not fit too.

This is in preparation of ibs event constraints that wont have a
cmask.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-7-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4e218d7ac85..4a3f1f2b9b9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -170,7 +170,7 @@ struct cpu_hw_events {
 	EVENT_CONSTRAINT(0, 0, 0)
 
 #define for_each_event_constraint(e, c)	\
-	for ((e) = (c); (e)->cmask; (e)++)
+	for ((e) = (c); (e)->weight; (e)++)
 
 union perf_capabilities {
 	struct {
-- 
cgit v1.2.3-70-g09d2


From 1e9a6d8d44cb6dcd2799b36ceb23007e6a423bfe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 4 May 2010 16:30:21 +0200
Subject: perf, x86: Remove PEBS SAMPLE_RAW support

Its broken, we really should get PERF_SAMPLE_REGS sorted.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ec8b2e12e10..080b9b065bd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -459,7 +459,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
 	struct perf_sample_data data;
-	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int n;
 
@@ -499,12 +498,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
-	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.size = x86_pmu.pebs_record_size;
-		raw.data = at;
-		data.raw = &raw;
-	}
-
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
@@ -536,7 +529,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct pebs_record_nhm *at, *top;
 	struct perf_sample_data data;
 	struct perf_event *event = NULL;
-	struct perf_raw_record raw;
 	struct pt_regs regs;
 	u64 status = 0;
 	int bit, n;
@@ -585,12 +577,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
-		if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-			raw.size = x86_pmu.pebs_record_size;
-			raw.data = at;
-			data.raw = &raw;
-		}
-
 		/*
 		 * See the comment in intel_pmu_drain_pebs_core()
 		 */
-- 
cgit v1.2.3-70-g09d2


From 2b0b5c6fe9b383f3cf35a0a6371c9d577bd523ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Apr 2010 23:03:20 +0200
Subject: perf, x86: Consolidate some code repetition

Remove some duplicated logic.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 97 ++++++++++++++-----------------
 1 file changed, 44 insertions(+), 53 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 080b9b065bd..35056f715e9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -452,14 +452,54 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
 static int intel_pmu_save_and_restart(struct perf_event *event);
 
+static void __intel_pmu_pebs_event(struct perf_event *event,
+				   struct pt_regs *iregs, void *__pebs)
+{
+	/*
+	 * We cast to pebs_record_core since that is a subset of
+	 * both formats and we don't use the other fields in this
+	 * routine.
+	 */
+	struct pebs_record_core *pebs = __pebs;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+
+	if (!intel_pmu_save_and_restart(event))
+		return;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+
+	/*
+	 * We use the interrupt regs as a base because the PEBS record
+	 * does not contain a full regs set, specifically it seems to
+	 * lack segment descriptors, which get used by things like
+	 * user_mode().
+	 *
+	 * In the simple case fix up only the IP and BP,SP regs, for
+	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+	 */
+	regs = *iregs;
+	regs.ip = pebs->ip;
+	regs.bp = pebs->bp;
+	regs.sp = pebs->sp;
+
+	if (intel_pmu_pebs_fixup_ip(regs))
+		regs.flags |= PERF_EFLAGS_EXACT;
+	else
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+
+	if (perf_event_overflow(event, 1, &data, &regs))
+		x86_pmu_stop(event);
+}
+
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
-	struct perf_sample_data data;
-	struct pt_regs regs;
 	int n;
 
 	if (!ds || !x86_pmu.pebs)
@@ -485,9 +525,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (n <= 0)
 		return;
 
-	if (!intel_pmu_save_and_restart(event))
-		return;
-
 	/*
 	 * Should not happen, we program the threshold at 1 and do not
 	 * set a reset value.
@@ -495,31 +532,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	WARN_ON_ONCE(n > 1);
 	at += n - 1;
 
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
-
-	/*
-	 * We use the interrupt regs as a base because the PEBS record
-	 * does not contain a full regs set, specifically it seems to
-	 * lack segment descriptors, which get used by things like
-	 * user_mode().
-	 *
-	 * In the simple case fix up only the IP and BP,SP regs, for
-	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
-	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
-	 */
-	regs = *iregs;
-	regs.ip = at->ip;
-	regs.bp = at->bp;
-	regs.sp = at->sp;
-
-	if (intel_pmu_pebs_fixup_ip(&regs))
-		regs.flags |= PERF_EFLAGS_EXACT;
-	else
-		regs.flags &= ~PERF_EFLAGS_EXACT;
-
-	if (perf_event_overflow(event, 1, &data, &regs))
-		x86_pmu_stop(event);
+	__intel_pmu_pebs_event(event, iregs, at);
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@@ -527,9 +540,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct pebs_record_nhm *at, *top;
-	struct perf_sample_data data;
 	struct perf_event *event = NULL;
-	struct pt_regs regs;
 	u64 status = 0;
 	int bit, n;
 
@@ -571,27 +582,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		if (!event || bit >= MAX_PEBS_EVENTS)
 			continue;
 
-		if (!intel_pmu_save_and_restart(event))
-			continue;
-
-		perf_sample_data_init(&data, 0);
-		data.period = event->hw.last_period;
-
-		/*
-		 * See the comment in intel_pmu_drain_pebs_core()
-		 */
-		regs = *iregs;
-		regs.ip = at->ip;
-		regs.bp = at->bp;
-		regs.sp = at->sp;
-
-		if (intel_pmu_pebs_fixup_ip(&regs))
-			regs.flags |= PERF_EFLAGS_EXACT;
-		else
-			regs.flags &= ~PERF_EFLAGS_EXACT;
-
-		if (perf_event_overflow(event, 1, &data, &regs))
-			x86_pmu_stop(event);
+		__intel_pmu_pebs_event(event, iregs, at);
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From ab608344bcbde4f55ec4cd911b686b0ce3eae076 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Apr 2010 23:03:20 +0200
Subject: perf, x86: Improve the PEBS ABI

Rename perf_event_attr::precise to perf_event_attr::precise_ip and
widen it to 2 bits. This new field describes the required precision of
the PERF_SAMPLE_IP field:

  0 - SAMPLE_IP can have arbitrary skid
  1 - SAMPLE_IP must have constant skid
  2 - SAMPLE_IP requested to have 0 skid
  3 - SAMPLE_IP must have 0 skid

And modify the Intel PEBS code accordingly. The PEBS implementation
now supports up to precise_ip == 2, where we perform the IP fixup.

Also s/PERF_RECORD_MISC_EXACT/&_IP/ to clarify its meaning, this bit
should be set for each PERF_SAMPLE_IP field known to match the actual
instruction triggering the event.

This new scheme allows for a PEBS mode that uses the buffer for more
than a single event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c          | 17 ++++++++++++++++-
 arch/x86/kernel/cpu/perf_event_intel.c    |  4 ++--
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 12 ++++++------
 include/linux/perf_event.h                | 23 +++++++++++++++++++----
 tools/perf/builtin-top.c                  |  2 +-
 tools/perf/util/parse-events.c            | 25 ++++++++++++++++---------
 6 files changed, 60 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4a3f1f2b9b9..27fa9eeed02 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -488,6 +488,21 @@ static int x86_setup_perfctr(struct perf_event *event)
 
 static int x86_pmu_hw_config(struct perf_event *event)
 {
+	if (event->attr.precise_ip) {
+		int precise = 0;
+
+		/* Support for constant skid */
+		if (x86_pmu.pebs)
+			precise++;
+
+		/* Support for IP fixup */
+		if (x86_pmu.lbr_nr)
+			precise++;
+
+		if (event->attr.precise_ip > precise)
+			return -EOPNOTSUPP;
+	}
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -1780,7 +1795,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
 	}
 
 	if (regs->flags & PERF_EFLAGS_EXACT)
-		misc |= PERF_RECORD_MISC_EXACT;
+		misc |= PERF_RECORD_MISC_EXACT_IP;
 
 	return misc;
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a4b56ac425c..fdbc652d3fe 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -563,7 +563,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 
 	x86_pmu_disable_event(event);
 
-	if (unlikely(event->attr.precise))
+	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_disable(event);
 }
 
@@ -615,7 +615,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		return;
 	}
 
-	if (unlikely(event->attr.precise))
+	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_enable(event);
 
 	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 35056f715e9..18018d1311c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -307,7 +307,7 @@ intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
 
-	if (!event->attr.precise)
+	if (!event->attr.precise_ip)
 		return NULL;
 
 	if (x86_pmu.pebs_constraints) {
@@ -330,7 +330,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 	WARN_ON_ONCE(cpuc->enabled);
 
-	if (x86_pmu.intel_cap.pebs_trap)
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
 		intel_pmu_lbr_enable(event);
 }
 
@@ -345,7 +345,7 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-	if (x86_pmu.intel_cap.pebs_trap)
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
 		intel_pmu_lbr_disable(event);
 }
 
@@ -485,7 +485,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	regs.bp = pebs->bp;
 	regs.sp = pebs->sp;
 
-	if (intel_pmu_pebs_fixup_ip(regs))
+	if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
 		regs.flags |= PERF_EFLAGS_EXACT;
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
@@ -518,7 +518,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 
 	WARN_ON_ONCE(!event);
 
-	if (!event->attr.precise)
+	if (!event->attr.precise_ip)
 		return;
 
 	n = top - at;
@@ -570,7 +570,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
 			WARN_ON_ONCE(!event);
 
-			if (!event->attr.precise)
+			if (!event->attr.precise_ip)
 				continue;
 
 			if (__test_and_set_bit(bit, (unsigned long *)&status))
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6be4a0f9137..23cd0057a68 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,9 +203,19 @@ struct perf_event_attr {
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
 				watermark      :  1, /* wakeup_watermark      */
-				precise        :  1, /* OoO invariant counter */
-
-				__reserved_1   : 48;
+				/*
+				 * precise_ip:
+				 *
+				 *  0 - SAMPLE_IP can have arbitrary skid
+				 *  1 - SAMPLE_IP must have constant skid
+				 *  2 - SAMPLE_IP requested to have 0 skid
+				 *  3 - SAMPLE_IP must have 0 skid
+				 *
+				 *  See also PERF_RECORD_MISC_EXACT_IP
+				 */
+				precise_ip     :  2, /* skid constraint       */
+
+				__reserved_1   : 47;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -296,7 +306,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
 #define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
-#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Indicates that the content of PERF_SAMPLE_IP points to
+ * the actual instruction that triggered the event. See also
+ * perf_event_attr::precise_ip.
+ */
+#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3de397764cb..ed9b5b6905f 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1021,7 +1021,7 @@ static void event__process_sample(const event_t *self,
 		return;
 	}
 
-	if (self->header.misc & PERF_RECORD_MISC_EXACT)
+	if (self->header.misc & PERF_RECORD_MISC_EXACT_IP)
 		exact_samples++;
 
 	if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index bc8b7e61420..ae7f5917935 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -654,10 +654,6 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
 		return EVT_FAILED;
 	n = hex2u64(str + 1, &config);
 	if (n > 0) {
-		if (str[n+1] == 'p') {
-			attr->precise = 1;
-			n++;
-		}
 		*strp = str + n + 1;
 		attr->type = PERF_TYPE_RAW;
 		attr->config = config;
@@ -692,19 +688,29 @@ static enum event_result
 parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
-	int eu = 1, ek = 1, eh = 1;
+	int exclude = 0;
+	int eu = 0, ek = 0, eh = 0, precise = 0;
 
 	if (*str++ != ':')
 		return 0;
 	while (*str) {
-		if (*str == 'u')
+		if (*str == 'u') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			eu = 0;
-		else if (*str == 'k')
+		} else if (*str == 'k') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			ek = 0;
-		else if (*str == 'h')
+		} else if (*str == 'h') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			eh = 0;
-		else
+		} else if (*str == 'p') {
+			precise++;
+		} else
 			break;
+
 		++str;
 	}
 	if (str >= *strp + 2) {
@@ -712,6 +718,7 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 		attr->exclude_user   = eu;
 		attr->exclude_kernel = ek;
 		attr->exclude_hv     = eh;
+		attr->precise_ip     = precise;
 		return 1;
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 4d1c52b02d977d884abb21d0bbaba6b5d6bc8374 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Fri, 23 Apr 2010 13:56:12 +0800
Subject: perf, x86: implement group scheduling transactional APIs

Convert to the transactional PMU API and remove the duplication of
group_sched_in().

Reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1272002172.5707.61.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 180 +++++++++++++++------------------------
 1 file changed, 67 insertions(+), 113 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 27fa9eeed02..fd4db0db370 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -110,6 +110,8 @@ struct cpu_hw_events {
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
 
+	unsigned int		group_flag;
+
 	/*
 	 * Intel DebugStore bits
 	 */
@@ -961,6 +963,14 @@ static int x86_pmu_enable(struct perf_event *event)
 	if (n < 0)
 		return n;
 
+	/*
+	 * If group events scheduling transaction was started,
+	 * skip the schedulability test here, it will be peformed
+	 * at commit time(->commit_txn) as a whole
+	 */
+	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+		goto out;
+
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
 		return ret;
@@ -970,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
+out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
 
@@ -1227,119 +1238,6 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return &unconstrained;
 }
 
-static int x86_event_sched_in(struct perf_event *event,
-			  struct perf_cpu_context *cpuctx)
-{
-	int ret = 0;
-
-	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = smp_processor_id();
-	event->tstamp_running += event->ctx->time - event->tstamp_stopped;
-
-	if (!is_x86_event(event))
-		ret = event->pmu->enable(event);
-
-	if (!ret && !is_software_event(event))
-		cpuctx->active_oncpu++;
-
-	if (!ret && event->attr.exclusive)
-		cpuctx->exclusive = 1;
-
-	return ret;
-}
-
-static void x86_event_sched_out(struct perf_event *event,
-			    struct perf_cpu_context *cpuctx)
-{
-	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->oncpu = -1;
-
-	if (!is_x86_event(event))
-		event->pmu->disable(event);
-
-	event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
-
-	if (!is_software_event(event))
-		cpuctx->active_oncpu--;
-
-	if (event->attr.exclusive || !cpuctx->active_oncpu)
-		cpuctx->exclusive = 0;
-}
-
-/*
- * Called to enable a whole group of events.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- *
- * called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
- */
-int hw_perf_group_sched_in(struct perf_event *leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct perf_event *sub;
-	int assign[X86_PMC_IDX_MAX];
-	int n0, n1, ret;
-
-	if (!x86_pmu_initialized())
-		return 0;
-
-	/* n0 = total number of events */
-	n0 = collect_events(cpuc, leader, true);
-	if (n0 < 0)
-		return n0;
-
-	ret = x86_pmu.schedule_events(cpuc, n0, assign);
-	if (ret)
-		return ret;
-
-	ret = x86_event_sched_in(leader, cpuctx);
-	if (ret)
-		return ret;
-
-	n1 = 1;
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		if (sub->state > PERF_EVENT_STATE_OFF) {
-			ret = x86_event_sched_in(sub, cpuctx);
-			if (ret)
-				goto undo;
-			++n1;
-		}
-	}
-	/*
-	 * copy new assignment, now we know it is possible
-	 * will be used by hw_perf_enable()
-	 */
-	memcpy(cpuc->assign, assign, n0*sizeof(int));
-
-	cpuc->n_events  = n0;
-	cpuc->n_added  += n1;
-	ctx->nr_active += n1;
-
-	/*
-	 * 1 means successful and events are active
-	 * This is not quite true because we defer
-	 * actual activation until hw_perf_enable() but
-	 * this way we* ensure caller won't try to enable
-	 * individual events
-	 */
-	return 1;
-undo:
-	x86_event_sched_out(leader, cpuctx);
-	n0  = 1;
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
-			x86_event_sched_out(sub, cpuctx);
-			if (++n0 == n1)
-				break;
-		}
-	}
-	return ret;
-}
-
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
 #include "perf_event_p4.c"
@@ -1471,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
 	x86_perf_event_update(event);
 }
 
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ */
+static void x86_pmu_start_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ */
+static int x86_pmu_commit_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int assign[X86_PMC_IDX_MAX];
+	int n, ret;
+
+	n = cpuc->n_events;
+
+	if (!x86_pmu_initialized())
+		return -EAGAIN;
+
+	ret = x86_pmu.schedule_events(cpuc, n, assign);
+	if (ret)
+		return ret;
+
+	/*
+	 * copy new assignment, now we know it is possible
+	 * will be used by hw_perf_enable()
+	 */
+	memcpy(cpuc->assign, assign, n*sizeof(int));
+
+	return 0;
+}
+
 static const struct pmu pmu = {
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
@@ -1478,6 +1429,9 @@ static const struct pmu pmu = {
 	.stop		= x86_pmu_stop,
 	.read		= x86_pmu_read,
 	.unthrottle	= x86_pmu_unthrottle,
+	.start_txn	= x86_pmu_start_txn,
+	.cancel_txn	= x86_pmu_cancel_txn,
+	.commit_txn	= x86_pmu_commit_txn,
 };
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 2b107d93635616db0c3f893c8cc2e6d5cd8d77b2 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 7 May 2010 14:59:45 -0700
Subject: x86: Avoid check hlt for newer cpus

Check hlt instruction was targeted for some older CPUs. It is an expensive
operation in that it takes 4 ticks to break out the check.  We can avoid
such check completely for newer x86 cpus (family >= 5).

[ hpa: corrected family > 5 to family >= 5 ]

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273269585-14346-1-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a26521239..c39576cb301 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
-	if (paravirt_enabled())
+	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
 		return;
 
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
-- 
cgit v1.2.3-70-g09d2


From 9fa02317429449e8176c9bb6da3ac00eb14d52d3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Fri, 7 May 2010 16:55:41 -0700
Subject: x86, HyperV: fix up the license to mshyperv.c

This should have been GPLv2 only, we cut and pasted from the wrong file
originally, sorry.

Also removed some unneeded boilerplate license code, we all know where
to find the GPLv2, and that there's no warranty as that is implicit from
the license.

Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
LKML-Reference: <20100507235541.GA15448@kroah.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mshyperv.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 2443b61cdb1..a58d8e64fc7 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -6,18 +6,7 @@
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * the Free Software Foundation; version 2 of the License.
  *
  */
 
-- 
cgit v1.2.3-70-g09d2


From e08cae4181af9483b04ecfac48f01c8e5a5f27bf Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 7 May 2010 16:57:28 -0700
Subject: x86: Clean up the hypervisor layer

Clean up the hypervisor layer and the hypervisor drivers, using an ops
structure instead of an enumeration with if statements.

The identity of the hypervisor, if needed, can be tested by testing
the pointer value in x86_hyper.

The MS-HyperV private state is moved into a normal global variable
(it's per-system state, not per-CPU state).  Being a normal bss
variable, it will be left at all zero on non-HyperV platforms, and so
can generally be tested for HyperV-specific features without
additional qualification.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/include/asm/hyperv.h     |  5 ++--
 arch/x86/include/asm/hypervisor.h | 27 +++++++++++++++++--
 arch/x86/include/asm/mshyperv.h   | 15 ++++++++---
 arch/x86/include/asm/processor.h  |  7 -----
 arch/x86/include/asm/vmware.h     | 27 -------------------
 arch/x86/kernel/cpu/hypervisor.c  | 56 ++++++++++++++++++++++++---------------
 arch/x86/kernel/cpu/mshyperv.c    | 51 +++++++++++++++++------------------
 arch/x86/kernel/cpu/vmware.c      | 36 ++++++++++++++-----------
 8 files changed, 117 insertions(+), 107 deletions(-)
 delete mode 100644 arch/x86/include/asm/vmware.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index 46040473e12..5df477ac3af 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_KVM_HYPERV_H
-#define _ASM_X86_KVM_HYPERV_H
+#ifndef _ASM_X86_HYPERV_H
+#define _ASM_X86_HYPERV_H
 
 #include <linux/types.h>
 
@@ -16,6 +16,7 @@
 
 #define HYPERV_HYPERVISOR_PRESENT_BIT		0x80000000
 #define HYPERV_CPUID_MIN			0x40000005
+#define HYPERV_CPUID_MAX			0x4000ffff
 
 /*
  * Feature identification. EAX indicates which features are available
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index b78c0941e42..70abda7058c 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -17,10 +17,33 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  */
-#ifndef ASM_X86__HYPERVISOR_H
-#define ASM_X86__HYPERVISOR_H
+#ifndef _ASM_X86_HYPERVISOR_H
+#define _ASM_X86_HYPERVISOR_H
 
 extern void init_hypervisor(struct cpuinfo_x86 *c);
 extern void init_hypervisor_platform(void);
 
+/*
+ * x86 hypervisor information
+ */
+struct hypervisor_x86 {
+	/* Hypervisor name */
+	const char	*name;
+
+	/* Detection routine */
+	bool		(*detect)(void);
+
+	/* Adjust CPU feature bits (run once per CPU) */
+	void		(*set_cpu_features)(struct cpuinfo_x86 *);
+
+	/* Platform setup (run once per boot) */
+	void		(*init_platform)(void);
+};
+
+extern const struct hypervisor_x86 *x86_hyper;
+
+/* Recognized hypervisors */
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+
 #endif
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 6cd8101d134..79ce5685ab6 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -1,7 +1,14 @@
-#ifndef ASM_X86__MSHYPER_H
-#define ASM_X86__MSHYPER_H
+#ifndef _ASM_X86_MSHYPER_H
+#define _ASM_X86_MSHYPER_H
 
-int ms_hyperv_platform(void);
-void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c);
+#include <linux/types.h>
+#include <asm/hyperv.h>
+
+struct ms_hyperv_info {
+	u32 features;
+	u32 hints;
+};
+
+extern struct ms_hyperv_info ms_hyperv;
 
 #endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 597c041bd12..e4f1dfb2d05 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -113,9 +113,6 @@ struct cpuinfo_x86 {
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 #endif
-	unsigned int		x86_hyper_vendor;
-	/* The layout of this field is hypervisor specific */
-	unsigned int		x86_hyper_features;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
@@ -129,10 +126,6 @@ struct cpuinfo_x86 {
 
 #define X86_VENDOR_UNKNOWN	0xff
 
-#define X86_HYPER_VENDOR_NONE  0
-#define X86_HYPER_VENDOR_VMWARE 1
-#define X86_HYPER_VENDOR_MSFT	2
-
 /*
  * capabilities of CPUs
  */
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
deleted file mode 100644
index e49ed6d2fd4..00000000000
--- a/arch/x86/include/asm/vmware.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2008, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-#ifndef ASM_X86__VMWARE_H
-#define ASM_X86__VMWARE_H
-
-extern void vmware_platform_setup(void);
-extern int vmware_platform(void);
-extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
-
-#endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index de3f4e0ce8e..87381759d3c 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -22,40 +22,52 @@
  */
 
 #include <asm/processor.h>
-#include <asm/vmware.h>
-#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
-static inline void __cpuinit
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+/*
+ * Hypervisor detect order.  This is specified explicitly here because
+ * some hypervisors might implement compatibility modes for other
+ * hypervisors and therefore need to be detected in specific sequence.
+ */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-	if (vmware_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
-	else if (ms_hyperv_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
-	else
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
-}
+	&x86_hyper_vmware,
+	&x86_hyper_ms_hyperv,
+};
 
-static inline void __cpuinit
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+const struct hypervisor_x86 *x86_hyper;
+
+static inline void __init
+detect_hypervisor_vendor(void)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_set_feature_bits(c);
-	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
-		ms_hyperv_set_feature_bits(c);
-	return;
+	const struct hypervisor_x86 *h, * const *p;
+
+	for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+		h = *p;
+		if (h->detect()) {
+			x86_hyper = h;
+			printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
+			break;
+		}
+	}
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
 {
-	detect_hypervisor_vendor(c);
-	hypervisor_set_feature_bits(c);
+	if (x86_hyper && x86_hyper->set_cpu_features)
+		x86_hyper->set_cpu_features(c);
 }
 
 void __init init_hypervisor_platform(void)
 {
+
+	detect_hypervisor_vendor();
+
+	if (!x86_hyper)
+		return;
+
 	init_hypervisor(&boot_cpu_data);
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_platform_setup();
+
+	if (x86_hyper->init_platform)
+		x86_hyper->init_platform();
 }
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index a58d8e64fc7..5969c3ee318 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -12,45 +12,42 @@
 
 #include <linux/types.h>
 #include <asm/processor.h>
+#include <asm/hypervisor.h>
 #include <asm/hyperv.h>
 #include <asm/mshyperv.h>
 
+struct ms_hyperv_info ms_hyperv;
 
-int ms_hyperv_platform(void)
+static bool __init ms_hyperv_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-	char hyp_signature[13];
+	u32 eax;
+	u32 hyp_signature[3];
 
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
-		return 0;
+	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return false;
 
-	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
-	*(u32 *)(hyp_signature + 0) = ebx;
-	*(u32 *)(hyp_signature + 4) = ecx;
-	*(u32 *)(hyp_signature + 8) = edx;
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+	      &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
 
-	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
-		return 0;
-	return 1;
+	return eax >= HYPERV_CPUID_MIN &&
+		eax <= HYPERV_CPUID_MAX &&
+		!memcmp("Microsoft Hv", hyp_signature, 12);
 }
 
-void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+static void __init ms_hyperv_init_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-
-	c->x86_hyper_features = 0;
 	/*
-	 * Extract the features, recommendations etc.
-	 * The first 9 bits will be used to track hypervisor features.
-	 * The next 6 bits will be used to track the hypervisor
-	 * recommendations.
+	 * Extract the features and hints
 	 */
-	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= (eax & 0x1ff);
+	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= ((eax & 0x3f) << 9);
-	printk(KERN_INFO "Detected HyperV with features: %x\n",
-		c->x86_hyper_features);
+	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
+	       ms_hyperv.features, ms_hyperv.hints);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+	.name			= "Microsoft HyperV",
+	.detect			= ms_hyperv_platform,
+	.init_platform		= ms_hyperv_init_platform,
+};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59c..46a5b5d3ba5 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -23,8 +23,8 @@
 
 #include <linux/dmi.h>
 #include <asm/div64.h>
-#include <asm/vmware.h>
 #include <asm/x86_init.h>
+#include <asm/hypervisor.h>
 
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
@@ -64,7 +64,7 @@ static unsigned long vmware_get_tsc_khz(void)
 	return tsc_hz;
 }
 
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
 
@@ -82,24 +82,21 @@ void __init vmware_platform_setup(void)
  * serial key should be enough, as this will always have a VMware
  * specific string when running under VMware hypervisor.
  */
-int vmware_platform(void)
+static bool __init vmware_platform(void)
 {
 	if (cpu_has_hypervisor) {
-		unsigned int eax, ebx, ecx, edx;
-		char hyper_vendor_id[13];
-
-		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
-		memcpy(hyper_vendor_id + 0, &ebx, 4);
-		memcpy(hyper_vendor_id + 4, &ecx, 4);
-		memcpy(hyper_vendor_id + 8, &edx, 4);
-		hyper_vendor_id[12] = '\0';
-		if (!strcmp(hyper_vendor_id, "VMwareVMware"))
-			return 1;
+		unsigned int eax;
+		unsigned int hyper_vendor_id[3];
+
+		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+		      &hyper_vendor_id[1], &hyper_vendor_id[2]);
+		if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+			return true;
 	} else if (dmi_available && dmi_name_in_serial("VMware") &&
 		   __vmware_platform())
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
 /*
@@ -114,8 +111,15 @@ int vmware_platform(void)
  * so that the kernel could just trust the hypervisor with providing a
  * reliable virtual TSC that is suitable for timekeeping.
  */
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
 {
 	set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 	set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+	.name			= "VMware",
+	.detect			= vmware_platform,
+	.set_cpu_features	= vmware_set_cpu_features,
+	.init_platform		= vmware_platform_setup,
+};
-- 
cgit v1.2.3-70-g09d2


From de902d967feb96f2dfddfbe9dbd69dc22fd5ebcb Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:39:52 +0400
Subject: x86, perf: P4 PMU -- configure predefined events

If an event is not RAW we should not exit p4_hw_config
early but call x86_setup_perfctr as well.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 9e002054cb5..b1f532d1d36 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -439,21 +439,20 @@ static int p4_hw_config(struct perf_event *event)
 	if (p4_ht_active() && p4_ht_thread(cpu))
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
-	if (event->attr.type != PERF_TYPE_RAW)
-		return 0;
-
-	/*
-	 * We don't control raw events so it's up to the caller
-	 * to pass sane values (and we don't count the thread number
-	 * on HT machine but allow HT-compatible specifics to be
-	 * passed on)
-	 *
-	 * XXX: HT wide things should check perf_paranoid_cpu() &&
-	 *      CAP_SYS_ADMIN
-	 */
-	event->hw.config |= event->attr.config &
-		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+	if (event->attr.type == PERF_TYPE_RAW) {
+		/*
+		 * We don't control raw events so it's up to the caller
+		 * to pass sane values (and we don't count the thread number
+		 * on HT machine but allow HT-compatible specifics to be
+		 * passed on)
+		 *
+		 * XXX: HT wide things should check perf_paranoid_cpu() &&
+		 *      CAP_SYS_ADMIN
+		 */
+		event->hw.config |= event->attr.config &
+			(p4_config_pack_escr(P4_ESCR_MASK_HT) |
+			 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+	}
 
 	return x86_setup_perfctr(event);
 }
-- 
cgit v1.2.3-70-g09d2


From 137351e0feeb9f25d99488ee1afc1c79f5499a9a Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:25:52 +0400
Subject: x86, perf: P4 PMU -- protect sensible procedures from preemption

Steven reported:

|
| I'm getting:
|
| Pid: 3477, comm: perf Not tainted 2.6.34-rc6 #2727
| Call Trace:
|  [<ffffffff811c7565>] debug_smp_processor_id+0xd5/0xf0
|  [<ffffffff81019874>] p4_hw_config+0x2b/0x15c
|  [<ffffffff8107acbc>] ? trace_hardirqs_on_caller+0x12b/0x14f
|  [<ffffffff81019143>] hw_perf_event_init+0x468/0x7be
|  [<ffffffff810782fd>] ? debug_mutex_init+0x31/0x3c
|  [<ffffffff810c68b2>] T.850+0x273/0x42e
|  [<ffffffff810c6cab>] sys_perf_event_open+0x23e/0x3f1
|  [<ffffffff81009e6a>] ? sysret_check+0x2e/0x69
|  [<ffffffff81009e32>] system_call_fastpath+0x16/0x1b
|
| When running perf record in latest tip/perf/core
|

Due to the fact that p4 counters are shared between HT threads
we synthetically divide the whole set of counters into two
non-intersected subsets. And while we're "borrowing" counters
from these subsets we should not be preempted (well, strictly
speaking in p4_hw_config we just pre-set reference to the
subset which allow to save some cycles in schedule routine
if it happens on the same cpu). So use get_cpu/put_cpu pair.

Also p4_pmu_schedule_events should use smp_processor_id rather
than raw_ version. This allow us to catch up preemption issue
(if there will ever be).

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112716.963478928@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b1f532d1d36..ca40180c41d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -421,7 +421,8 @@ static u64 p4_pmu_event_map(int hw_event)
 
 static int p4_hw_config(struct perf_event *event)
 {
-	int cpu = raw_smp_processor_id();
+	int cpu = get_cpu();
+	int rc = 0;
 	u32 escr, cccr;
 
 	/*
@@ -454,7 +455,10 @@ static int p4_hw_config(struct perf_event *event)
 			 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 	}
 
-	return x86_setup_perfctr(event);
+	rc = x86_setup_perfctr(event);
+	put_cpu();
+
+	return rc;
 }
 
 static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-- 
cgit v1.2.3-70-g09d2


From 3f51b7119d052827dbb0e40c966acdf2bdc6f47f Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:25:53 +0400
Subject: x86, perf: P4 PMU -- Get rid of redundant check for array index

The caller already has done such a check.
And it was wrong anyway, it had to be '>=' rather than '>'

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112717.130386882@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ca40180c41d..b8c2d379eea 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -406,11 +406,6 @@ static u64 p4_pmu_event_map(int hw_event)
 	unsigned int esel;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_general_events)) {
-		printk_once(KERN_ERR "P4 PMU: Bad index: %i\n", hw_event);
-		return 0;
-	}
-
 	config = p4_general_events[hw_event];
 	bind = p4_config_get_bind(config);
 	esel = P4_OPCODE_ESEL(bind->opcode);
-- 
cgit v1.2.3-70-g09d2


From c7993165ef0c1d636ca05f4787739f8414584e6d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:25:54 +0400
Subject: x86, perf: P4 PMU -- check for proper event index in RAW events

RAW events are special and we should be ready for user passing
in insane event index values.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112717.315897547@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b8c2d379eea..a603930271f 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -418,6 +418,7 @@ static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = get_cpu();
 	int rc = 0;
+	unsigned int evnt;
 	u32 escr, cccr;
 
 	/*
@@ -436,6 +437,14 @@ static int p4_hw_config(struct perf_event *event)
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
 	if (event->attr.type == PERF_TYPE_RAW) {
+
+		/* user data may have out-of-bound event index */
+		evnt = p4_config_unpack_event(event->attr.config);
+		if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
 		/*
 		 * We don't control raw events so it's up to the caller
 		 * to pass sane values (and we don't count the thread number
@@ -451,8 +460,8 @@ static int p4_hw_config(struct perf_event *event)
 	}
 
 	rc = x86_setup_perfctr(event);
+out:
 	put_cpu();
-
 	return rc;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 96f6e775b58687d85ee33004d414419b5ec34106 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 9 May 2010 01:10:34 -0700
Subject: x86, hypervisor: Export the x86_hyper* symbols

Export x86_hyper and the related specific structures, allowing for
hypervisor identification by modules.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 arch/x86/kernel/cpu/vmware.c     | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 87381759d3c..4afb5a2130e 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -36,6 +36,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 };
 
 const struct hypervisor_x86 *x86_hyper;
+EXPORT_SYMBOL(x86_hyper);
 
 static inline void __init
 detect_hypervisor_vendor(void)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5969c3ee318..0f1371724c8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -51,3 +51,4 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
 	.detect			= ms_hyperv_platform,
 	.init_platform		= ms_hyperv_init_platform,
 };
+EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 265b432f6e6..b9d1ff58844 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -99,7 +99,6 @@ static bool __init vmware_platform(void)
 
 	return false;
 }
-EXPORT_SYMBOL(vmware_platform);
 
 /*
  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -125,3 +124,4 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = {
 	.set_cpu_features	= vmware_set_cpu_features,
 	.init_platform		= vmware_platform_setup,
 };
+EXPORT_SYMBOL(x86_hyper_vmware);
-- 
cgit v1.2.3-70-g09d2


From 3998d095354d2a3062bdaa821ef07a1e1c82873c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 9 May 2010 22:46:54 -0700
Subject: x86, hypervisor: add missing <linux/module.h>

EXPORT_SYMBOL() needs <linux/module.h> to be included; fixes modular
builds of the VMware balloon driver, and any future modular drivers
which depends on the hypervisor.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 4afb5a2130e..dd531cc56a8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,6 +21,7 @@
  *
  */
 
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0f1371724c8..16f41bbe46b 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv.h>
-- 
cgit v1.2.3-70-g09d2


From c9ad488289144ae5ef53b012e15895ef1f5e4bb6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 6 May 2010 11:45:45 +0300
Subject: x86: Eliminate TS_XSAVE

The fpu code currently uses current->thread_info->status & TS_XSAVE as
a way to distinguish between XSAVE capable processors and older processors.
The decision is not really task specific; instead we use the task status to
avoid a global memory reference - the value should be the same across all
threads.

Eliminate this tie-in into the task structure by using an alternative
instruction keyed off the XSAVE cpu feature; this results in shorter and
faster code, without introducing a global memory reference.

[ hpa: in the future, this probably should use an asm jmp ]

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/i387.h        | 20 ++++++++++++++++----
 arch/x86/include/asm/thread_info.h |  1 -
 arch/x86/kernel/cpu/common.c       |  5 +----
 arch/x86/kernel/i387.c             |  5 +----
 arch/x86/kernel/xsave.c            |  6 +++---
 5 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index da293092450..a301a6825c3 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -56,6 +56,18 @@ extern int restore_i387_xstate_ia32(void __user *buf);
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
 
+static inline bool use_xsave(void)
+{
+	u8 has_xsave;
+
+	alternative_io("mov $0, %0",
+		       "mov $1, %0",
+		       X86_FEATURE_XSAVE,
+		       "=g"(has_xsave));
+
+	return has_xsave;
+}
+
 #ifdef CONFIG_X86_64
 
 /* Ignore delayed exceptions from user space */
@@ -99,7 +111,7 @@ static inline void clear_fpu_state(struct task_struct *tsk)
 	/*
 	 * xsave header may indicate the init state of the FP.
 	 */
-	if ((task_thread_info(tsk)->status & TS_XSAVE) &&
+	if (use_xsave() &&
 	    !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
 		return;
 
@@ -164,7 +176,7 @@ static inline void fxsave(struct task_struct *tsk)
 
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		xsave(tsk);
 	else
 		fxsave(tsk);
@@ -218,7 +230,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
  */
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
+	if (use_xsave()) {
 		struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
 		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
 
@@ -266,7 +278,7 @@ end:
 
 static inline int restore_fpu_checking(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		return xrstor_checking(&tsk->thread.xstate->xsave);
 	else
 		return fxrstor_checking(&tsk->thread.xstate->fxsave);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e96..e9e341505ab 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -244,7 +244,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
-#define TS_XSAVE		0x0010	/* Use xsave/xrstor */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951e..c1c00d0b169 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1243,10 +1243,7 @@ void __cpuinit cpu_init(void)
 	/*
 	 * Force FPU initialization:
 	 */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c28548..14ca1dc7a70 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,10 +102,7 @@ void __cpuinit fpu_init(void)
 
 	mxcsr_feature_mask_init();
 	/* clean state in init */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 }
 #endif	/* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec..c1b0a11033a 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
 		if (err)
 			return err;
 
-		if (task_thread_info(tsk)->status & TS_XSAVE)
+		if (use_xsave())
 			err = xsave_user(buf);
 		else
 			err = fxsave_user(buf);
@@ -116,7 +116,7 @@ int save_i387_xstate(void __user *buf)
 
 	clear_used_math(); /* trigger finit */
 
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
+	if (use_xsave()) {
 		struct _fpstate __user *fx = buf;
 		struct _xstate __user *x = buf;
 		u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
 		clts();
 		task_thread_info(current)->status |= TS_USEDFPU;
 	}
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		err = restore_user_xstate(buf);
 	else
 		err = fxrstor_checking((__force struct i387_fxsave_struct *)
-- 
cgit v1.2.3-70-g09d2


From 86603283326c9e95e5ad4e9fdddeec93cac5d9ad Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 6 May 2010 11:45:46 +0300
Subject: x86: Introduce 'struct fpu' and related API

Currently all fpu state access is through tsk->thread.xstate.  Since we wish
to generalize fpu access to non-task contexts, wrap the state in a new
'struct fpu' and convert existing access to use an fpu API.

Signal frame handlers are not converted to the API since they will remain
task context only things.

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-3-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/i387.h      | 115 +++++++++++++++++++++++++++++----------
 arch/x86/include/asm/processor.h |   6 +-
 arch/x86/include/asm/xsave.h     |   7 ++-
 arch/x86/kernel/i387.c           | 102 +++++++++++++++++-----------------
 arch/x86/kernel/process.c        |  21 +++----
 arch/x86/kernel/process_32.c     |   2 +-
 arch/x86/kernel/process_64.c     |   2 +-
 arch/x86/kernel/xsave.c          |   2 +-
 arch/x86/math-emu/fpu_aux.c      |   6 +-
 9 files changed, 160 insertions(+), 103 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a301a6825c3..1a8cca33b73 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -16,6 +16,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/regset.h>
 #include <linux/hardirq.h>
+#include <linux/slab.h>
 #include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
@@ -103,10 +104,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
    values. The kernel data segment can be sometimes 0 and sometimes
    new user value. Both should be ok.
    Use the PDA as safe address because it should be already in L1. */
-static inline void clear_fpu_state(struct task_struct *tsk)
+static inline void fpu_clear(struct fpu *fpu)
 {
-	struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
-	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+	struct xsave_struct *xstate = &fpu->state->xsave;
+	struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
 	/*
 	 * xsave header may indicate the init state of the FP.
@@ -123,6 +124,11 @@ static inline void clear_fpu_state(struct task_struct *tsk)
 			  X86_FEATURE_FXSAVE_LEAK);
 }
 
+static inline void clear_fpu_state(struct task_struct *tsk)
+{
+	fpu_clear(&tsk->thread.fpu);
+}
+
 static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 {
 	int err;
@@ -147,7 +153,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 	return err;
 }
 
-static inline void fxsave(struct task_struct *tsk)
+static inline void fpu_fxsave(struct fpu *fpu)
 {
 	/* Using "rex64; fxsave %0" is broken because, if the memory operand
 	   uses any extended registers for addressing, a second REX prefix
@@ -157,42 +163,45 @@ static inline void fxsave(struct task_struct *tsk)
 	/* Using "fxsaveq %0" would be the ideal choice, but is only supported
 	   starting with gas 2.16. */
 	__asm__ __volatile__("fxsaveq %0"
-			     : "=m" (tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave));
 #elif 0
 	/* Using, as a workaround, the properly prefixed form below isn't
 	   accepted by any binutils version so far released, complaining that
 	   the same type of prefix is used twice if an extended register is
 	   needed for addressing (fix submitted to mainline 2005-11-21). */
 	__asm__ __volatile__("rex64/fxsave %0"
-			     : "=m" (tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave));
 #else
 	/* This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
 	__asm__ __volatile__("rex64/fxsave (%1)"
-			     : "=m" (tsk->thread.xstate->fxsave)
-			     : "cdaSDb" (&tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave)
+			     : "cdaSDb" (&fpu->state->fxsave));
 #endif
 }
 
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline void fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave())
-		xsave(tsk);
+		fpu_xsave(fpu);
 	else
-		fxsave(tsk);
+		fpu_fxsave(fpu);
 
-	clear_fpu_state(tsk);
+	fpu_clear(fpu);
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+	fpu_save_init(&tsk->thread.fpu);
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
 #else  /* CONFIG_X86_32 */
 
 #ifdef CONFIG_MATH_EMULATION
-extern void finit_task(struct task_struct *tsk);
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
 #else
-static inline void finit_task(struct task_struct *tsk)
-{
-}
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
 static inline void tolerant_fwait(void)
@@ -228,13 +237,13 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 /*
  * These must be called with preempt disabled
  */
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline void fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave()) {
-		struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
-		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+		struct xsave_struct *xstate = &fpu->state->xsave;
+		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
-		xsave(tsk);
+		fpu_xsave(fpu);
 
 		/*
 		 * xsave header may indicate the init state of the FP.
@@ -258,8 +267,8 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 		"fxsave %[fx]\n"
 		"bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
 		X86_FEATURE_FXSR,
-		[fx] "m" (tsk->thread.xstate->fxsave),
-		[fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
+		[fx] "m" (fpu->state->fxsave),
+		[fsw] "m" (fpu->state->fxsave.swd) : "memory");
 clear_state:
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
 	   is pending.  Clear the x87 state here by setting it to fixed
@@ -271,17 +280,34 @@ clear_state:
 		X86_FEATURE_FXSAVE_LEAK,
 		[addr] "m" (safe_address));
 end:
+	;
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+	fpu_save_init(&tsk->thread.fpu);
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
+
 #endif	/* CONFIG_X86_64 */
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
+static inline int fpu_fxrstor_checking(struct fpu *fpu)
+{
+	return fxrstor_checking(&fpu->state->fxsave);
+}
+
+static inline int fpu_restore_checking(struct fpu *fpu)
 {
 	if (use_xsave())
-		return xrstor_checking(&tsk->thread.xstate->xsave);
+		return fpu_xrstor_checking(fpu);
 	else
-		return fxrstor_checking(&tsk->thread.xstate->fxsave);
+		return fpu_fxrstor_checking(fpu);
+}
+
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+	return fpu_restore_checking(&tsk->thread.fpu);
 }
 
 /*
@@ -409,30 +435,59 @@ static inline void clear_fpu(struct task_struct *tsk)
 static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
 {
 	if (cpu_has_fxsr) {
-		return tsk->thread.xstate->fxsave.cwd;
+		return tsk->thread.fpu.state->fxsave.cwd;
 	} else {
-		return (unsigned short)tsk->thread.xstate->fsave.cwd;
+		return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
 	}
 }
 
 static inline unsigned short get_fpu_swd(struct task_struct *tsk)
 {
 	if (cpu_has_fxsr) {
-		return tsk->thread.xstate->fxsave.swd;
+		return tsk->thread.fpu.state->fxsave.swd;
 	} else {
-		return (unsigned short)tsk->thread.xstate->fsave.swd;
+		return (unsigned short)tsk->thread.fpu.state->fsave.swd;
 	}
 }
 
 static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
 {
 	if (cpu_has_xmm) {
-		return tsk->thread.xstate->fxsave.mxcsr;
+		return tsk->thread.fpu.state->fxsave.mxcsr;
 	} else {
 		return MXCSR_DEFAULT;
 	}
 }
 
+static bool fpu_allocated(struct fpu *fpu)
+{
+	return fpu->state != NULL;
+}
+
+static inline int fpu_alloc(struct fpu *fpu)
+{
+	if (fpu_allocated(fpu))
+		return 0;
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	if (!fpu->state)
+		return -ENOMEM;
+	WARN_ON((unsigned long)fpu->state & 15);
+	return 0;
+}
+
+static inline void fpu_free(struct fpu *fpu)
+{
+	if (fpu->state) {
+		kmem_cache_free(task_xstate_cachep, fpu->state);
+		fpu->state = NULL;
+	}
+}
+
+static inline void fpu_copy(struct fpu *dst, struct fpu *src)
+{
+	memcpy(dst->state, src->state, xstate_size);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703..b684f587647 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -380,6 +380,10 @@ union thread_xstate {
 	struct xsave_struct		xsave;
 };
 
+struct fpu {
+	union thread_xstate *state;
+};
+
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
 
@@ -457,7 +461,7 @@ struct thread_struct {
 	unsigned long		trap_no;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
-	union thread_xstate	*xstate;
+	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
 	struct vm86_struct __user *vm86_info;
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index ddc04ccad03..2c4390cae22 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -37,8 +37,9 @@ extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
 			    void __user *fpstate,
 			    struct _fpx_sw_bytes *sw);
 
-static inline int xrstor_checking(struct xsave_struct *fx)
+static inline int fpu_xrstor_checking(struct fpu *fpu)
 {
+	struct xsave_struct *fx = &fpu->state->xsave;
 	int err;
 
 	asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
@@ -110,12 +111,12 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
 		     :   "memory");
 }
 
-static inline void xsave(struct task_struct *tsk)
+static inline void fpu_xsave(struct fpu *fpu)
 {
 	/* This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
 	__asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
-			     : : "D" (&(tsk->thread.xstate->xsave)),
+			     : : "D" (&(fpu->state->xsave)),
 				 "a" (-1), "d"(-1) : "memory");
 }
 #endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 14ca1dc7a70..86cef6b3225 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,57 +107,57 @@ void __cpuinit fpu_init(void)
 }
 #endif	/* CONFIG_X86_64 */
 
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
+static void fpu_finit(struct fpu *fpu)
 {
-	if (tsk_used_math(tsk)) {
-		if (HAVE_HWFP && tsk == current)
-			unlazy_fpu(tsk);
-		return 0;
-	}
-
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	if (!tsk->thread.xstate) {
-		tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!tsk->thread.xstate)
-			return -ENOMEM;
-	}
-
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
-		memset(tsk->thread.xstate, 0, xstate_size);
-		finit_task(tsk);
-		set_stopped_child_used_math(tsk);
-		return 0;
+		finit_soft_fpu(&fpu->state->soft);
+		return;
 	}
 #endif
 
 	if (cpu_has_fxsr) {
-		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
 		memset(fx, 0, xstate_size);
 		fx->cwd = 0x37f;
 		if (cpu_has_xmm)
 			fx->mxcsr = MXCSR_DEFAULT;
 	} else {
-		struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+		struct i387_fsave_struct *fp = &fpu->state->fsave;
 		memset(fp, 0, xstate_size);
 		fp->cwd = 0xffff037fu;
 		fp->swd = 0xffff0000u;
 		fp->twd = 0xffffffffu;
 		fp->fos = 0xffff0000u;
 	}
+}
+
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it and set the mxcsr to its default
+ * value at reset if we support XMM instructions and then
+ * remeber the current task has used the FPU.
+ */
+int init_fpu(struct task_struct *tsk)
+{
+	int ret;
+
+	if (tsk_used_math(tsk)) {
+		if (HAVE_HWFP && tsk == current)
+			unlazy_fpu(tsk);
+		return 0;
+	}
+
 	/*
-	 * Only the device not available exception or ptrace can call init_fpu.
+	 * Memory allocation at the first usage of the FPU and other state.
 	 */
+	ret = fpu_alloc(&tsk->thread.fpu);
+	if (ret)
+		return ret;
+
+	fpu_finit(&tsk->thread.fpu);
+
 	set_stopped_child_used_math(tsk);
 	return 0;
 }
@@ -191,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				   &target->thread.xstate->fxsave, 0, -1);
+				   &target->thread.fpu.state->fxsave, 0, -1);
 }
 
 int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -208,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->fxsave, 0, -1);
+				 &target->thread.fpu.state->fxsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
 	/*
 	 * update the header bits in the xsave header, indicating the
 	 * presence of FP and SSE state.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	return ret;
 }
@@ -243,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	 * memory layout in the thread struct, so that we can copy the entire
 	 * xstateregs to the user using one user_regset_copyout().
 	 */
-	memcpy(&target->thread.xstate->fxsave.sw_reserved,
+	memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
 	       xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
 
 	/*
 	 * Copy the xstate memory layout.
 	 */
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				  &target->thread.xstate->xsave, 0, -1);
+				  &target->thread.fpu.state->xsave, 0, -1);
 	return ret;
 }
 
@@ -269,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->xsave, 0, -1);
+				 &target->thread.fpu.state->xsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
-	xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+	xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
 
 	xsave_hdr->xstate_bv &= pcntxt_mask;
 	/*
@@ -362,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
 static void
 convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -402,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
 			    const struct user_i387_ia32_struct *env)
 
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -442,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.xstate->fsave, 0,
+					   &target->thread.fpu.state->fsave, 0,
 					   -1);
 	}
 
@@ -472,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.xstate->fsave, 0, -1);
+					  &target->thread.fpu.state->fsave, 0, -1);
 	}
 
 	if (pos > 0 || count < sizeof(env))
@@ -487,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
 	return ret;
 }
 
@@ -498,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+	struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
 
 	fp->status = fp->swd;
 	if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -509,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
 	struct user_i387_ia32_struct env;
 	int err = 0;
 
@@ -544,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
 	 * header as well as change any contents in the memory layout.
 	 * xrestore as part of sigreturn will capture all the changes.
 	 */
-	tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+	tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	if (save_i387_fxsave(fx) < 0)
 		return -1;
@@ -596,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
 
-	return __copy_from_user(&tsk->thread.xstate->fsave, buf,
+	return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
 				sizeof(struct i387_fsave_struct));
 }
 
@@ -607,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
 	struct user_i387_ia32_struct env;
 	int err;
 
-	err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
+	err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
 			       size);
 	/* mxcsr reserved bits must be masked to zero for security reasons */
-	tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 	if (err || __copy_from_user(&env, buf, sizeof(env)))
 		return 1;
 	convert_to_fxsr(tsk, &env);
@@ -626,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
 	struct i387_fxsave_struct __user *fx =
 		(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
 	struct xsave_hdr_struct *xsave_hdr =
-				&current->thread.xstate->xsave.xsave_hdr;
+				&current->thread.fpu.state->xsave.xsave_hdr;
 	u64 mask;
 	int err;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b9..f18fd9c1524 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,25 +32,22 @@ struct kmem_cache *task_xstate_cachep;
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
+	int ret;
+
 	*dst = *src;
-	if (src->thread.xstate) {
-		dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!dst->thread.xstate)
-			return -ENOMEM;
-		WARN_ON((unsigned long)dst->thread.xstate & 15);
-		memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+	if (fpu_allocated(&src->thread.fpu)) {
+		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
+		ret = fpu_alloc(&dst->thread.fpu);
+		if (ret)
+			return ret;
+		fpu_copy(&dst->thread.fpu, &src->thread.fpu);
 	}
 	return 0;
 }
 
 void free_thread_xstate(struct task_struct *tsk)
 {
-	if (tsk->thread.xstate) {
-		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
-		tsk->thread.xstate = NULL;
-	}
-
+	fpu_free(&tsk->thread.fpu);
 	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30..0a7a4f5bbaa 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -317,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf..979215f5198 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -396,7 +396,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c1b0a11033a..37e68fc5e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -109,7 +109,7 @@ int save_i387_xstate(void __user *buf)
 		task_thread_info(tsk)->status &= ~TS_USEDFPU;
 		stts();
 	} else {
-		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
 				   xstate_size))
 			return -1;
 	}
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index aa098708877..62797f93051 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,10 +30,10 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit_task(struct task_struct *tsk)
+void finit_soft_fpu(struct i387_soft_struct *soft)
 {
-	struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
 	struct address *oaddr, *iaddr;
+	memset(soft, 0, sizeof(*soft));
 	soft->cwd = 0x037f;
 	soft->swd = 0;
 	soft->ftop = 0;	/* We don't keep top in the status word internally. */
@@ -52,7 +52,7 @@ void finit_task(struct task_struct *tsk)
 
 void finit(void)
 {
-	finit_task(current);
+	finit_task(&current->thread.fpu);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 829e92458532b1dbfeb972435d45bb060cdbf5a3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 27 Apr 2010 18:33:49 -0400
Subject: kprobes/x86: Fix removed int3 checking order

Fix kprobe/x86 to check removed int3 when failing to get kprobe
from hlist. Since we have a time window between checking int3
exists on probed address and getting kprobe on that address,
we can have following scenario:

 -------
 CPU1                     CPU2
 hit int3
 check int3 exists
                          remove int3
                          remove kprobe from hlist
 get kprobe from hlist
 no kprobe->OOPS!
 -------

This patch moves int3 checking if there is no kprobe on that
address for fixing this problem as follows:

 ------
 CPU1                     CPU2
 hit int3
                          remove int3
                          remove kprobe from hlist
 get kprobe from hlist
 no kprobe->check int3 exists
          ->rollback&retry
 ------

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100427223348.2322.9112.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c..1658efdfb4e 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -534,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	struct kprobe_ctlblk *kcb;
 
 	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
-	if (*addr != BREAKPOINT_INSTRUCTION) {
-		/*
-		 * The breakpoint instruction was removed right
-		 * after we hit it.  Another cpu has removed
-		 * either a probepoint or a debugger breakpoint
-		 * at this address.  In either case, no further
-		 * handling of this interrupt is appropriate.
-		 * Back up over the (now missing) int3 and run
-		 * the original instruction.
-		 */
-		regs->ip = (unsigned long)addr;
-		return 1;
-	}
-
 	/*
 	 * We don't want to be preempted for the entire
 	 * duration of kprobe processing. We conditionally
@@ -579,6 +565,19 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 				setup_singlestep(p, regs, kcb, 0);
 			return 1;
 		}
+	} else if (*addr != BREAKPOINT_INSTRUCTION) {
+		/*
+		 * The breakpoint instruction was removed right
+		 * after we hit it.  Another cpu has removed
+		 * either a probepoint or a debugger breakpoint
+		 * at this address.  In either case, no further
+		 * handling of this interrupt is appropriate.
+		 * Back up over the (now missing) int3 and run
+		 * the original instruction.
+		 */
+		regs->ip = (unsigned long)addr;
+		preempt_enable_no_resched();
+		return 1;
 	} else if (kprobe_running()) {
 		p = __get_cpu_var(current_kprobe);
 		if (p->break_handler && p->break_handler(p, regs)) {
-- 
cgit v1.2.3-70-g09d2


From a52357259680fe5368c2fabf5949209e231f2aa2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 11 May 2010 17:12:33 +0200
Subject: x86/amd-iommu: Add amd_iommu=off command line option

This patch adds a command line option to tell the AMD IOMMU
driver to not initialize any IOMMU it finds.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 Documentation/kernel-parameters.txt | 2 ++
 arch/x86/kernel/amd_iommu_init.c    | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 839b21b0699..0c6c56076d1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -324,6 +324,8 @@ and is between 256 and 4096 characters. It is defined in the file
 				    they are unmapped. Otherwise they are
 				    flushed before they will be reused, which
 				    is a lot of faster
+			off	  - do not initialize any AMD IOMMU found in
+				    the system
 
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d..3bacb4d0844 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
 bool amd_iommu_dump;
 
 static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
 
 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
@@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void)
 	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 
+	if (amd_iommu_disabled)
+		return;
+
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
@@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
 	for (; *str; ++str) {
 		if (strncmp(str, "fullflush", 9) == 0)
 			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
 	}
 
 	return 1;
-- 
cgit v1.2.3-70-g09d2


From 720019908fd5a1bb442bb0a35a6027ba21864d25 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 12 May 2010 21:42:42 +0400
Subject: x86, perf: P4 PMU -- use hash for p4_get_escr_idx()

Linear search over all p4 MSRs should be fine if only
we would not use it in events scheduling routine which
is pretty time critical. Lets use hashes. It should speed
scheduling up significantly.

v2: Steven proposed to use more gentle approach than issue
    BUG on error, so we use WARN_ONCE now

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100512174242.GA5190@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 126 ++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 55 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a603930271f..cb875b1e2e8 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -668,66 +668,80 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	}
 }
 
-/* ESCRs are not sequential in memory so we need a map */
-static const unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
-	MSR_P4_ALF_ESCR0,	/*  0 */
-	MSR_P4_ALF_ESCR1,	/*  1 */
-	MSR_P4_BPU_ESCR0,	/*  2 */
-	MSR_P4_BPU_ESCR1,	/*  3 */
-	MSR_P4_BSU_ESCR0,	/*  4 */
-	MSR_P4_BSU_ESCR1,	/*  5 */
-	MSR_P4_CRU_ESCR0,	/*  6 */
-	MSR_P4_CRU_ESCR1,	/*  7 */
-	MSR_P4_CRU_ESCR2,	/*  8 */
-	MSR_P4_CRU_ESCR3,	/*  9 */
-	MSR_P4_CRU_ESCR4,	/* 10 */
-	MSR_P4_CRU_ESCR5,	/* 11 */
-	MSR_P4_DAC_ESCR0,	/* 12 */
-	MSR_P4_DAC_ESCR1,	/* 13 */
-	MSR_P4_FIRM_ESCR0,	/* 14 */
-	MSR_P4_FIRM_ESCR1,	/* 15 */
-	MSR_P4_FLAME_ESCR0,	/* 16 */
-	MSR_P4_FLAME_ESCR1,	/* 17 */
-	MSR_P4_FSB_ESCR0,	/* 18 */
-	MSR_P4_FSB_ESCR1,	/* 19 */
-	MSR_P4_IQ_ESCR0,	/* 20 */
-	MSR_P4_IQ_ESCR1,	/* 21 */
-	MSR_P4_IS_ESCR0,	/* 22 */
-	MSR_P4_IS_ESCR1,	/* 23 */
-	MSR_P4_ITLB_ESCR0,	/* 24 */
-	MSR_P4_ITLB_ESCR1,	/* 25 */
-	MSR_P4_IX_ESCR0,	/* 26 */
-	MSR_P4_IX_ESCR1,	/* 27 */
-	MSR_P4_MOB_ESCR0,	/* 28 */
-	MSR_P4_MOB_ESCR1,	/* 29 */
-	MSR_P4_MS_ESCR0,	/* 30 */
-	MSR_P4_MS_ESCR1,	/* 31 */
-	MSR_P4_PMH_ESCR0,	/* 32 */
-	MSR_P4_PMH_ESCR1,	/* 33 */
-	MSR_P4_RAT_ESCR0,	/* 34 */
-	MSR_P4_RAT_ESCR1,	/* 35 */
-	MSR_P4_SAAT_ESCR0,	/* 36 */
-	MSR_P4_SAAT_ESCR1,	/* 37 */
-	MSR_P4_SSU_ESCR0,	/* 38 */
-	MSR_P4_SSU_ESCR1,	/* 39 */
-	MSR_P4_TBPU_ESCR0,	/* 40 */
-	MSR_P4_TBPU_ESCR1,	/* 41 */
-	MSR_P4_TC_ESCR0,	/* 42 */
-	MSR_P4_TC_ESCR1,	/* 43 */
-	MSR_P4_U2L_ESCR0,	/* 44 */
-	MSR_P4_U2L_ESCR1,	/* 45 */
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE		0x000003a0
+#define P4_ESCR_MSR_MAX			0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE		(P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr)		(msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr)	[P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
 };
 
 static int p4_get_escr_idx(unsigned int addr)
 {
-	unsigned int i;
+	unsigned int idx = P4_ESCR_MSR_IDX(addr);
 
-	for (i = 0; i < ARRAY_SIZE(p4_escr_map); i++) {
-		if (addr == p4_escr_map[i])
-			return i;
+	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+			!p4_escr_table[idx])) {
+		WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+		return -1;
 	}
 
-	return -1;
+	return idx;
 }
 
 static int p4_next_cntr(int thread, unsigned long *used_mask,
@@ -747,7 +761,7 @@ static int p4_next_cntr(int thread, unsigned long *used_mask,
 static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
+	unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
 	int cpu = raw_smp_processor_id();
 	struct hw_perf_event *hwc;
 	struct p4_event_bind *bind;
@@ -755,7 +769,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 	int cntr_idx, escr_idx;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
+	bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
 
 	for (i = 0, num = n; i < n; i++, num--) {
 
@@ -763,6 +777,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 		thread = p4_ht_thread(cpu);
 		bind = p4_config_get_bind(hwc->config);
 		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+		if (unlikely(escr_idx == -1))
+			goto done;
 
 		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
 			cntr_idx = hwc->idx;
-- 
cgit v1.2.3-70-g09d2


From f01487119dda3d9f58c9729c7361ecc50a61c188 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Tue, 27 Apr 2010 12:13:48 +0200
Subject: x86, amd: Check X86_FEATURE_OSVW bit before accessing OSVW MSRs

If host CPU is exposed to a guest the OSVW MSRs are not guaranteed
to be present and a GP fault occurs. Thus checking the feature flag is
essential.

Cc: <stable@kernel.org> # .32.x .33.x
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <20100427101348.GC4489@alberich.amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/process.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b9..0415c3ef91b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
 		 * check OSVW bit for CPUs that are not affected
 		 * by erratum #400
 		 */
-		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
-		if (val >= 2) {
-			rdmsrl(MSR_AMD64_OSVW_STATUS, val);
-			if (!(val & BIT(1)))
-				goto no_c1e_idle;
+		if (cpu_has(c, X86_FEATURE_OSVW)) {
+			rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+			if (val >= 2) {
+				rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+				if (!(val & BIT(1)))
+					goto no_c1e_idle;
+			}
 		}
 		return 1;
 	}
-- 
cgit v1.2.3-70-g09d2


From 7f284d3cc96e02468a42e045f77af11e5ff8b095 Mon Sep 17 00:00:00 2001
From: Frank Arnold <frank.arnold@amd.com>
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

[ hpa: backported to 2.6.34 ]

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a5..95962a93f99 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -340,6 +340,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     (boot_cpu_data.x86_mask  < 0x1)))
 		return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.2.3-70-g09d2


From 1ff3d7d79204612ebe2e611d2592f8898908ca00 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 14 May 2010 23:08:15 +0400
Subject: x86, perf: P4 PMU - fix counters management logic

Jaswinder reported this #GP:

 |
 | Message from syslogd@ht at May 14 09:39:32 ...
 | kernel:[  314.908612] EIP: [<c100ccca>]
 | x86_perf_event_set_period+0x19d/0x1b2 SS:ESP 0068:edac3d70
 |

Ming has narrowed it down to a comparision issue
between arguments with different sizes and
signs. As result event index reached a wrong
value which in turn led to a GP fault.

At the same time it was found that p4_next_cntr
has broken logic and should return the counter
index only if it was not yet borrowed for
another event.

Reported-by: Jaswinder Singh Rajput <jaswinderlinux@gmail.com>
Reported-by: Lin Ming <ming.m.lin@intel.com>
Bisected-by: Lin Ming <ming.m.lin@intel.com>
Tested-by: Jaswinder Singh Rajput <jaswinderlinux@gmail.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100514190815.GG13509@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index cb875b1e2e8..424fc8de68e 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,7 +18,7 @@
 struct p4_event_bind {
 	unsigned int opcode;			/* Event code and ESCR selector */
 	unsigned int escr_msr[2];		/* ESCR MSR for this event */
-	unsigned char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on abscence */
+	char cntr[2][P4_CNTR_LIMIT];		/* counter index (offset), -1 on abscence */
 };
 
 struct p4_cache_event_bind {
@@ -747,11 +747,11 @@ static int p4_get_escr_idx(unsigned int addr)
 static int p4_next_cntr(int thread, unsigned long *used_mask,
 			struct p4_event_bind *bind)
 {
-	int i = 0, j;
+	int i, j;
 
 	for (i = 0; i < P4_CNTR_LIMIT; i++) {
-		j = bind->cntr[thread][i++];
-		if (j == -1 || !test_bit(j, used_mask))
+		j = bind->cntr[thread][i];
+		if (j != -1 && !test_bit(j, used_mask))
 			return j;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From fea24e28c663e62663097f0ed3b8ff1f9a87f15e Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 14 May 2010 14:41:20 -0700
Subject: x86, mrst: add nop functions to x86_init mpparse functions

Moorestown does not have BIOS provided MP tables, we can save some time
by avoiding scaning of these tables. e.g.
[    0.000000] Scan SMP from c0000000 for 1024 bytes.
[    0.000000] Scan SMP from c009fc00 for 1024 bytes.
[    0.000000] Scan SMP from c00f0000 for 65536 bytes.
[    0.000000] Scan SMP from c00bfff0 for 1024 bytes.

Searching EBDA with the base at 0x40E will also result in random pointer
deferencing within 1MB. This can be a problem in Lincroft if the pointer
hits VGA area and VGA mode is not enabled.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273873281-17489-8-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/mrst.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858..e796448f0eb 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -237,4 +237,9 @@ void __init x86_mrst_early_setup(void)
 	x86_init.pci.fixup_irqs = x86_init_noop;
 
 	legacy_pic = &null_legacy_pic;
+
+	/* Avoid searching for BIOS MP tables */
+	x86_init.mpparse.find_smp_config = x86_init_noop;
+	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
 }
-- 
cgit v1.2.3-70-g09d2


From fec84e330719c20d2146c8dbdc9ff50b3a1d7039 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 17 May 2010 18:43:24 +0200
Subject: x86, hpet: Add reference to chipset erratum documentation for
 disable-hpet-msi-quirk

(At the moment the "SB700 Family Product Errata" document is available
at http://support.amd.com/us/Embedded_TechDocs/46837.pdf)

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <20100517164324.GB10254@alberich.amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/quirks.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index cd2c336c8d0..e72d3fc6547 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -495,6 +495,9 @@ void force_hpet_resume(void)
 /*
  * HPET MSI on some boards (ATI SB700/SB800) has side effect on
  * floppy DMA. Disable HPET MSI on such platforms.
+ * See erratum #27 (Misinterpreted MSI Requests May Result in
+ * Corrupted LPC DMA Data) in AMD Publication #46837,
+ * "SB700 Family Product Errata", Rev. 1.0, March 2010.
  *
  * Also force the read back of the CMP register in hpet_next_event()
  * to work around the problem that the CMP register write seems to be
-- 
cgit v1.2.3-70-g09d2


From cafd66595d92591e4bd25c3904e004fc6f897e2d Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Thu, 29 Apr 2010 12:09:01 -0400
Subject: KVM: VMX: enable VMXON check with SMX enabled (Intel TXT)

Per document, for feature control MSR:

  Bit 1 enables VMXON in SMX operation. If the bit is clear, execution
        of VMXON in SMX operation causes a general-protection exception.
  Bit 2 enables VMXON outside SMX operation. If the bit is clear, execution
        of VMXON outside SMX operation causes a general-protection exception.

This patch is to enable this kind of check with SMX for VMXON in KVM.

Signed-off-by: Shane Wang <shane.wang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/msr-index.h |  5 +++--
 arch/x86/kernel/tboot.c          |  1 +
 arch/x86/kvm/vmx.c               | 32 +++++++++++++++++++++-----------
 include/linux/tboot.h            |  1 +
 4 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473acfa7f..f9324851eba 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -202,8 +202,9 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
-#define FEATURE_CONTROL_LOCKED		(1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED	(1<<2)
+#define FEATURE_CONTROL_LOCKED				(1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX	(1<<2)
 
 #define MSR_IA32_APICBASE		0x0000001b
 #define MSR_IA32_APICBASE_BSP		(1<<8)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48a..46b827778d1 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
 
 /* Global pointer to shared data; NULL means no measured launch. */
 struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
 
 /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
 #define AP_WAIT_TIMEOUT		1
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c4f3955c64e..d2a47aefdee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/moduleparam.h>
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
+#include <linux/tboot.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -1272,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED))
-	    == FEATURE_CONTROL_LOCKED;
+	if (msr & FEATURE_CONTROL_LOCKED) {
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+			&& tboot_enabled())
+			return 1;
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+			&& !tboot_enabled())
+			return 1;
+	}
+
+	return 0;
 	/* locked but not enabled */
 }
 
@@ -1282,21 +1290,23 @@ static int hardware_enable(void *garbage)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	u64 old;
+	u64 old, test_bits;
 
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (FEATURE_CONTROL_LOCKED |
-		    FEATURE_CONTROL_VMXON_ENABLED))
-	    != (FEATURE_CONTROL_LOCKED |
-		FEATURE_CONTROL_VMXON_ENABLED))
+
+	test_bits = FEATURE_CONTROL_LOCKED;
+	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+	if (tboot_enabled())
+		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+	if ((old & test_bits) != test_bits) {
 		/* enable and lock */
-		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED);
+		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+	}
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index bf2a0c74887..1dba6ee5520 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -150,6 +150,7 @@ extern int tboot_force_iommu(void);
 
 #else
 
+#define tboot_enabled()			0
 #define tboot_probe()			do { } while (0)
 #define tboot_shutdown(shutdown_type)	do { } while (0)
 #define tboot_sleep(sleep_state, pm1a_control, pm1b_control)	\
-- 
cgit v1.2.3-70-g09d2


From 424c32f1aa3112632a657d45698c8e7666668f78 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:39 -0400
Subject: x86, paravirt: Enable pvclock flags in vcpu_time_info structure

This patch removes one padding byte and transform it into a flags
field. New versions of guests using pvclock will query these flags
upon each read.

Flags, however, will only be interpreted when the guest decides to.
It uses the pvclock_valid_flags function to signal that a specific
set of flags should be taken into consideration. Which flags are valid
are usually devised via HV negotiation.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: Jeremy Fitzhardinge <jeremy@goop.org>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/pvclock-abi.h | 3 ++-
 arch/x86/include/asm/pvclock.h     | 1 +
 arch/x86/kernel/pvclock.c          | 9 +++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f262..ec5c41ac4be 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
 	u64   system_time;
 	u32   tsc_to_system_mul;
 	s8    tsc_shift;
-	u8    pad[3];
+	u8    flags;
+	u8    pad[2];
 } __attribute__((__packed__)); /* 32 bytes */
 
 struct pvclock_wall_clock {
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8c..cd02f324aa6 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761..f7fdd56bc0a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
 	u32 tsc_to_nsec_mul;
 	int tsc_shift;
 	u32 version;
+	u8  flags;
 };
 
+static u8 valid_flags __read_mostly = 0;
+
+void pvclock_set_flags(u8 flags)
+{
+	valid_flags = flags;
+}
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 		dst->system_timestamp  = src->system_time;
 		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
 		dst->tsc_shift         = src->tsc_shift;
+		dst->flags             = src->flags;
 		rmb();		/* test version after fetching data */
 	} while ((src->version & 1) || (dst->version != src->version));
 
-- 
cgit v1.2.3-70-g09d2


From 489fb490dbf8dab0249ad82b56688ae3842a79e8 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:40 -0400
Subject: x86, paravirt: Add a global synchronization point for pvclock

In recent stress tests, it was found that pvclock-based systems
could seriously warp in smp systems. Using ingo's time-warp-test.c,
I could trigger a scenario as bad as 1.5mi warps a minute in some systems.
(to be fair, it wasn't that bad in most of them). Investigating further, I
found out that such warps were caused by the very offset-based calculation
pvclock is based on.

This happens even on some machines that report constant_tsc in its tsc flags,
specially on multi-socket ones.

Two reads of the same kernel timestamp at approx the same time, will likely
have tsc timestamped in different occasions too. This means the delta we
calculate is unpredictable at best, and can probably be smaller in a cpu
that is legitimately reading clock in a forward ocasion.

Some adjustments on the host could make this window less likely to happen,
but still, it pretty much poses as an intrinsic problem of the mechanism.

A while ago, I though about using a shared variable anyway, to hold clock
last state, but gave up due to the high contention locking was likely
to introduce, possibly rendering the thing useless on big machines. I argue,
however, that locking is not necessary.

We do a read-and-return sequence in pvclock, and between read and return,
the global value can have changed. However, it can only have changed
by means of an addition of a positive value. So if we detected that our
clock timestamp is less than the current global, we know that we need to
return a higher one, even though it is not exactly the one we compared to.

OTOH, if we detect we're greater than the current time source, we atomically
replace the value with our new readings. This do causes contention on big
boxes (but big here means *BIG*), but it seems like a good trade off, since
it provide us with a time source guaranteed to be stable wrt time warps.

After this patch is applied, I don't see a single warp in time during 5 days
of execution, in any of the machines I saw them before.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
CC: Jeremy Fitzhardinge <jeremy@goop.org>
CC: Avi Kivity <avi@redhat.com>
CC: Marcelo Tosatti <mtosatti@redhat.com>
CC: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kernel/pvclock.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index f7fdd56bc0a..f5bc40e1697 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -118,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 	return pv_tsc_khz;
 }
 
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	struct pvclock_shadow_time shadow;
 	unsigned version;
 	cycle_t ret, offset;
+	u64 last;
 
 	do {
 		version = pvclock_get_time_values(&shadow, src);
@@ -132,6 +135,27 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		barrier();
 	} while (version != src->version);
 
+	/*
+	 * Assumption here is that last_value, a global accumulator, always goes
+	 * forward. If we are less than that, we should not be much smaller.
+	 * We assume there is an error marging we're inside, and then the correction
+	 * does not sacrifice accuracy.
+	 *
+	 * For reads: global may have changed between test and return,
+	 * but this means someone else updated poked the clock at a later time.
+	 * We just need to make sure we are not seeing a backwards event.
+	 *
+	 * For updates: last_value = ret is not enough, since two vcpus could be
+	 * updating at the same time, and one of them could be slightly behind,
+	 * making the assumption that last_value always go forward fail to hold.
+	 */
+	last = atomic64_read(&last_value);
+	do {
+		if (ret < last)
+			return last;
+		last = atomic64_cmpxchg(&last_value, last, ret);
+	} while (unlikely(last != ret));
+
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 838815a78785022f6611e5c48386567aea7b818b Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:44 -0400
Subject: x86: KVM guest: Try using new kvm clock msrs

We now added a new set of clock-related msrs in replacement of the old
ones. In theory, we could just try to use them and get a return value
indicating they do not exist, due to our use of kvm_write_msr_save.

However, kvm clock registration happens very early, and if we ever
try to write to a non-existant MSR, we raise a lethal #GP, since our
idt handlers are not in place yet.

So this patch tests for a cpuid feature exported by the host to
decide which set of msrs are supported.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 53 ++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4..59c740fcb9b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
 #define KVM_SCALE 22
 
 static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 
 static int parse_no_kvmclock(char *arg)
 {
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
-	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+
+	native_write_msr(msr_kvm_wall_clock, low, high);
 
 	vcpu_time = &get_cpu_var(hv_clock);
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
 	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
-	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+
+	return native_write_msr_safe(msr_kvm_system_time, low, high);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 #ifdef CONFIG_KEXEC
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_crash_shutdown(regs);
 }
 #endif
 
 static void kvm_shutdown(void)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_shutdown();
 }
 
@@ -181,27 +185,34 @@ void __init kvmclock_init(void)
 	if (!kvm_para_available())
 		return;
 
-	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
-		if (kvm_register_clock("boot clock"))
-			return;
-		pv_time_ops.sched_clock = kvm_clock_read;
-		x86_platform.calibrate_tsc = kvm_get_tsc_khz;
-		x86_platform.get_wallclock = kvm_get_wallclock;
-		x86_platform.set_wallclock = kvm_set_wallclock;
+	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+	} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+		return;
+
+	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+		msr_kvm_system_time, msr_kvm_wall_clock);
+
+	if (kvm_register_clock("boot clock"))
+		return;
+	pv_time_ops.sched_clock = kvm_clock_read;
+	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+	x86_platform.get_wallclock = kvm_get_wallclock;
+	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		x86_cpuinit.setup_percpu_clockev =
-			kvm_setup_secondary_clock;
+	x86_cpuinit.setup_percpu_clockev =
+		kvm_setup_secondary_clock;
 #endif
 #ifdef CONFIG_SMP
-		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #endif
-		machine_ops.shutdown  = kvm_shutdown;
+	machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
-		machine_ops.crash_shutdown  = kvm_crash_shutdown;
+	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
-		kvm_get_preset_lpj();
-		clocksource_register(&kvm_clock);
-		pv_info.paravirt_enabled = 1;
-		pv_info.name = "KVM";
-	}
+	kvm_get_preset_lpj();
+	clocksource_register(&kvm_clock);
+	pv_info.paravirt_enabled = 1;
+	pv_info.name = "KVM";
 }
-- 
cgit v1.2.3-70-g09d2


From 3a0d7256a6fb8c13f9fac6cd63250f97a8f0d8de Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:45 -0400
Subject: x86, paravirt: don't compute pvclock adjustments if we trust the tsc

If the HV told us we can fully trust the TSC, skip any
correction

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_para.h    | 5 +++++
 arch/x86/include/asm/pvclock-abi.h | 1 +
 arch/x86/kernel/kvmclock.c         | 3 +++
 arch/x86/kernel/pvclock.c          | 4 ++++
 4 files changed, 13 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index f019f8cb182..05eba5e9a8e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,11 @@
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
 
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT	24
+
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index ec5c41ac4be..35f2d1948ad 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -39,5 +39,6 @@ struct pvclock_wall_clock {
 	u32   nsec;
 } __attribute__((__packed__));
 
+#define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 59c740fcb9b..eb9b76c716c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -215,4 +215,7 @@ void __init kvmclock_init(void)
 	clocksource_register(&kvm_clock);
 	pv_info.paravirt_enabled = 1;
 	pv_info.name = "KVM";
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index f5bc40e1697..239427ca02a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -135,6 +135,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		barrier();
 	} while (version != src->version);
 
+	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		return ret;
+
 	/*
 	 * Assumption here is that last_value, a global accumulator, always goes
 	 * forward. If we are less than that, we should not be much smaller.
-- 
cgit v1.2.3-70-g09d2


From dcc7871128e99458ca86186b7bc8bf27ff0c47b5 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 May 2010 21:04:21 -0500
Subject: kgdb: core changes to support kdb

These are the minimum changes to the kgdb core in order to enable an
API to connect a new front end (kdb) to the debug core.

This patch introduces the dbg_kdb_mode variable controls where the
user level I/O is routed.  It will be routed to the gdbstub (kgdb) or
to the kdb front end which is a simple shell available over the kgdboc
connection.

You can switch back and forth between kdb or the gdb stub mode of
operation dynamically.  From gdb stub mode you can blindly type
"$3#33", or from the kdb mode you can enter "kgdb" to switch to the
gdb stub.

The logic in the debug core depends on kdb to look for the typical gdb
connection sequences and return immediately with KGDB_PASS_EVENT if a
gdb serial command sequence is detected.  That should allow a
reasonably seamless transition between kdb -> gdb without leaving the
kernel exception state.  The two gdb serial queries that kdb is
responsible for detecting are the "?" and "qSupported" packets.

CC: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Acked-by: Martin Hicks <mort@sgi.com>
---
 arch/arm/kernel/kgdb.c     |   5 +++
 arch/mips/kernel/kgdb.c    |   5 +++
 arch/powerpc/kernel/kgdb.c |   5 +++
 arch/x86/kernel/kgdb.c     |   5 +++
 include/linux/kgdb.h       |  11 ++++-
 kernel/debug/debug_core.c  | 107 ++++++++++++++++++++++++++++++++++++++-------
 kernel/debug/debug_core.h  |  24 ++++++++++
 kernel/debug/gdbstub.c     |  36 +++++++++++++++
 lib/Kconfig.kgdb           |   8 +++-
 9 files changed, 186 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index a5b846b9895..c868a886411 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -98,6 +98,11 @@ sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
 	gdb_regs[_CPSR]		= thread_regs->ARM_cpsr;
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->ARM_pc = pc;
+}
+
 static int compiled_break;
 
 int kgdb_arch_handle_exception(int exception_vector, int signo,
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
index 50c9bb88066..6ed4c83c869 100644
--- a/arch/mips/kernel/kgdb.c
+++ b/arch/mips/kernel/kgdb.c
@@ -180,6 +180,11 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 	*(ptr++) = regs->cp0_epc;
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->cp0_epc = pc;
+}
+
 /*
  * Calls linux_debug_hook before the kernel dies. If KGDB is enabled,
  * then try to fall into the debugger
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 41bada0298c..c81e3de1306 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -309,6 +309,11 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	       (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->nip = pc;
+}
+
 /*
  * This function does PowerPC specific procesing for interfacing to gdb.
  */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b2258ca9100..f95a2c0b915 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -690,6 +690,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
 	return instruction_pointer(regs);
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+	regs->ip = ip;
+}
+
 struct kgdb_arch arch_kgdb_ops = {
 	/* Breakpoint instruction: */
 	.gdb_bpt_instr		= { 0xcc },
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 4830142ec33..5b37df00000 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -16,10 +16,12 @@
 #include <linux/serial_8250.h>
 #include <linux/linkage.h>
 #include <linux/init.h>
-
 #include <asm/atomic.h>
+#ifdef CONFIG_HAVE_ARCH_KGDB
 #include <asm/kgdb.h>
+#endif
 
+#ifdef CONFIG_KGDB
 struct pt_regs;
 
 /**
@@ -262,6 +264,7 @@ extern struct kgdb_arch		arch_kgdb_ops;
 
 extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs);
 
+extern void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc);
 extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops);
 extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
 extern struct kgdb_io *dbg_io_ops;
@@ -279,5 +282,9 @@ extern int kgdb_nmicallback(int cpu, void *regs);
 
 extern int			kgdb_single_step;
 extern atomic_t			kgdb_active;
-
+#define in_dbg_master() \
+	(raw_smp_processor_id() == atomic_read(&kgdb_active))
+#else /* ! CONFIG_KGDB */
+#define in_dbg_master() (0)
+#endif /* ! CONFIG_KGDB */
 #endif /* _KGDB_H_ */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7e03969330b..6e1fa829fde 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -43,6 +43,7 @@
 #include <linux/sysrq.h>
 #include <linux/init.h>
 #include <linux/kgdb.h>
+#include <linux/kdb.h>
 #include <linux/pid.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
@@ -77,6 +78,11 @@ static DEFINE_SPINLOCK(kgdb_registration_lock);
 static int kgdb_con_registered;
 /* determine if kgdb console output should be used */
 static int kgdb_use_con;
+/* Next cpu to become the master debug core */
+int dbg_switch_cpu;
+
+/* Use kdb or gdbserver mode */
+static int dbg_kdb_mode = 1;
 
 static int __init opt_kgdb_con(char *str)
 {
@@ -100,6 +106,7 @@ static struct kgdb_bkpt		kgdb_break[KGDB_MAX_BREAKPOINTS] = {
  * The CPU# of the active CPU, or -1 if none:
  */
 atomic_t			kgdb_active = ATOMIC_INIT(-1);
+EXPORT_SYMBOL_GPL(kgdb_active);
 
 /*
  * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
@@ -301,7 +308,7 @@ int dbg_set_sw_break(unsigned long addr)
 	return 0;
 }
 
-static int kgdb_deactivate_sw_breakpoints(void)
+int dbg_deactivate_sw_breakpoints(void)
 {
 	unsigned long addr;
 	int error;
@@ -395,8 +402,14 @@ static int kgdb_io_ready(int print_wait)
 		return 1;
 	if (atomic_read(&kgdb_setting_breakpoint))
 		return 1;
-	if (print_wait)
+	if (print_wait) {
+#ifdef CONFIG_KGDB_KDB
+		if (!dbg_kdb_mode)
+			printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+#else
 		printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+#endif
+	}
 	return 1;
 }
 
@@ -410,7 +423,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
 	/* Panic on recursive debugger calls: */
 	exception_level++;
 	addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
-	kgdb_deactivate_sw_breakpoints();
+	dbg_deactivate_sw_breakpoints();
 
 	/*
 	 * If the break point removed ok at the place exception
@@ -443,11 +456,24 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
 	return 1;
 }
 
+static void dbg_cpu_switch(int cpu, int next_cpu)
+{
+	/* Mark the cpu we are switching away from as a slave when it
+	 * holds the kgdb_active token.  This must be done so that the
+	 * that all the cpus wait in for the debug core will not enter
+	 * again as the master. */
+	if (cpu == atomic_read(&kgdb_active)) {
+		kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
+		kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
+	}
+	kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
+}
+
 static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
 {
 	unsigned long flags;
 	int sstep_tries = 100;
-	int error = 0;
+	int error;
 	int i, cpu;
 	int trace_on = 0;
 acquirelock:
@@ -460,6 +486,8 @@ acquirelock:
 	cpu = ks->cpu;
 	kgdb_info[cpu].debuggerinfo = regs;
 	kgdb_info[cpu].task = current;
+	kgdb_info[cpu].ret_state = 0;
+	kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
 	/*
 	 * Make sure the above info reaches the primary CPU before
 	 * our cpu_in_kgdb[] flag setting does:
@@ -471,7 +499,11 @@ acquirelock:
 	 * master cpu and acquire the kgdb_active lock:
 	 */
 	while (1) {
-		if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+cpu_loop:
+		if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
+			kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
+			goto cpu_master_loop;
+		} else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
 			if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
 				break;
 		} else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
@@ -513,7 +545,7 @@ return_normal:
 	}
 
 	if (!kgdb_io_ready(1)) {
-		error = 1;
+		kgdb_info[cpu].ret_state = 1;
 		goto kgdb_restore; /* No I/O connection, resume the system */
 	}
 
@@ -548,7 +580,7 @@ return_normal:
 	 * Wait for the other CPUs to be notified and be waiting for us:
 	 */
 	for_each_online_cpu(i) {
-		while (!atomic_read(&cpu_in_kgdb[i]))
+		while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
 			cpu_relax();
 	}
 
@@ -557,7 +589,7 @@ return_normal:
 	 * in the debugger and all secondary CPUs are quiescent
 	 */
 	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
-	kgdb_deactivate_sw_breakpoints();
+	dbg_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
 	kgdb_contthread = current;
 	exception_level = 0;
@@ -565,8 +597,26 @@ return_normal:
 	if (trace_on)
 		tracing_off();
 
-	/* Talk to debugger with gdbserial protocol */
-	error = gdb_serial_stub(ks);
+	while (1) {
+cpu_master_loop:
+		if (dbg_kdb_mode) {
+			kgdb_connected = 1;
+			error = kdb_stub(ks);
+		} else {
+			error = gdb_serial_stub(ks);
+		}
+
+		if (error == DBG_PASS_EVENT) {
+			dbg_kdb_mode = !dbg_kdb_mode;
+			kgdb_connected = 0;
+		} else if (error == DBG_SWITCH_CPU_EVENT) {
+			dbg_cpu_switch(cpu, dbg_switch_cpu);
+			goto cpu_loop;
+		} else {
+			kgdb_info[cpu].ret_state = error;
+			break;
+		}
+	}
 
 	/* Call the I/O driver's post_exception routine */
 	if (dbg_io_ops->post_exception)
@@ -578,11 +628,16 @@ return_normal:
 		for (i = NR_CPUS-1; i >= 0; i--)
 			atomic_dec(&passive_cpu_wait[i]);
 		/*
-		 * Wait till all the CPUs have quit
-		 * from the debugger.
+		 * Wait till all the CPUs have quit from the debugger,
+		 * but allow a CPU that hit an exception and is
+		 * waiting to become the master to remain in the debug
+		 * core.
 		 */
 		for_each_online_cpu(i) {
-			while (atomic_read(&cpu_in_kgdb[i]))
+			while (kgdb_do_roundup &&
+			       atomic_read(&cpu_in_kgdb[i]) &&
+			       !(kgdb_info[i].exception_state &
+				 DCPU_WANT_MASTER))
 				cpu_relax();
 		}
 	}
@@ -603,7 +658,7 @@ kgdb_restore:
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 
-	return error;
+	return kgdb_info[cpu].ret_state;
 }
 
 /*
@@ -632,7 +687,8 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 		return 0; /* Ouch, double exception ! */
 	kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
 	ret = kgdb_cpu_enter(ks, regs);
-	kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
+	kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
+						DCPU_IS_SLAVE);
 	return ret;
 }
 
@@ -665,7 +721,7 @@ static void kgdb_console_write(struct console *co, const char *s,
 
 	/* If we're debugging, or KGDB has not connected, don't try
 	 * and print. */
-	if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
+	if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
 		return;
 
 	local_irq_save(flags);
@@ -687,8 +743,14 @@ static void sysrq_handle_dbg(int key, struct tty_struct *tty)
 		printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
 		return;
 	}
-	if (!kgdb_connected)
+	if (!kgdb_connected) {
+#ifdef CONFIG_KGDB_KDB
+		if (!dbg_kdb_mode)
+			printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+#else
 		printk(KERN_CRIT "Entering KGDB\n");
+#endif
+	}
 
 	kgdb_breakpoint();
 }
@@ -817,6 +879,16 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
 }
 EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
 
+int dbg_io_get_char(void)
+{
+	int ret = dbg_io_ops->read_char();
+	if (!dbg_kdb_mode)
+		return ret;
+	if (ret == 127)
+		return 8;
+	return ret;
+}
+
 /**
  * kgdb_breakpoint - generate breakpoint exception
  *
@@ -839,6 +911,7 @@ static int __init opt_kgdb_wait(char *str)
 {
 	kgdb_break_asap = 1;
 
+	kdb_init(KDB_INIT_EARLY);
 	if (kgdb_io_module_registered)
 		kgdb_initial_breakpoint();
 
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index db554f9be51..44cf3de8cf9 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -38,6 +38,8 @@ struct debuggerinfo_struct {
 	void			*debuggerinfo;
 	struct task_struct	*task;
 	int			exception_state;
+	int			ret_state;
+	int			irq_depth;
 };
 
 extern struct debuggerinfo_struct kgdb_info[];
@@ -47,9 +49,31 @@ extern int dbg_remove_all_break(void);
 extern int dbg_set_sw_break(unsigned long addr);
 extern int dbg_remove_sw_break(unsigned long addr);
 extern int dbg_activate_sw_breakpoints(void);
+extern int dbg_deactivate_sw_breakpoints(void);
+
+/* polled character access to i/o module */
+extern int dbg_io_get_char(void);
+
+/* stub return value for switching between the gdbstub and kdb */
+#define DBG_PASS_EVENT -12345
+/* Switch from one cpu to another */
+#define DBG_SWITCH_CPU_EVENT -123456
+extern int dbg_switch_cpu;
 
 /* gdbstub interface functions */
 extern int gdb_serial_stub(struct kgdb_state *ks);
 extern void gdbstub_msg_write(const char *s, int len);
 
+/* gdbstub functions used for kdb <-> gdbstub transition */
+extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
+
+#ifdef CONFIG_KGDB_KDB
+extern int kdb_stub(struct kgdb_state *ks);
+#else /* ! CONFIG_KGDB_KDB */
+static inline int kdb_stub(struct kgdb_state *ks)
+{
+	return DBG_PASS_EVENT;
+}
+#endif /* CONFIG_KGDB_KDB */
+
 #endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ccdf0929f12..188203a1965 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -887,6 +887,13 @@ int gdb_serial_stub(struct kgdb_state *ks)
 		case 'Z': /* Break point set */
 			gdb_cmd_break(ks);
 			break;
+#ifdef CONFIG_KGDB_KDB
+		case '3': /* Escape into back into kdb */
+			if (remcom_in_buffer[1] == '\0') {
+				gdb_cmd_detachkill(ks);
+				return DBG_PASS_EVENT;
+			}
+#endif
 		case 'C': /* Exception passing */
 			tmp = gdb_cmd_exception_pass(ks);
 			if (tmp > 0)
@@ -932,3 +939,32 @@ kgdb_exit:
 		error = 1;
 	return error;
 }
+
+int gdbstub_state(struct kgdb_state *ks, char *cmd)
+{
+	int error;
+
+	switch (cmd[0]) {
+	case 'e':
+		error = kgdb_arch_handle_exception(ks->ex_vector,
+						   ks->signo,
+						   ks->err_code,
+						   remcom_in_buffer,
+						   remcom_out_buffer,
+						   ks->linux_regs);
+		return error;
+	case 's':
+	case 'c':
+		strcpy(remcom_in_buffer, cmd);
+		return 0;
+	case '?':
+		gdb_cmd_status(ks);
+		break;
+	case '\0':
+		strcpy(remcom_out_buffer, "");
+		break;
+	}
+	dbg_io_ops->write_char('+');
+	put_packet(remcom_out_buffer);
+	return 0;
+}
diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb
index 9b5d1d7f2ef..78de43a5e90 100644
--- a/lib/Kconfig.kgdb
+++ b/lib/Kconfig.kgdb
@@ -3,7 +3,7 @@ config HAVE_ARCH_KGDB
 	bool
 
 menuconfig KGDB
-	bool "KGDB: kernel debugging with remote gdb"
+	bool "KGDB: kernel debugger"
 	depends on HAVE_ARCH_KGDB
 	depends on DEBUG_KERNEL && EXPERIMENTAL
 	help
@@ -57,4 +57,10 @@ config KGDB_TESTS_BOOT_STRING
 	  information about other strings you could use beyond the
 	  default of V1F100.
 
+config KGDB_KDB
+	bool "KGDB_KDB: include kdb frontend for kgdb"
+	default n
+	help
+	  KDB frontend for kernel
+
 endif # KGDB
-- 
cgit v1.2.3-70-g09d2


From 98ec1878cacb393975cba64f7392eece81716cb4 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Mon, 27 Apr 2009 10:58:06 -0500
Subject: kgdb: remove post_primary_code references

Remove all the references to the kgdb_post_primary_code.  This
function serves no useful purpose because you can obtain the same
information from the "struct kgdb_state *ks" from with in the
debugger, if for some reason you want the data.

Also remove the unintentional duplicate assignment for ks->ex_vector.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c    | 29 -----------------------------
 include/linux/kgdb.h      | 14 --------------
 kernel/debug/debug_core.c |  8 --------
 3 files changed, 51 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f95a2c0b915..acba5716993 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -47,20 +47,8 @@
 #include <asm/debugreg.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
-
 #include <asm/apic.h>
 
-/*
- * Put the error code here just in case the user cares:
- */
-static int gdb_x86errcode;
-
-/*
- * Likewise, the vector number here (since GDB only gets the signal
- * number through the usual means, and that's not very specific):
- */
-static int gdb_x86vector = -1;
-
 /**
  *	pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
  *	@gdb_regs: A pointer to hold the registers in the order GDB wants.
@@ -399,23 +387,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
 	}
 }
 
-/**
- *	kgdb_post_primary_code - Save error vector/code numbers.
- *	@regs: Original pt_regs.
- *	@e_vector: Original error vector.
- *	@err_code: Original error code.
- *
- *	This is needed on architectures which support SMP and KGDB.
- *	This function is called after all the slave cpus have been put
- *	to a know spin state and the primary CPU has control over KGDB.
- */
-void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
-{
-	/* primary processor is completely in the debugger */
-	gdb_x86vector = e_vector;
-	gdb_x86errcode = err_code;
-}
-
 #ifdef CONFIG_SMP
 /**
  *	kgdb_roundup_cpus - Get other CPUs into a holding pattern
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 407edb1e0c4..406f6f9286f 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -35,20 +35,6 @@ struct pt_regs;
  */
 extern int kgdb_skipexception(int exception, struct pt_regs *regs);
 
-/**
- *	kgdb_post_primary_code - (optional) Save error vector/code numbers.
- *	@regs: Original pt_regs.
- *	@e_vector: Original error vector.
- *	@err_code: Original error code.
- *
- *	This is usually needed on architectures which support SMP and
- *	KGDB.  This function is called after all the secondary cpus have
- *	been put to a know spin state and the primary CPU has control over
- *	KGDB.
- */
-extern void kgdb_post_primary_code(struct pt_regs *regs, int e_vector,
-				  int err_code);
-
 /**
  *	kgdb_disable_hw_debug - (optional) Disable hardware debugging hook
  *	@regs: Current &struct pt_regs.
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1aed37b4c56..88a83a22537 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -203,12 +203,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
 	return 0;
 }
 
-void __weak
-kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
-{
-	return;
-}
-
 /**
  *	kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
  *	@regs: Current &struct pt_regs.
@@ -588,7 +582,6 @@ return_normal:
 	 * At this point the primary processor is completely
 	 * in the debugger and all secondary CPUs are quiescent
 	 */
-	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
 	dbg_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
 	kgdb_contthread = current;
@@ -678,7 +671,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 	ks->cpu			= raw_smp_processor_id();
 	ks->ex_vector		= evector;
 	ks->signo		= signo;
-	ks->ex_vector		= evector;
 	ks->err_code		= ecode;
 	ks->kgdb_usethreadid	= 0;
 	ks->linux_regs		= regs;
-- 
cgit v1.2.3-70-g09d2


From f503b5ae53cb557ac351a668fcac1baab1cef0db Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 May 2010 21:04:25 -0500
Subject: x86,kgdb: Add low level debug hook

The only way the debugger can handle a trap in inside rcu_lock,
notify_die, or atomic_notifier_call_chain without a triple fault is
to have a low level "first opportunity handler" in the int3 exception
handler.

Generally this will be something the vast majority of folks will not
need, but for those who need it, it is added as a kernel .config
option called KGDB_LOW_LEVEL_TRAP.

CC: Ingo Molnar <mingo@elte.hu>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: H. Peter Anvin <hpa@zytor.com>
CC: x86@kernel.org
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/include/asm/kgdb.h |  3 +++
 arch/x86/kernel/kgdb.c      | 22 +++++++++++++++++++++-
 arch/x86/kernel/traps.c     |  6 ++++++
 include/linux/kgdb.h        |  1 +
 kernel/debug/debug_core.c   |  2 +-
 lib/Kconfig.kgdb            |  9 +++++++++
 6 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index e6c6c808489..006da3687cd 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -76,4 +76,7 @@ static inline void arch_kgdb_breakpoint(void)
 #define BREAK_INSTR_SIZE	1
 #define CACHE_FLUSH_IS_SAFE	1
 
+extern int kgdb_ll_trap(int cmd, const char *str,
+			struct pt_regs *regs, long err, int trap, int sig);
+
 #endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index acba5716993..95b89d4cb8f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -538,7 +538,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 			return NOTIFY_DONE;
 	}
 
-	if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs))
+	if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
 		return NOTIFY_DONE;
 
 	/* Must touch watchdog before return to normal operation */
@@ -546,6 +546,26 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 	return NOTIFY_STOP;
 }
 
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+int kgdb_ll_trap(int cmd, const char *str,
+		 struct pt_regs *regs, long err, int trap, int sig)
+{
+	struct die_args args = {
+		.regs	= regs,
+		.str	= str,
+		.err	= err,
+		.trapnr	= trap,
+		.signr	= sig,
+
+	};
+
+	if (!kgdb_io_module_registered)
+		return NOTIFY_DONE;
+
+	return __kgdb_notify(&args, cmd);
+}
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+
 static int
 kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
 {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 02cfb9b8f5b..7eaad4c5110 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/kgdb.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
@@ -451,6 +452,11 @@ void restart_nmi(void)
 /* May run on IST stack. */
 dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+			== NOTIFY_STOP)
+		return;
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
 #ifdef CONFIG_KPROBES
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
 			== NOTIFY_STOP)
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 406f6f9286f..19d1b29a269 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -60,6 +60,7 @@ struct uart_port;
 void kgdb_breakpoint(void);
 
 extern int kgdb_connected;
+extern int kgdb_io_module_registered;
 
 extern atomic_t			kgdb_setting_breakpoint;
 extern atomic_t			kgdb_cpu_doing_single_step;
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 88a83a22537..375e42f0baf 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -66,7 +66,7 @@ int				kgdb_connected;
 EXPORT_SYMBOL_GPL(kgdb_connected);
 
 /* All the KGDB handlers are installed */
-static int			kgdb_io_module_registered;
+int			kgdb_io_module_registered;
 
 /* Guard for recursive entry */
 static int			exception_level;
diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb
index ee8ae7132f2..c56ccb4ad29 100644
--- a/lib/Kconfig.kgdb
+++ b/lib/Kconfig.kgdb
@@ -57,6 +57,15 @@ config KGDB_TESTS_BOOT_STRING
 	  information about other strings you could use beyond the
 	  default of V1F100.
 
+config KGDB_LOW_LEVEL_TRAP
+       bool "KGDB: Allow debugging with traps in notifiers"
+       depends on X86
+       default n
+       help
+         This will add an extra call back to kgdb for the breakpoint
+         exception handler on which will will allow kgdb to step
+         through a notify handler.
+
 config KGDB_KDB
 	bool "KGDB_KDB: include kdb frontend for kgdb"
 	default n
-- 
cgit v1.2.3-70-g09d2


From 29c843912a0baa7fa63033fe28e1ca7e796686a5 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Thu, 20 May 2010 21:04:29 -0500
Subject: x86, kgdb: early trap init for early debug

Allow the x86 arch to have early exception processing for the purpose
of debugging via the kgdb.

Signed-off-by: Jan Kiszka <jan.kiszka@web.de>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/include/asm/processor.h |  2 ++
 arch/x86/kernel/setup.c          |  1 +
 arch/x86/kernel/traps.c          | 14 ++++++++++----
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5a51379dcbe..7e5c6a60b8e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -789,6 +789,8 @@ static inline void wbinvd_halt(void)
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
+extern void early_trap_init(void);
+
 /* Defined in head.S */
 extern struct desc_ptr		early_gdt_descr;
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4851eff57b..e8029896309 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -725,6 +725,7 @@ void __init setup_arch(char **cmdline_p)
 	/* VMI may relocate the fixmap; do this before touching ioremap area */
 	vmi_init();
 
+	early_trap_init();
 	early_cpu_init();
 	early_ioremap_init();
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7eaad4c5110..142d70c74b0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -808,6 +808,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 }
 #endif
 
+/* Set of traps needed for early debugging. */
+void __init early_trap_init(void)
+{
+	set_intr_gate_ist(1, &debug, DEBUG_STACK);
+	/* int3 can be called from all */
+	set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
+	set_intr_gate(14, &page_fault);
+	load_idt(&idt_descr);
+}
+
 void __init trap_init(void)
 {
 	int i;
@@ -821,10 +831,7 @@ void __init trap_init(void)
 #endif
 
 	set_intr_gate(0, &divide_error);
-	set_intr_gate_ist(1, &debug, DEBUG_STACK);
 	set_intr_gate_ist(2, &nmi, NMI_STACK);
-	/* int3 can be called from all */
-	set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
 	/* int4 can be called from all */
 	set_system_intr_gate(4, &overflow);
 	set_intr_gate(5, &bounds);
@@ -840,7 +847,6 @@ void __init trap_init(void)
 	set_intr_gate(11, &segment_not_present);
 	set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
 	set_intr_gate(13, &general_protection);
-	set_intr_gate(14, &page_fault);
 	set_intr_gate(15, &spurious_interrupt_bug);
 	set_intr_gate(16, &coprocessor_error);
 	set_intr_gate(17, &alignment_check);
-- 
cgit v1.2.3-70-g09d2


From 0b4b3827db386ec6034a5aba1261025b039440c2 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 May 2010 21:04:29 -0500
Subject: x86, kgdb, init: Add early and late debug states

The kernel debugger can operate well before mm_init(), but the x86
hardware breakpoint code which uses the perf api requires that the
kernel allocators are initialized.

This means the kernel debug core needs to provide an optional arch
specific call back to allow the initialization functions to run after
the kernel has been further initialized.

The kdb shell already had a similar restriction with an early
initialization and late initialization.  The kdb_init() was moved into
the debug core's version of the late init which is called
dbg_late_init();

CC: kgdb-bugreport@lists.sourceforge.net
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c    | 17 ++++++++++-------
 include/linux/kgdb.h      | 14 ++++++++++++++
 init/main.c               |  4 ++--
 kernel/debug/debug_core.c | 16 ++++++++++++++++
 4 files changed, 42 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 95b89d4cb8f..2b71ec41869 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -595,15 +595,16 @@ static struct notifier_block kgdb_notifier = {
  *	specific callbacks.
  */
 int kgdb_arch_init(void)
+{
+	return register_die_notifier(&kgdb_notifier);
+}
+
+void kgdb_arch_late(void)
 {
 	int i, cpu;
-	int ret;
 	struct perf_event_attr attr;
 	struct perf_event **pevent;
 
-	ret = register_die_notifier(&kgdb_notifier);
-	if (ret != 0)
-		return ret;
 	/*
 	 * Pre-allocate the hw breakpoint structions in the non-atomic
 	 * portion of kgdb because this operation requires mutexs to
@@ -615,12 +616,15 @@ int kgdb_arch_init(void)
 	attr.bp_type = HW_BREAKPOINT_W;
 	attr.disabled = 1;
 	for (i = 0; i < 4; i++) {
+		if (breakinfo[i].pev)
+			continue;
 		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
 		if (IS_ERR(breakinfo[i].pev)) {
-			printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
+			printk(KERN_ERR "kgdb: Could not allocate hw"
+			       "breakpoints\nDisabling the kernel debugger\n");
 			breakinfo[i].pev = NULL;
 			kgdb_arch_exit();
-			return -1;
+			return;
 		}
 		for_each_online_cpu(cpu) {
 			pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -631,7 +635,6 @@ int kgdb_arch_init(void)
 			}
 		}
 	}
-	return ret;
 }
 
 /**
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 6c784ab6856..9340f34d1bb 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -207,6 +207,17 @@ extern int kgdb_validate_break_address(unsigned long addr);
 extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr);
 extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle);
 
+/**
+ *	kgdb_arch_late - Perform any architecture specific initalization.
+ *
+ *	This function will handle the late initalization of any
+ *	architecture specific callbacks.  This is an optional function for
+ *	handling things like late initialization of hw breakpoints.  The
+ *	default implementation does nothing.
+ */
+extern void kgdb_arch_late(void);
+
+
 /**
  * struct kgdb_arch - Describe architecture specific values.
  * @gdb_bpt_instr: The instruction to trigger a breakpoint.
@@ -285,7 +296,10 @@ extern int			kgdb_single_step;
 extern atomic_t			kgdb_active;
 #define in_dbg_master() \
 	(raw_smp_processor_id() == atomic_read(&kgdb_active))
+extern bool dbg_is_early;
+extern void __init dbg_late_init(void);
 #else /* ! CONFIG_KGDB */
 #define in_dbg_master() (0)
+#define dbg_late_init()
 #endif /* ! CONFIG_KGDB */
 #endif /* _KGDB_H_ */
diff --git a/init/main.c b/init/main.c
index 372771333d9..22881b5e95e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -62,7 +62,7 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
-#include <linux/kdb.h>
+#include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
 #include <linux/kmemcheck.h>
@@ -676,7 +676,7 @@ asmlinkage void __init start_kernel(void)
 	buffer_init();
 	key_init();
 	security_init();
-	kdb_init(KDB_INIT_FULL);
+	dbg_late_init();
 	vfs_caches_init(totalram_pages);
 	signals_init();
 	/* rootfs populating might need page-writeback */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 64b5588c963..5cb7cd1de10 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -78,6 +78,8 @@ static DEFINE_SPINLOCK(kgdb_registration_lock);
 static int kgdb_con_registered;
 /* determine if kgdb console output should be used */
 static int kgdb_use_con;
+/* Flag for alternate operations for early debugging */
+bool dbg_is_early = true;
 /* Next cpu to become the master debug core */
 int dbg_switch_cpu;
 
@@ -777,11 +779,25 @@ static struct notifier_block kgdb_panic_event_nb = {
        .priority	= INT_MAX,
 };
 
+void __weak kgdb_arch_late(void)
+{
+}
+
+void __init dbg_late_init(void)
+{
+	dbg_is_early = false;
+	if (kgdb_io_module_registered)
+		kgdb_arch_late();
+	kdb_init(KDB_INIT_FULL);
+}
+
 static void kgdb_register_callbacks(void)
 {
 	if (!kgdb_io_module_registered) {
 		kgdb_io_module_registered = 1;
 		kgdb_arch_init();
+		if (!dbg_is_early)
+			kgdb_arch_late();
 		atomic_notifier_chain_register(&panic_notifier_list,
 					       &kgdb_panic_event_nb);
 #ifdef CONFIG_MAGIC_SYSRQ
-- 
cgit v1.2.3-70-g09d2


From 031acd8c42edd61070f81c51edc89e83147a3d0e Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 May 2010 21:04:30 -0500
Subject: x86,kgdb: Implement early hardware breakpoint debugging

It is not possible to use the hw_breakpoint.c API prior to mm_init(),
but it is possible to use hardware breakpoints with the kernel
debugger.

Prior to smp_init() it is possible to simply write to the dr registers
of the boot cpu directly.  This can be used up until the
kgdb_arch_late() is invoked, at which point the standard hw_breakpoint.c
API will get used.

CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 2b71ec41869..4f4af75b948 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -199,6 +199,8 @@ static struct hw_breakpoint {
 	struct perf_event	**pev;
 } breakinfo[4];
 
+static unsigned long early_dr7;
+
 static void kgdb_correct_hw_break(void)
 {
 	int breakno;
@@ -210,6 +212,14 @@ static void kgdb_correct_hw_break(void)
 		int cpu = raw_smp_processor_id();
 		if (!breakinfo[breakno].enabled)
 			continue;
+		if (dbg_is_early) {
+			set_debugreg(breakinfo[breakno].addr, breakno);
+			early_dr7 |= encode_dr7(breakno,
+						breakinfo[breakno].len,
+						breakinfo[breakno].type);
+			set_debugreg(early_dr7, 7);
+			continue;
+		}
 		bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
 		info = counter_arch_bp(bp);
 		if (bp->attr.disabled != 1)
@@ -224,7 +234,8 @@ static void kgdb_correct_hw_break(void)
 		if (!val)
 			bp->attr.disabled = 0;
 	}
-	hw_breakpoint_restore();
+	if (!dbg_is_early)
+		hw_breakpoint_restore();
 }
 
 static int hw_break_reserve_slot(int breakno)
@@ -233,6 +244,9 @@ static int hw_break_reserve_slot(int breakno)
 	int cnt = 0;
 	struct perf_event **pevent;
 
+	if (dbg_is_early)
+		return 0;
+
 	for_each_online_cpu(cpu) {
 		cnt++;
 		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
@@ -258,6 +272,9 @@ static int hw_break_release_slot(int breakno)
 	struct perf_event **pevent;
 	int cpu;
 
+	if (dbg_is_early)
+		return 0;
+
 	for_each_online_cpu(cpu) {
 		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
 		if (dbg_release_bp_slot(*pevent))
@@ -302,7 +319,11 @@ static void kgdb_remove_all_hw_break(void)
 		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
 		if (bp->attr.disabled == 1)
 			continue;
-		arch_uninstall_hw_breakpoint(bp);
+		if (dbg_is_early)
+			early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+						 breakinfo[i].type);
+		else
+			arch_uninstall_hw_breakpoint(bp);
 		bp->attr.disabled = 1;
 	}
 }
@@ -379,6 +400,11 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
 	for (i = 0; i < 4; i++) {
 		if (!breakinfo[i].enabled)
 			continue;
+		if (dbg_is_early) {
+			early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+						 breakinfo[i].type);
+			continue;
+		}
 		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
 		if (bp->attr.disabled == 1)
 			continue;
-- 
cgit v1.2.3-70-g09d2


From 0bb9fef9134cf4fdcfce02f9adc22d3d0725cc29 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 May 2010 21:04:30 -0500
Subject: x86,early dr regs,kgdb: Allow kernel debugger early dr register
 access

If the kernel debugger was configured, attached and started with
kgdbwait, the hardware breakpoint registers should get restored by the
kgdb code which is managing the dr registers.

CC: x86@kernel.org
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1c00d0b169..cc83a002786 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1084,6 +1084,20 @@ static void clear_all_debug_regs(void)
 	}
 }
 
+#ifdef CONFIG_KGDB
+/*
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
+ */
+static void dbg_restore_debug_regs(void)
+{
+	if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+		arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1174,18 +1188,8 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
-#ifdef CONFIG_KGDB
-	/*
-	 * If the kgdb is connected no debug regs should be altered.  This
-	 * is only applicable when KGDB and a KGDB I/O module are built
-	 * into the kernel and you are using early debugging with
-	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
-	 */
-	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
-		arch_kgdb_ops.correct_hw_break();
-	else
-#endif
-		clear_all_debug_regs();
+	clear_all_debug_regs();
+	dbg_restore_debug_regs();
 
 	fpu_init();
 
@@ -1239,6 +1243,7 @@ void __cpuinit cpu_init(void)
 #endif
 
 	clear_all_debug_regs();
+	dbg_restore_debug_regs();
 
 	/*
 	 * Force FPU initialization:
-- 
cgit v1.2.3-70-g09d2


From 61eaf539b9fb4926ed57e38f6545100c3432cf1b Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 20 May 2010 21:04:31 -0500
Subject: earlyprintk,vga,kdb: Fix \b and \r for earlyprintk=vga with kdb

Allow kdb to work properly with with earlyprintk=vga by interpreting
the backspace and carriage return output characters.  These
interpretation of these characters is used for simple line editing
provided in the kdb shell.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: x86@kernel.org
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/early_printk.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b9c830c12b4..fa99bae75ac 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n)
 				writew(0x720, VGABASE + 2*(max_xpos*j + i));
 			current_ypos = max_ypos-1;
 		}
+#ifdef CONFIG_KGDB_KDB
+		if (c == '\b') {
+			if (current_xpos > 0)
+				current_xpos--;
+		} else if (c == '\r') {
+			current_xpos = 0;
+		} else
+#endif
 		if (c == '\n') {
 			current_xpos = 0;
 			current_ypos++;
-- 
cgit v1.2.3-70-g09d2