summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kernel')
-rw-r--r--arch/powerpc/kernel/Makefile1
-rw-r--r--arch/powerpc/kernel/cputable.c31
-rw-r--r--arch/powerpc/kernel/entry_32.S6
-rw-r--r--arch/powerpc/kernel/head_fsl_booke.S187
-rw-r--r--arch/powerpc/kernel/idle.c2
-rw-r--r--arch/powerpc/kernel/iommu.c35
-rw-r--r--arch/powerpc/kernel/kgdb.c410
-rw-r--r--arch/powerpc/kernel/kprobes.c6
-rw-r--r--arch/powerpc/kernel/lparcfg.c386
-rw-r--r--arch/powerpc/kernel/pci-common.c1
-rw-r--r--arch/powerpc/kernel/process.c46
-rw-r--r--arch/powerpc/kernel/prom_init.c9
-rw-r--r--arch/powerpc/kernel/prom_parse.c44
-rw-r--r--arch/powerpc/kernel/ptrace.c72
-rw-r--r--arch/powerpc/kernel/setup_32.c16
-rw-r--r--arch/powerpc/kernel/signal.c6
-rw-r--r--arch/powerpc/kernel/stacktrace.c2
-rw-r--r--arch/powerpc/kernel/suspend.c1
-rw-r--r--arch/powerpc/kernel/sysfs.c18
-rw-r--r--arch/powerpc/kernel/traps.c16
-rw-r--r--arch/powerpc/kernel/vio.c1033
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S31
22 files changed, 2020 insertions, 339 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index bf0b1fd0ec3..1a4094704b1 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -74,6 +74,7 @@ obj-y += time.o prom.o traps.o setup-common.o \
misc_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC32) += entry_32.o setup_32.o
obj-$(CONFIG_PPC64) += dma_64.o iommu.o
+obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_PPC_MULTIPLATFORM) += prom_init.o
obj-$(CONFIG_MODULES) += ppc_ksyms.o
obj-$(CONFIG_BOOTX_TEXT) += btext.o
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index f7f3c215d06..25c273c761d 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -23,6 +23,9 @@
struct cpu_spec* cur_cpu_spec = NULL;
EXPORT_SYMBOL(cur_cpu_spec);
+/* The platform string corresponding to the real PVR */
+const char *powerpc_base_platform;
+
/* NOTE:
* Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
* the responsibility of the appropriate CPU save/restore functions to
@@ -355,6 +358,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.icache_bsize = 128,
.dcache_bsize = 128,
.machine_check = machine_check_generic,
+ .oprofile_cpu_type = "ppc64/compat-power5+",
.platform = "power5+",
},
{ /* Power6 */
@@ -386,6 +390,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.icache_bsize = 128,
.dcache_bsize = 128,
.machine_check = machine_check_generic,
+ .oprofile_cpu_type = "ppc64/compat-power6",
.platform = "power6",
},
{ /* 2.06-compliant processor, i.e. Power7 "architected" mode */
@@ -397,6 +402,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.icache_bsize = 128,
.dcache_bsize = 128,
.machine_check = machine_check_generic,
+ .oprofile_cpu_type = "ppc64/compat-power7",
.platform = "power7",
},
{ /* Power7 */
@@ -1629,9 +1635,34 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
t->cpu_setup = s->cpu_setup;
t->cpu_restore = s->cpu_restore;
t->platform = s->platform;
+ /*
+ * If we have passed through this logic once
+ * before and have pulled the default case
+ * because the real PVR was not found inside
+ * cpu_specs[], then we are possibly running in
+ * compatibility mode. In that case, let the
+ * oprofiler know which set of compatibility
+ * counters to pull from by making sure the
+ * oprofile_cpu_type string is set to that of
+ * compatibility mode. If the oprofile_cpu_type
+ * already has a value, then we are possibly
+ * overriding a real PVR with a logical one, and,
+ * in that case, keep the current value for
+ * oprofile_cpu_type.
+ */
+ if (t->oprofile_cpu_type == NULL)
+ t->oprofile_cpu_type = s->oprofile_cpu_type;
} else
*t = *s;
*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
+
+ /*
+ * Set the base platform string once; assumes
+ * we're called with real pvr first.
+ */
+ if (*PTRRELOC(&powerpc_base_platform) == NULL)
+ *PTRRELOC(&powerpc_base_platform) = t->platform;
+
#if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
/* ppc64 and booke expect identify_cpu to also call
* setup_cpu for that processor. I will consolidate
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index da52269aec1..81c8324a4a3 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -148,7 +148,7 @@ transfer_to_handler:
/* Check to see if the dbcr0 register is set up to debug. Use the
internal debug mode bit to do this. */
lwz r12,THREAD_DBCR0(r12)
- andis. r12,r12,DBCR0_IDM@h
+ andis. r12,r12,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
beq+ 3f
/* From user and task is ptraced - load up global dbcr0 */
li r12,-1 /* clear all pending debug events */
@@ -292,7 +292,7 @@ syscall_exit_cont:
/* If the process has its own DBCR0 value, load it up. The internal
debug mode bit tells us that dbcr0 should be loaded. */
lwz r0,THREAD+THREAD_DBCR0(r2)
- andis. r10,r0,DBCR0_IDM@h
+ andis. r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
bnel- load_dbcr0
#endif
#ifdef CONFIG_44x
@@ -720,7 +720,7 @@ restore_user:
/* Check whether this process has its own DBCR0 value. The internal
debug mode bit tells us that dbcr0 should be loaded. */
lwz r0,THREAD+THREAD_DBCR0(r2)
- andis. r10,r0,DBCR0_IDM@h
+ andis. r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
bnel- load_dbcr0
#endif
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index c4268500e85..3cb52fa0eda 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -151,16 +151,11 @@ skpinv: addi r6,r6,1 /* Increment */
/* Invalidate TLB0 */
li r6,0x04
tlbivax 0,r6
-#ifdef CONFIG_SMP
- tlbsync
-#endif
+ TLBSYNC
/* Invalidate TLB1 */
li r6,0x0c
tlbivax 0,r6
-#ifdef CONFIG_SMP
- tlbsync
-#endif
- msync
+ TLBSYNC
/* 3. Setup a temp mapping and jump to it */
andi. r5, r3, 0x1 /* Find an entry not used and is non-zero */
@@ -238,10 +233,7 @@ skpinv: addi r6,r6,1 /* Increment */
/* Invalidate TLB1 */
li r9,0x0c
tlbivax 0,r9
-#ifdef CONFIG_SMP
- tlbsync
-#endif
- msync
+ TLBSYNC
/* 6. Setup KERNELBASE mapping in TLB1[0] */
lis r6,0x1000 /* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */
@@ -283,10 +275,7 @@ skpinv: addi r6,r6,1 /* Increment */
/* Invalidate TLB1 */
li r9,0x0c
tlbivax 0,r9
-#ifdef CONFIG_SMP
- tlbsync
-#endif
- msync
+ TLBSYNC
/* Establish the interrupt vector offsets */
SET_IVOR(0, CriticalInput);
@@ -483,90 +472,16 @@ interrupt_base:
/* Data Storage Interrupt */
START_EXCEPTION(DataStorage)
- mtspr SPRN_SPRG0, r10 /* Save some working registers */
- mtspr SPRN_SPRG1, r11
- mtspr SPRN_SPRG4W, r12
- mtspr SPRN_SPRG5W, r13
- mfcr r11
- mtspr SPRN_SPRG7W, r11
-
- /*
- * Check if it was a store fault, if not then bail
- * because a user tried to access a kernel or
- * read-protected page. Otherwise, get the
- * offending address and handle it.
- */
- mfspr r10, SPRN_ESR
- andis. r10, r10, ESR_ST@h
- beq 2f
-
- mfspr r10, SPRN_DEAR /* Get faulting address */
-
- /* If we are faulting a kernel address, we have to use the
- * kernel page tables.
- */
- lis r11, PAGE_OFFSET@h
- cmplw 0, r10, r11
- bge 2f
-
- /* Get the PGD for the current thread */
-3:
- mfspr r11,SPRN_SPRG3
- lwz r11,PGDIR(r11)
-4:
- FIND_PTE
-
- /* Are _PAGE_USER & _PAGE_RW set & _PAGE_HWWRITE not? */
- andi. r13, r11, _PAGE_RW|_PAGE_USER|_PAGE_HWWRITE
- cmpwi 0, r13, _PAGE_RW|_PAGE_USER
- bne 2f /* Bail if not */
-
- /* Update 'changed'. */
- ori r11, r11, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
- stw r11, PTE_FLAGS_OFFSET(r12) /* Update Linux page table */
-
- /* MAS2 not updated as the entry does exist in the tlb, this
- fault taken to detect state transition (eg: COW -> DIRTY)
- */
- andi. r11, r11, _PAGE_HWEXEC
- rlwimi r11, r11, 31, 27, 27 /* SX <- _PAGE_HWEXEC */
- ori r11, r11, (MAS3_UW|MAS3_SW|MAS3_UR|MAS3_SR)@l /* set static perms */
-
- /* update search PID in MAS6, AS = 0 */
- mfspr r12, SPRN_PID0
- slwi r12, r12, 16
- mtspr SPRN_MAS6, r12
-
- /* find the TLB index that caused the fault. It has to be here. */
- tlbsx 0, r10
-
- /* only update the perm bits, assume the RPN is fine */
- mfspr r12, SPRN_MAS3
- rlwimi r12, r11, 0, 20, 31
- mtspr SPRN_MAS3,r12
- tlbwe
-
- /* Done...restore registers and get out of here. */
- mfspr r11, SPRN_SPRG7R
- mtcr r11
- mfspr r13, SPRN_SPRG5R
- mfspr r12, SPRN_SPRG4R
- mfspr r11, SPRN_SPRG1
- mfspr r10, SPRN_SPRG0
- rfi /* Force context change */
-
-2:
- /*
- * The bailout. Restore registers to pre-exception conditions
- * and call the heavyweights to help us out.
- */
- mfspr r11, SPRN_SPRG7R
- mtcr r11
- mfspr r13, SPRN_SPRG5R
- mfspr r12, SPRN_SPRG4R
- mfspr r11, SPRN_SPRG1
- mfspr r10, SPRN_SPRG0
- b data_access
+ NORMAL_EXCEPTION_PROLOG
+ mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */
+ stw r5,_ESR(r11)
+ mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */
+ andis. r10,r5,(ESR_ILK|ESR_DLK)@h
+ bne 1f
+ EXC_XFER_EE_LITE(0x0300, handle_page_fault)
+1:
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ EXC_XFER_EE_LITE(0x0300, CacheLockingException)
/* Instruction Storage Interrupt */
INSTRUCTION_STORAGE_EXCEPTION
@@ -645,15 +560,30 @@ interrupt_base:
lwz r11,PGDIR(r11)
4:
+ /* Mask of required permission bits. Note that while we
+ * do copy ESR:ST to _PAGE_RW position as trying to write
+ * to an RO page is pretty common, we don't do it with
+ * _PAGE_DIRTY. We could do it, but it's a fairly rare
+ * event so I'd rather take the overhead when it happens
+ * rather than adding an instruction here. We should measure
+ * whether the whole thing is worth it in the first place
+ * as we could avoid loading SPRN_ESR completely in the first
+ * place...
+ *
+ * TODO: Is it worth doing that mfspr & rlwimi in the first
+ * place or can we save a couple of instructions here ?
+ */
+ mfspr r12,SPRN_ESR
+ li r13,_PAGE_PRESENT|_PAGE_ACCESSED
+ rlwimi r13,r12,11,29,29
+
FIND_PTE
- andi. r13, r11, _PAGE_PRESENT /* Is the page present? */
- beq 2f /* Bail if not present */
+ andc. r13,r13,r11 /* Check permission */
+ bne 2f /* Bail if permission mismach */
#ifdef CONFIG_PTE_64BIT
lwz r13, 0(r12)
#endif
- ori r11, r11, _PAGE_ACCESSED
- stw r11, PTE_FLAGS_OFFSET(r12)
/* Jump to common tlb load */
b finish_tlb_load
@@ -667,7 +597,7 @@ interrupt_base:
mfspr r12, SPRN_SPRG4R
mfspr r11, SPRN_SPRG1
mfspr r10, SPRN_SPRG0
- b data_access
+ b DataStorage
/* Instruction TLB Error Interrupt */
/*
@@ -705,15 +635,16 @@ interrupt_base:
lwz r11,PGDIR(r11)
4:
+ /* Make up the required permissions */
+ li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_HWEXEC
+
FIND_PTE
- andi. r13, r11, _PAGE_PRESENT /* Is the page present? */
- beq 2f /* Bail if not present */
+ andc. r13,r13,r11 /* Check permission */
+ bne 2f /* Bail if permission mismach */
#ifdef CONFIG_PTE_64BIT
lwz r13, 0(r12)
#endif
- ori r11, r11, _PAGE_ACCESSED
- stw r11, PTE_FLAGS_OFFSET(r12)
/* Jump to common TLB load point */
b finish_tlb_load
@@ -768,29 +699,13 @@ interrupt_base:
* Local functions
*/
- /*
- * Data TLB exceptions will bail out to this point
- * if they can't resolve the lightweight TLB fault.
- */
-data_access:
- NORMAL_EXCEPTION_PROLOG
- mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */
- stw r5,_ESR(r11)
- mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */
- andis. r10,r5,(ESR_ILK|ESR_DLK)@h
- bne 1f
- EXC_XFER_EE_LITE(0x0300, handle_page_fault)
-1:
- addi r3,r1,STACK_FRAME_OVERHEAD
- EXC_XFER_EE_LITE(0x0300, CacheLockingException)
-
/*
-
* Both the instruction and data TLB miss get to this
* point to load the TLB.
* r10 - EA of fault
* r11 - TLB (info from Linux PTE)
- * r12, r13 - available to use
+ * r12 - available to use
+ * r13 - upper bits of PTE (if PTE_64BIT) or available to use
* CR5 - results of addr >= PAGE_OFFSET
* MAS0, MAS1 - loaded with proper value when we get here
* MAS2, MAS3 - will need additional info from Linux PTE
@@ -812,20 +727,14 @@ finish_tlb_load:
#endif
mtspr SPRN_MAS2, r12
- bge 5, 1f
-
- /* is user addr */
- andi. r12, r11, (_PAGE_USER | _PAGE_HWWRITE | _PAGE_HWEXEC)
+ li r10, (_PAGE_HWEXEC | _PAGE_PRESENT)
+ rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */
+ and r12, r11, r10
andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */
- srwi r10, r12, 1
- or r12, r12, r10 /* Copy user perms into supervisor */
- iseleq r12, 0, r12
- b 2f
-
- /* is kernel addr */
-1: rlwinm r12, r11, 31, 29, 29 /* Extract _PAGE_HWWRITE into SW */
- ori r12, r12, (MAS3_SX | MAS3_SR)
-
+ slwi r10, r12, 1
+ or r10, r10, r12
+ iseleq r12, r12, r10
+
#ifdef CONFIG_PTE_64BIT
2: rlwimi r12, r13, 24, 0, 7 /* grab RPN[32:39] */
rlwimi r12, r11, 24, 8, 19 /* grab RPN[40:51] */
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index c3cf0e8f3ac..d308a9f70f1 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -60,7 +60,7 @@ void cpu_idle(void)
set_thread_flag(TIF_POLLING_NRFLAG);
while (1) {
- tick_nohz_stop_sched_tick();
+ tick_nohz_stop_sched_tick(1);
while (!need_resched() && !cpu_should_die()) {
ppc64_runlatch_off();
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 8c68ee9e5d1..550a19399bf 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -49,6 +49,8 @@ static int novmerge = 1;
static int protect4gb = 1;
+static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
+
static inline unsigned long iommu_num_pages(unsigned long vaddr,
unsigned long slen)
{
@@ -186,10 +188,12 @@ static unsigned long iommu_range_alloc(struct device *dev,
static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
void *page, unsigned int npages,
enum dma_data_direction direction,
- unsigned long mask, unsigned int align_order)
+ unsigned long mask, unsigned int align_order,
+ struct dma_attrs *attrs)
{
unsigned long entry, flags;
dma_addr_t ret = DMA_ERROR_CODE;
+ int build_fail;
spin_lock_irqsave(&(tbl->it_lock), flags);
@@ -204,9 +208,21 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */
/* Put the TCEs in the HW table */
- ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
- direction);
+ build_fail = ppc_md.tce_build(tbl, entry, npages,
+ (unsigned long)page & IOMMU_PAGE_MASK,
+ direction, attrs);
+
+ /* ppc_md.tce_build() only returns non-zero for transient errors.
+ * Clean up the table bitmap in this case and return
+ * DMA_ERROR_CODE. For all other errors the functionality is
+ * not altered.
+ */
+ if (unlikely(build_fail)) {
+ __iommu_free(tbl, ret, npages);
+ spin_unlock_irqrestore(&(tbl->it_lock), flags);
+ return DMA_ERROR_CODE;
+ }
/* Flush/invalidate TLB caches if necessary */
if (ppc_md.tce_flush)
@@ -275,7 +291,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
dma_addr_t dma_next = 0, dma_addr;
unsigned long flags;
struct scatterlist *s, *outs, *segstart;
- int outcount, incount, i;
+ int outcount, incount, i, build_fail = 0;
unsigned int align;
unsigned long handle;
unsigned int max_seg_size;
@@ -336,7 +352,11 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
npages, entry, dma_addr);
/* Insert into HW table */
- ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction);
+ build_fail = ppc_md.tce_build(tbl, entry, npages,
+ vaddr & IOMMU_PAGE_MASK,
+ direction, attrs);
+ if(unlikely(build_fail))
+ goto failure;
/* If we are in an open segment, try merging */
if (segstart != s) {
@@ -573,7 +593,8 @@ dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl,
align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
- mask >> IOMMU_PAGE_SHIFT, align);
+ mask >> IOMMU_PAGE_SHIFT, align,
+ attrs);
if (dma_handle == DMA_ERROR_CODE) {
if (printk_ratelimit()) {
printk(KERN_INFO "iommu_alloc failed, "
@@ -642,7 +663,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
nio_pages = size >> IOMMU_PAGE_SHIFT;
io_order = get_iommu_order(size);
mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
- mask >> IOMMU_PAGE_SHIFT, io_order);
+ mask >> IOMMU_PAGE_SHIFT, io_order, NULL);
if (mapping == DMA_ERROR_CODE) {
free_pages((unsigned long)ret, order);
return NULL;
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
new file mode 100644
index 00000000000..b4fdf2f2743
--- /dev/null
+++ b/arch/powerpc/kernel/kgdb.c
@@ -0,0 +1,410 @@
+/*
+ * PowerPC backend to the KGDB stub.
+ *
+ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu)
+ * Copyright (C) 2003 Timesys Corporation.
+ * Copyright (C) 2004-2006 MontaVista Software, Inc.
+ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com)
+ * PPC32 support restored by Vitaly Wool <vwool@ru.mvista.com> and
+ * Sergei Shtylyov <sshtylyov@ru.mvista.com>
+ * Copyright (C) 2007-2008 Wind River Systems, Inc.
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program as licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/signal.h>
+#include <linux/ptrace.h>
+#include <asm/current.h>
+#include <asm/processor.h>
+#include <asm/machdep.h>
+
+/*
+ * This table contains the mapping between PowerPC hardware trap types, and
+ * signals, which are primarily what GDB understands. GDB and the kernel
+ * don't always agree on values, so we use constants taken from gdb-6.2.
+ */
+static struct hard_trap_info
+{
+ unsigned int tt; /* Trap type code for powerpc */
+ unsigned char signo; /* Signal that we map this trap into */
+} hard_trap_info[] = {
+ { 0x0100, 0x02 /* SIGINT */ }, /* system reset */
+ { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */
+ { 0x0300, 0x0b /* SIGSEGV */ }, /* data access */
+ { 0x0400, 0x0b /* SIGSEGV */ }, /* instruction access */
+ { 0x0500, 0x02 /* SIGINT */ }, /* external interrupt */
+ { 0x0600, 0x0a /* SIGBUS */ }, /* alignment */
+ { 0x0700, 0x05 /* SIGTRAP */ }, /* program check */
+ { 0x0800, 0x08 /* SIGFPE */ }, /* fp unavailable */
+ { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */
+ { 0x0c00, 0x14 /* SIGCHLD */ }, /* system call */
+#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+ { 0x2002, 0x05 /* SIGTRAP */ }, /* debug */
+#if defined(CONFIG_FSL_BOOKE)
+ { 0x2010, 0x08 /* SIGFPE */ }, /* spe unavailable */
+ { 0x2020, 0x08 /* SIGFPE */ }, /* spe unavailable */
+ { 0x2030, 0x08 /* SIGFPE */ }, /* spe fp data */
+ { 0x2040, 0x08 /* SIGFPE */ }, /* spe fp data */
+ { 0x2050, 0x08 /* SIGFPE */ }, /* spe fp round */
+ { 0x2060, 0x0e /* SIGILL */ }, /* performace monitor */
+ { 0x2900, 0x08 /* SIGFPE */ }, /* apu unavailable */
+ { 0x3100, 0x0e /* SIGALRM */ }, /* fixed interval timer */
+ { 0x3200, 0x02 /* SIGINT */ }, /* watchdog */
+#else /* ! CONFIG_FSL_BOOKE */
+ { 0x1000, 0x0e /* SIGALRM */ }, /* prog interval timer */
+ { 0x1010, 0x0e /* SIGALRM */ }, /* fixed interval timer */
+ { 0x1020, 0x02 /* SIGINT */ }, /* watchdog */
+ { 0x2010, 0x08 /* SIGFPE */ }, /* fp unavailable */
+ { 0x2020, 0x08 /* SIGFPE */ }, /* ap unavailable */
+#endif
+#else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */
+ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */
+#if defined(CONFIG_8xx)
+ { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */
+#else /* ! CONFIG_8xx */
+ { 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */
+ { 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */
+ { 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */
+#if defined(CONFIG_PPC64)
+ { 0x1200, 0x05 /* SIGILL */ }, /* system error */
+ { 0x1500, 0x04 /* SIGILL */ }, /* soft patch */
+ { 0x1600, 0x04 /* SIGILL */ }, /* maintenance */
+ { 0x1700, 0x08 /* SIGFPE */ }, /* altivec assist */
+ { 0x1800, 0x04 /* SIGILL */ }, /* thermal */
+#else /* ! CONFIG_PPC64 */
+ { 0x1400, 0x02 /* SIGINT */ }, /* SMI */
+ { 0x1600, 0x08 /* SIGFPE */ }, /* altivec assist */
+ { 0x1700, 0x04 /* SIGILL */ }, /* TAU */
+ { 0x2000, 0x05 /* SIGTRAP */ }, /* run mode */
+#endif
+#endif
+#endif
+ { 0x0000, 0x00 } /* Must be last */
+};
+
+static int computeSignal(unsigned int tt)
+{
+ struct hard_trap_info *ht;
+
+ for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+ if (ht->tt == tt)
+ return ht->signo;
+
+ return SIGHUP; /* default for things we don't know about */
+}
+
+static int kgdb_call_nmi_hook(struct pt_regs *regs)
+{
+ kgdb_nmicallback(raw_smp_processor_id(), regs);
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+void kgdb_roundup_cpus(unsigned long flags)
+{
+ smp_send_debugger_break(MSG_ALL_BUT_SELF);
+}
+#endif
+
+/* KGDB functions to use existing PowerPC64 hooks. */
+static int kgdb_debugger(struct pt_regs *regs)
+{
+ return kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
+}
+
+static int kgdb_handle_breakpoint(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return 0;
+
+ if (kgdb_handle_exception(0, SIGTRAP, 0, regs) != 0)
+ return 0;
+
+ if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
+ regs->nip += 4;
+
+ return 1;
+}
+
+static int kgdb_singlestep(struct pt_regs *regs)
+{
+ struct thread_info *thread_info, *exception_thread_info;
+
+ if (user_mode(regs))
+ return 0;
+
+ /*
+ * On Book E and perhaps other processsors, singlestep is handled on
+ * the critical exception stack. This causes current_thread_info()
+ * to fail, since it it locates the thread_info by masking off
+ * the low bits of the current stack pointer. We work around
+ * this issue by copying the thread_info from the kernel stack
+ * before calling kgdb_handle_exception, and copying it back
+ * afterwards. On most processors the copy is avoided since
+ * exception_thread_info == thread_info.
+ */
+ thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
+ exception_thread_info = current_thread_info();
+
+ if (thread_info != exception_thread_info)
+ memcpy(exception_thread_info, thread_info, sizeof *thread_info);
+
+ kgdb_handle_exception(0, SIGTRAP, 0, regs);
+
+ if (thread_info != exception_thread_info)
+ memcpy(thread_info, exception_thread_info, sizeof *thread_info);
+
+ return 1;
+}
+
+static int kgdb_iabr_match(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return 0;
+
+ if (kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs) != 0)
+ return 0;
+ return 1;
+}
+
+static int kgdb_dabr_match(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return 0;
+
+ if (kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs) != 0)
+ return 0;
+ return 1;
+}
+
+#define PACK64(ptr, src) do { *(ptr++) = (src); } while (0)
+
+#define PACK32(ptr, src) do { \
+ u32 *ptr32; \
+ ptr32 = (u32 *)ptr; \
+ *(ptr32++) = (src); \
+ ptr = (unsigned long *)ptr32; \
+ } while (0)
+
+
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ unsigned long *ptr = gdb_regs;
+ int reg;
+
+ memset(gdb_regs, 0, NUMREGBYTES);
+
+ for (reg = 0; reg < 32; reg++)
+ PACK64(ptr, regs->gpr[reg]);
+
+#ifdef CONFIG_FSL_BOOKE
+#ifdef CONFIG_SPE
+ for (reg = 0; reg < 32; reg++)
+ PACK64(ptr, current->thread.evr[reg]);
+#else
+ ptr += 32;
+#endif
+#else
+ /* fp registers not used by kernel, leave zero */
+ ptr += 32 * 8 / sizeof(long);
+#endif
+
+ PACK64(ptr, regs->nip);
+ PACK64(ptr, regs->msr);
+ PACK32(ptr, regs->ccr);
+ PACK64(ptr, regs->link);
+ PACK64(ptr, regs->ctr);
+ PACK32(ptr, regs->xer);
+
+ BUG_ON((unsigned long)ptr >
+ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
+}
+
+void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
+{
+ struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp +
+ STACK_FRAME_OVERHEAD);
+ unsigned long *ptr = gdb_regs;
+ int reg;
+
+ memset(gdb_regs, 0, NUMREGBYTES);
+
+ /* Regs GPR0-2 */
+ for (reg = 0; reg < 3; reg++)
+ PACK64(ptr, regs->gpr[reg]);
+
+ /* Regs GPR3-13 are caller saved, not in regs->gpr[] */
+ ptr += 11;
+
+ /* Regs GPR14-31 */
+ for (reg = 14; reg < 32; reg++)
+ PACK64(ptr, regs->gpr[reg]);
+
+#ifdef CONFIG_FSL_BOOKE
+#ifdef CONFIG_SPE
+ for (reg = 0; reg < 32; reg++)
+ PACK64(ptr, p->thread.evr[reg]);
+#else
+ ptr += 32;
+#endif
+#else
+ /* fp registers not used by kernel, leave zero */
+ ptr += 32 * 8 / sizeof(long);
+#endif
+
+ PACK64(ptr, regs->nip);
+ PACK64(ptr, regs->msr);
+ PACK32(ptr, regs->ccr);
+ PACK64(ptr, regs->link);
+ PACK64(ptr, regs->ctr);
+ PACK32(ptr, regs->xer);
+
+ BUG_ON((unsigned long)ptr >
+ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
+}
+
+#define UNPACK64(dest, ptr) do { dest = *(ptr++); } while (0)
+
+#define UNPACK32(dest, ptr) do { \
+ u32 *ptr32; \
+ ptr32 = (u32 *)ptr; \
+ dest = *(ptr32++); \
+ ptr = (unsigned long *)ptr32; \
+ } while (0)
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ unsigned long *ptr = gdb_regs;
+ int reg;
+#ifdef CONFIG_SPE
+ union {
+ u32 v32[2];
+ u64 v64;
+ } acc;
+#endif
+
+ for (reg = 0; reg < 32; reg++)
+ UNPACK64(regs->gpr[reg], ptr);
+
+#ifdef CONFIG_FSL_BOOKE
+#ifdef CONFIG_SPE
+ for (reg = 0; reg < 32; reg++)
+ UNPACK64(current->thread.evr[reg], ptr);
+#else
+ ptr += 32;
+#endif
+#else
+ /* fp registers not used by kernel, leave zero */
+ ptr += 32 * 8 / sizeof(int);
+#endif
+
+ UNPACK64(regs->nip, ptr);
+ UNPACK64(regs->msr, ptr);
+ UNPACK32(regs->ccr, ptr);
+ UNPACK64(regs->link, ptr);
+ UNPACK64(regs->ctr, ptr);
+ UNPACK32(regs->xer, ptr);
+
+ BUG_ON((unsigned long)ptr >
+ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
+}
+
+/*
+ * This function does PowerPC specific procesing for interfacing to gdb.
+ */
+int kgdb_arch_handle_exception(int vector, int signo, int err_code,
+ char *remcom_in_buffer, char *remcom_out_buffer,
+ struct pt_regs *linux_regs)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long addr;
+
+ switch (remcom_in_buffer[0]) {
+ /*
+ * sAA..AA Step one instruction from AA..AA
+ * This will return an error to gdb ..
+ */
+ case 's':
+ case 'c':
+ /* handle the optional parameter */
+ if (kgdb_hex2long(&ptr, &addr))
+ linux_regs->nip = addr;
+
+ atomic_set(&kgdb_cpu_doing_single_step, -1);
+ /* set the trace bit if we're stepping */
+ if (remcom_in_buffer[0] == 's') {
+#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+ mtspr(SPRN_DBCR0,
+ mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
+ linux_regs->msr |= MSR_DE;
+#else
+ linux_regs->msr |= MSR_SE;
+#endif
+ kgdb_single_step = 1;
+ if (kgdb_contthread)
+ atomic_set(&kgdb_cpu_doing_single_step,
+ raw_smp_processor_id());
+ }
+ return 0;
+ }
+
+ return -1;
+}
+
+/*
+ * Global data
+ */
+struct kgdb_arch arch_kgdb_ops = {
+ .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
+};
+
+static int kgdb_not_implemented(struct pt_regs *regs)
+{
+ return 0;
+}
+
+static void *old__debugger_ipi;
+static void *old__debugger;
+static void *old__debugger_bpt;
+static void *old__debugger_sstep;
+static void *old__debugger_iabr_match;
+static void *old__debugger_dabr_match;
+static void *old__debugger_fault_handler;
+
+int kgdb_arch_init(void)
+{
+ old__debugger_ipi = __debugger_ipi;
+ old__debugger = __debugger;
+ old__debugger_bpt = __debugger_bpt;
+ old__debugger_sstep = __debugger_sstep;
+ old__debugger_iabr_match = __debugger_iabr_match;
+ old__debugger_dabr_match = __debugger_dabr_match;
+ old__debugger_fault_handler = __debugger_fault_handler;
+
+ __debugger_ipi = kgdb_call_nmi_hook;
+ __debugger = kgdb_debugger;
+ __debugger_bpt = kgdb_handle_breakpoint;
+ __debugger_sstep = kgdb_singlestep;
+ __debugger_iabr_match = kgdb_iabr_match;
+ __debugger_dabr_match = kgdb_dabr_match;
+ __debugger_fault_handler = kgdb_not_implemented;
+
+ return 0;
+}
+
+void kgdb_arch_exit(void)
+{
+ __debugger_ipi = old__debugger_ipi;
+ __debugger = old__debugger;
+ __debugger_bpt = old__debugger_bpt;
+ __debugger_sstep = old__debugger_sstep;
+ __debugger_iabr_match = old__debugger_iabr_match;
+ __debugger_dabr_match = old__debugger_dabr_match;
+ __debugger_fault_handler = old__debugger_fault_handler;
+}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 4ba2af12545..de79915452c 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -144,7 +144,6 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
kcb->kprobe_saved_msr = regs->msr;
}
-/* Called with kretprobe_lock held */
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -312,8 +311,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
INIT_HLIST_HEAD(&empty_rp);
- spin_lock_irqsave(&kretprobe_lock, flags);
- head = kretprobe_inst_table_head(current);
+ kretprobe_hash_lock(current, &head, &flags);
/*
* It is possible to have multiple instances associated with a given
@@ -352,7 +350,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
regs->nip = orig_ret_address;
reset_current_kprobe();
- spin_unlock_irqrestore(&kretprobe_lock, flags);
+ kretprobe_hash_unlock(current, &flags);
preempt_enable_no_resched();
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 827a5726a03..9f856a0c3e3 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -34,8 +34,9 @@
#include <asm/time.h>
#include <asm/prom.h>
#include <asm/vdso_datapage.h>
+#include <asm/vio.h>
-#define MODULE_VERS "1.7"
+#define MODULE_VERS "1.8"
#define MODULE_NAME "lparcfg"
/* #define LPARCFG_DEBUG */
@@ -129,32 +130,46 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v)
/*
* Methods used to fetch LPAR data when running on a pSeries platform.
*/
-static void log_plpar_hcall_return(unsigned long rc, char *tag)
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
{
- switch(rc) {
- case 0:
- return;
- case H_HARDWARE:
- printk(KERN_INFO "plpar-hcall (%s) "
- "Hardware fault\n", tag);
- return;
- case H_FUNCTION:
- printk(KERN_INFO "plpar-hcall (%s) "
- "Function not allowed\n", tag);
- return;
- case H_AUTHORITY:
- printk(KERN_INFO "plpar-hcall (%s) "
- "Not authorized to this function\n", tag);
- return;
- case H_PARAMETER:
- printk(KERN_INFO "plpar-hcall (%s) "
- "Bad parameter(s)\n",tag);
- return;
- default:
- printk(KERN_INFO "plpar-hcall (%s) "
- "Unexpected rc(0x%lx)\n", tag, rc);
- }
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+ rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+ mpp_data->entitled_mem = retbuf[0];
+ mpp_data->mapped_mem = retbuf[1];
+
+ mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+ mpp_data->pool_num = retbuf[2] & 0xffff;
+
+ mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+ mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+ mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
+
+ mpp_data->pool_size = retbuf[4];
+ mpp_data->loan_request = retbuf[5];
+ mpp_data->backing_mem = retbuf[6];
+
+ return rc;
}
+EXPORT_SYMBOL(h_get_mpp);
+
+struct hvcall_ppp_data {
+ u64 entitlement;
+ u64 unallocated_entitlement;
+ u16 group_num;
+ u16 pool_num;
+ u8 capped;
+ u8 weight;
+ u8 unallocated_weight;
+ u16 active_procs_in_pool;
+ u16 active_system_procs;
+};
/*
* H_GET_PPP hcall returns info in 4 parms.
@@ -176,27 +191,30 @@ static void log_plpar_hcall_return(unsigned long rc, char *tag)
* XXXX - Active processors in Physical Processor Pool.
* XXXX - Processors active on platform.
*/
-static unsigned int h_get_ppp(unsigned long *entitled,
- unsigned long *unallocated,
- unsigned long *aggregation,
- unsigned long *resource)
+static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
{
unsigned long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall(H_GET_PPP, retbuf);
- *entitled = retbuf[0];
- *unallocated = retbuf[1];
- *aggregation = retbuf[2];
- *resource = retbuf[3];
+ ppp_data->entitlement = retbuf[0];
+ ppp_data->unallocated_entitlement = retbuf[1];
+
+ ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+ ppp_data->pool_num = retbuf[2] & 0xffff;
- log_plpar_hcall_return(rc, "H_GET_PPP");
+ ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
+ ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
+ ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
+ ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
+ ppp_data->active_system_procs = retbuf[3] & 0xffff;
return rc;
}
-static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
+static unsigned h_pic(unsigned long *pool_idle_time,
+ unsigned long *num_procs)
{
unsigned long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
@@ -206,8 +224,87 @@ static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
*pool_idle_time = retbuf[0];
*num_procs = retbuf[1];
- if (rc != H_AUTHORITY)
- log_plpar_hcall_return(rc, "H_PIC");
+ return rc;
+}
+
+/*
+ * parse_ppp_data
+ * Parse out the data returned from h_get_ppp and h_pic
+ */
+static void parse_ppp_data(struct seq_file *m)
+{
+ struct hvcall_ppp_data ppp_data;
+ int rc;
+
+ rc = h_get_ppp(&ppp_data);
+ if (rc)
+ return;
+
+ seq_printf(m, "partition_entitled_capacity=%ld\n",
+ ppp_data.entitlement);
+ seq_printf(m, "group=%d\n", ppp_data.group_num);
+ seq_printf(m, "system_active_processors=%d\n",
+ ppp_data.active_system_procs);
+
+ /* pool related entries are apropriate for shared configs */
+ if (lppaca[0].shared_proc) {
+ unsigned long pool_idle_time, pool_procs;
+
+ seq_printf(m, "pool=%d\n", ppp_data.pool_num);
+
+ /* report pool_capacity in percentage */
+ seq_printf(m, "pool_capacity=%d\n",
+ ppp_data.active_procs_in_pool * 100);
+
+ h_pic(&pool_idle_time, &pool_procs);
+ seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+ seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+ }
+
+ seq_printf(m, "unallocated_capacity_weight=%d\n",
+ ppp_data.unallocated_weight);
+ seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
+ seq_printf(m, "capped=%d\n", ppp_data.capped);
+ seq_printf(m, "unallocated_capacity=%ld\n",
+ ppp_data.unallocated_entitlement);
+}
+
+/**
+ * parse_mpp_data
+ * Parse out data returned from h_get_mpp
+ */
+static void parse_mpp_data(struct seq_file *m)
+{
+ struct hvcall_mpp_data mpp_data;
+ int rc;
+
+ rc = h_get_mpp(&mpp_data);
+ if (rc)
+ return;
+
+ seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
+
+ if (mpp_data.mapped_mem != -1)
+ seq_printf(m, "mapped_entitled_memory=%ld\n",
+ mpp_data.mapped_mem);
+
+ seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
+ seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
+
+ seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
+ seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
+ mpp_data.unallocated_mem_weight);
+ seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
+ mpp_data.unallocated_entitlement);
+
+ if (mpp_data.pool_size != -1)
+ seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
+ mpp_data.pool_size);
+
+ seq_printf(m, "entitled_memory_loan_request=%ld\n",
+ mpp_data.loan_request);
+
+ seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
}
#define SPLPAR_CHARACTERISTICS_TOKEN 20
@@ -313,6 +410,25 @@ static int lparcfg_count_active_processors(void)
return count;
}
+static void pseries_cmo_data(struct seq_file *m)
+{
+ int cpu;
+ unsigned long cmo_faults = 0;
+ unsigned long cmo_fault_time = 0;
+
+ if (!firmware_has_feature(FW_FEATURE_CMO))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ cmo_faults += lppaca[cpu].cmo_faults;
+ cmo_fault_time += lppaca[cpu].cmo_fault_time;
+ }
+
+ seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
+ seq_printf(m, "cmo_fault_time_usec=%lu\n",
+ cmo_fault_time / tb_ticks_per_usec);
+}
+
static int pseries_lparcfg_data(struct seq_file *m, void *v)
{
int partition_potential_processors;
@@ -334,60 +450,13 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
partition_active_processors = lparcfg_count_active_processors();
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
- unsigned long h_entitled, h_unallocated;
- unsigned long h_aggregation, h_resource;
- unsigned long pool_idle_time, pool_procs;
- unsigned long purr;
-
- h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
- &h_resource);
-
- seq_printf(m, "R4=0x%lx\n", h_entitled);
- seq_printf(m, "R5=0x%lx\n", h_unallocated);
- seq_printf(m, "R6=0x%lx\n", h_aggregation);
- seq_printf(m, "R7=0x%lx\n", h_resource);
-
- purr = get_purr();
-
/* this call handles the ibm,get-system-parameter contents */
parse_system_parameter_string(m);
+ parse_ppp_data(m);
+ parse_mpp_data(m);
+ pseries_cmo_data(m);
- seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
-
- seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
-
- seq_printf(m, "system_active_processors=%ld\n",
- (h_resource >> 0 * 8) & 0xffff);
-
- /* pool related entries are apropriate for shared configs */
- if (lppaca[0].shared_proc) {
-
- h_pic(&pool_idle_time, &pool_procs);
-
- seq_printf(m, "pool=%ld\n",
- (h_aggregation >> 0 * 8) & 0xffff);
-
- /* report pool_capacity in percentage */
- seq_printf(m, "pool_capacity=%ld\n",
- ((h_resource >> 2 * 8) & 0xffff) * 100);
-
- seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
-
- seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
- }
-
- seq_printf(m, "unallocated_capacity_weight=%ld\n",
- (h_resource >> 4 * 8) & 0xFF);
-
- seq_printf(m, "capacity_weight=%ld\n",
- (h_resource >> 5 * 8) & 0xFF);
-
- seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
-
- seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
-
- seq_printf(m, "purr=%ld\n", purr);
-
+ seq_printf(m, "purr=%ld\n", get_purr());
} else { /* non SPLPAR case */
seq_printf(m, "system_active_processors=%d\n",
@@ -414,6 +483,83 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
return 0;
}
+static ssize_t update_ppp(u64 *entitlement, u8 *weight)
+{
+ struct hvcall_ppp_data ppp_data;
+ u8 new_weight;
+ u64 new_entitled;
+ ssize_t retval;
+
+ /* Get our current parameters */
+ retval = h_get_ppp(&ppp_data);
+ if (retval)
+ return retval;
+
+ if (entitlement) {
+ new_weight = ppp_data.weight;
+ new_entitled = *entitlement;
+ } else if (weight) {
+ new_weight = *weight;
+ new_entitled = ppp_data.entitlement;
+ } else
+ return -EINVAL;
+
+ pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+ __FUNCTION__, ppp_data.entitlement, ppp_data.weight);
+
+ pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+ __FUNCTION__, new_entitled, new_weight);
+
+ retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
+ return retval;
+}
+
+/**
+ * update_mpp
+ *
+ * Update the memory entitlement and weight for the partition. Caller must
+ * specify either a new entitlement or weight, not both, to be updated
+ * since the h_set_mpp call takes both entitlement and weight as parameters.
+ */
+static ssize_t update_mpp(u64 *entitlement, u8 *weight)
+{
+ struct hvcall_mpp_data mpp_data;
+ u64 new_entitled;
+ u8 new_weight;
+ ssize_t rc;
+
+ if (entitlement) {
+ /* Check with vio to ensure the new memory entitlement
+ * can be handled.
+ */
+ rc = vio_cmo_entitlement_update(*entitlement);
+ if (rc)
+ return rc;
+ }
+
+ rc = h_get_mpp(&mpp_data);
+ if (rc)
+ return rc;
+
+ if (entitlement) {
+ new_weight = mpp_data.mem_weight;
+ new_entitled = *entitlement;
+ } else if (weight) {
+ new_weight = *weight;
+ new_entitled = mpp_data.entitled_mem;
+ } else
+ return -EINVAL;
+
+ pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+ __FUNCTION__, mpp_data.entitled_mem, mpp_data.mem_weight);
+
+ pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+ __FUNCTION__, new_entitled, new_weight);
+
+ rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
+ return rc;
+}
+
/*
* Interface for changing system parameters (variable capacity weight
* and entitled capacity). Format of input is "param_name=value";
@@ -427,35 +573,27 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
static ssize_t lparcfg_write(struct file *file, const char __user * buf,
size_t count, loff_t * off)
{
- char *kbuf;
+ int kbuf_sz = 64;
+ char kbuf[kbuf_sz];
char *tmp;
u64 new_entitled, *new_entitled_ptr = &new_entitled;
u8 new_weight, *new_weight_ptr = &new_weight;
-
- unsigned long current_entitled; /* parameters for h_get_ppp */
- unsigned long dummy;
- unsigned long resource;
- u8 current_weight;
-
- ssize_t retval = -ENOMEM;
+ ssize_t retval;
if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
firmware_has_feature(FW_FEATURE_ISERIES))
return -EINVAL;
- kbuf = kmalloc(count, GFP_KERNEL);
- if (!kbuf)
- goto out;
+ if (count > kbuf_sz)
+ return -EINVAL;
- retval = -EFAULT;
if (copy_from_user(kbuf, buf, count))
- goto out;
+ return -EFAULT;
- retval = -EINVAL;
kbuf[count - 1] = '\0';
tmp = strchr(kbuf, '=');
if (!tmp)
- goto out;
+ return -EINVAL;
*tmp++ = '\0';
@@ -463,34 +601,32 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
char *endp;
*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
if (endp == tmp)
- goto out;
- new_weight_ptr = &current_weight;
+ return -EINVAL;
+
+ retval = update_ppp(new_entitled_ptr, NULL);
} else if (!strcmp(kbuf, "capacity_weight")) {
char *endp;
*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
if (endp == tmp)
- goto out;
- new_entitled_ptr = &current_entitled;
- } else
- goto out;
-
- /* Get our current parameters */
- retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
- if (retval) {
- retval = -EIO;
- goto out;
- }
-
- current_weight = (resource >> 5 * 8) & 0xFF;
+ return -EINVAL;
- pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
- __func__, current_entitled, current_weight);
+ retval = update_ppp(NULL, new_weight_ptr);
+ } else if (!strcmp(kbuf, "entitled_memory")) {
+ char *endp;
+ *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+ if (endp == tmp)
+ return -EINVAL;
- pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
- __func__, *new_entitled_ptr, *new_weight_ptr);
+ retval = update_mpp(new_entitled_ptr, NULL);
+ } else if (!strcmp(kbuf, "entitled_memory_weight")) {
+ char *endp;
+ *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+ if (endp == tmp)
+ return -EINVAL;
- retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr,
- *new_weight_ptr);
+ retval = update_mpp(NULL, new_weight_ptr);
+ } else
+ return -EINVAL;
if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
retval = count;
@@ -506,8 +642,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
retval = -EIO;
}
-out:
- kfree(kbuf);
return retval;
}
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 063cdd41304..224e9a11765 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -598,6 +598,7 @@ void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose,
res->start = pci_addr;
break;
case 2: /* PCI Memory space */
+ case 3: /* PCI 64 bits Memory space */
printk(KERN_INFO
" MEM 0x%016llx..0x%016llx -> 0x%016llx %s\n",
cpu_addr, cpu_addr + size - 1, pci_addr,
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 219f3634115..db2497ccc11 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -47,6 +47,8 @@
#ifdef CONFIG_PPC64
#include <asm/firmware.h>
#endif
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
extern unsigned long _get_SP(void);
@@ -239,6 +241,35 @@ void discard_lazy_cpu_state(void)
}
#endif /* CONFIG_SMP */
+void do_dabr(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ siginfo_t info;
+
+ if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+ 11, SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ if (debugger_dabr_match(regs))
+ return;
+
+ /* Clear the DAC and struct entries. One shot trigger */
+#if (defined(CONFIG_44x) || defined(CONFIG_BOOKE))
+ mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | DBSR_DAC1W
+ | DBCR0_IDM));
+#endif
+
+ /* Clear the DABR */
+ set_dabr(0);
+
+ /* Deliver the signal to userspace */
+ info.si_signo = SIGTRAP;
+ info.si_errno = 0;
+ info.si_code = TRAP_HWBKPT;
+ info.si_addr = (void __user *)address;
+ force_sig_info(SIGTRAP, &info, current);
+}
+
static DEFINE_PER_CPU(unsigned long, current_dabr);
int set_dabr(unsigned long dabr)
@@ -254,6 +285,11 @@ int set_dabr(unsigned long dabr)
#if defined(CONFIG_PPC64) || defined(CONFIG_6xx)
mtspr(SPRN_DABR, dabr);
#endif
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+ mtspr(SPRN_DAC1, dabr);
+#endif
+
return 0;
}
@@ -337,6 +373,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
set_dabr(new->thread.dabr);
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+ /* If new thread DAC (HW breakpoint) is the same then leave it */
+ if (new->thread.dabr)
+ set_dabr(new->thread.dabr);
+#endif
+
new_thread = &new->thread;
old_thread = &current->thread;
@@ -525,6 +567,10 @@ void flush_thread(void)
if (current->thread.dabr) {
current->thread.dabr = 0;
set_dabr(0);
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+ current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W);
+#endif
}
}
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1ea8c8d3ce8..c4ab2195b9c 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -643,6 +643,11 @@ static void __init early_cmdline_parse(void)
#else
#define OV5_MSI 0x00
#endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO 0x80 /* Cooperative Memory Overcommitment */
+#else
+#define OV5_CMO 0x00
+#endif
/*
* The architecture vector has an array of PVR mask/value pairs,
@@ -687,10 +692,12 @@ static unsigned char ibm_architecture_vec[] = {
0, /* don't halt */
/* option vector 5: PAPR/OF options */
- 3 - 2, /* length */
+ 5 - 2, /* length */
0, /* don't ignore, don't halt */
OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY |
OV5_DONATE_DEDICATE_CPU | OV5_MSI,
+ 0,
+ OV5_CMO,
};
/* Old method - ELF header with PT_NOTE sections */
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index 90eb3a3e383..bc1fb27368a 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -128,12 +128,35 @@ static void of_bus_pci_count_cells(struct device_node *np,
*sizec = 2;
}
+static unsigned int of_bus_pci_get_flags(const u32 *addr)
+{
+ unsigned int flags = 0;
+ u32 w = addr[0];
+
+ switch((w >> 24) & 0x03) {
+ case 0x01:
+ flags |= IORESOURCE_IO;
+ break;
+ case 0x02: /* 32 bits */
+ case 0x03: /* 64 bits */
+ flags |= IORESOURCE_MEM;
+ break;
+ }
+ if (w & 0x40000000)
+ flags |= IORESOURCE_PREFETCH;
+ return flags;
+}
+
static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna)
{
u64 cp, s, da;
+ unsigned int af, rf;
+
+ af = of_bus_pci_get_flags(addr);
+ rf = of_bus_pci_get_flags(range);
/* Check address type match */
- if ((addr[0] ^ range[0]) & 0x03000000)
+ if ((af ^ rf) & (IORESOURCE_MEM | IORESOURCE_IO))
return OF_BAD_ADDR;
/* Read address values, skipping high cell */
@@ -153,25 +176,6 @@ static int of_bus_pci_translate(u32 *addr, u64 offset, int na)
return of_bus_default_translate(addr + 1, offset, na - 1);
}
-static unsigned int of_bus_pci_get_flags(const u32 *addr)
-{
- unsigned int flags = 0;
- u32 w = addr[0];
-
- switch((w >> 24) & 0x03) {
- case 0x01:
- flags |= IORESOURCE_IO;
- break;
- case 0x02: /* 32 bits */
- case 0x03: /* 64 bits */
- flags |= IORESOURCE_MEM;
- break;
- }
- if (w & 0x40000000)
- flags |= IORESOURCE_PREFETCH;
- return flags;
-}
-
const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size,
unsigned int *flags)
{
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 8feb93e7890..a5d0e78779c 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -703,7 +703,7 @@ void user_enable_single_step(struct task_struct *task)
if (regs != NULL) {
#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
- task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC;
+ task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
regs->msr |= MSR_DE;
#else
regs->msr |= MSR_SE;
@@ -716,9 +716,16 @@ void user_disable_single_step(struct task_struct *task)
{
struct pt_regs *regs = task->thread.regs;
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+ /* If DAC then do not single step, skip */
+ if (task->thread.dabr)
+ return;
+#endif
+
if (regs != NULL) {
#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
- task->thread.dbcr0 = 0;
+ task->thread.dbcr0 &= ~(DBCR0_IC | DBCR0_IDM);
regs->msr &= ~MSR_DE;
#else
regs->msr &= ~MSR_SE;
@@ -727,22 +734,75 @@ void user_disable_single_step(struct task_struct *task)
clear_tsk_thread_flag(task, TIF_SINGLESTEP);
}
-static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
+int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
unsigned long data)
{
- /* We only support one DABR and no IABRS at the moment */
+ /* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
+ * For embedded processors we support one DAC and no IAC's at the
+ * moment.
+ */
if (addr > 0)
return -EINVAL;
- /* The bottom 3 bits are flags */
if ((data & ~0x7UL) >= TASK_SIZE)
return -EIO;
- /* Ensure translation is on */
+#ifdef CONFIG_PPC64
+
+ /* For processors using DABR (i.e. 970), the bottom 3 bits are flags.
+ * It was assumed, on previous implementations, that 3 bits were
+ * passed together with the data address, fitting the design of the
+ * DABR register, as follows:
+ *
+ * bit 0: Read flag
+ * bit 1: Write flag
+ * bit 2: Breakpoint translation
+ *
+ * Thus, we use them here as so.
+ */
+
+ /* Ensure breakpoint translation bit is set */
if (data && !(data & DABR_TRANSLATION))
return -EIO;
+ /* Move contents to the DABR register */
task->thread.dabr = data;
+
+#endif
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+
+ /* As described above, it was assumed 3 bits were passed with the data
+ * address, but we will assume only the mode bits will be passed
+ * as to not cause alignment restrictions for DAC-based processors.
+ */
+
+ /* DAC's hold the whole address without any mode flags */
+ task->thread.dabr = data & ~0x3UL;
+
+ if (task->thread.dabr == 0) {
+ task->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | DBCR0_IDM);
+ task->thread.regs->msr &= ~MSR_DE;
+ return 0;
+ }
+
+ /* Read or Write bits must be set */
+
+ if (!(data & 0x3UL))
+ return -EINVAL;
+
+ /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
+ register */
+ task->thread.dbcr0 = DBCR0_IDM;
+
+ /* Check for write and read flags and set DBCR0
+ accordingly */
+ if (data & 0x1UL)
+ task->thread.dbcr0 |= DBSR_DAC1R;
+ if (data & 0x2UL)
+ task->thread.dbcr0 |= DBSR_DAC1W;
+
+ task->thread.regs->msr |= MSR_DE;
+#endif
return 0;
}
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 4efebe88e64..066e65c59b5 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -43,10 +43,6 @@
#define DBG(fmt...)
-#if defined CONFIG_KGDB
-#include <asm/kgdb.h>
-#endif
-
extern void bootx_init(unsigned long r4, unsigned long phys);
int boot_cpuid;
@@ -302,18 +298,6 @@ void __init setup_arch(char **cmdline_p)
xmon_setup();
-#if defined(CONFIG_KGDB)
- if (ppc_md.kgdb_map_scc)
- ppc_md.kgdb_map_scc();
- set_debug_traps();
- if (strstr(cmd_line, "gdb")) {
- if (ppc_md.progress)
- ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000);
- printk("kgdb breakpoint activated\n");
- breakpoint();
- }
-#endif
-
/*
* Set cache line size based on type of cpu as a default.
* Systems with OF can look in the properties on the cpu node(s)
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index ad55488939c..7aada783ec6 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -145,8 +145,12 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
* user space. The DABR will have been cleared if it
* triggered inside the kernel.
*/
- if (current->thread.dabr)
+ if (current->thread.dabr) {
set_dabr(current->thread.dabr);
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+ mtspr(SPRN_DBCR0, current->thread.dbcr0);
+#endif
+ }
if (is32) {
if (ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 071bee3ec74..f2589645870 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -59,6 +59,6 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- save_context_stack(trace, tsk->thread.regs->gpr[1], tsk, 0);
+ save_context_stack(trace, tsk->thread.ksp, tsk, 0);
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index 8cee5710754..6fc6328dc62 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -7,6 +7,7 @@
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
+#include <linux/mm.h>
#include <asm/page.h>
/* References to section boundaries */
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c8127f832df..800e5e9a087 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -28,7 +28,9 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
/* Time in microseconds we delay before sleeping in the idle loop */
DEFINE_PER_CPU(unsigned long, smt_snooze_delay) = { 100 };
-static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
+static ssize_t store_smt_snooze_delay(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf,
size_t count)
{
struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -44,7 +46,9 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
return count;
}
-static ssize_t show_smt_snooze_delay(struct sys_device *dev, char *buf)
+static ssize_t show_smt_snooze_delay(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ char *buf)
{
struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -152,14 +156,17 @@ static unsigned long write_##NAME(unsigned long val) \
mtspr(ADDRESS, val); \
return 0; \
} \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+ char *buf) \
{ \
struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
unsigned long val = run_on_cpu(cpu->sysdev.id, read_##NAME, 0); \
return sprintf(buf, "%lx\n", val); \
} \
static ssize_t __used \
- store_##NAME(struct sys_device *dev, const char *buf, size_t count) \
+ store_##NAME(struct sys_device *dev, struct sysdev_attribute *attr, \
+ const char *buf, size_t count) \
{ \
struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
unsigned long val; \
@@ -522,7 +529,8 @@ static void register_nodes(void)
#endif
/* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev, char *buf)
+static ssize_t show_physical_id(struct sys_device *dev,
+ struct sysdev_attribute *attr, char *buf)
{
struct cpu *cpu = container_of(dev, struct cpu, sysdev);
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 878fbddb6ae..81ccb8dd1a5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1067,6 +1067,22 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
}
_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+ } else if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
+ regs->msr &= ~MSR_DE;
+
+ if (user_mode(regs)) {
+ current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W |
+ DBCR0_IDM);
+ } else {
+ /* Disable DAC interupts */
+ mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R |
+ DBSR_DAC1W | DBCR0_IDM));
+
+ /* Clear the DAC event */
+ mtspr(SPRN_DBSR, (DBSR_DAC1R | DBSR_DAC1W));
+ }
+ /* Setup and send the trap to the handler */
+ do_dabr(regs, mfspr(SPRN_DAC1), debug_status);
}
}
#endif /* CONFIG_4xx || CONFIG_BOOKE */
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b77f8af7ddd..ade8aeaa2e7 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1,11 +1,12 @@
/*
* IBM PowerPC Virtual I/O Infrastructure Support.
*
- * Copyright (c) 2003-2005 IBM Corp.
+ * Copyright (c) 2003,2008 IBM Corp.
* Dave Engebretsen engebret@us.ibm.com
* Santiago Leon santil@us.ibm.com
* Hollis Blanchard <hollisb@us.ibm.com>
* Stephen Rothwell
+ * Robert Jennings <rcjenn@us.ibm.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -46,6 +47,996 @@ static struct vio_dev vio_bus_device = { /* fake "parent" device */
.dev.bus = &vio_bus_type,
};
+#ifdef CONFIG_PPC_SMLPAR
+/**
+ * vio_cmo_pool - A pool of IO memory for CMO use
+ *
+ * @size: The size of the pool in bytes
+ * @free: The amount of free memory in the pool
+ */
+struct vio_cmo_pool {
+ size_t size;
+ size_t free;
+};
+
+/* How many ms to delay queued balance work */
+#define VIO_CMO_BALANCE_DELAY 100
+
+/* Portion out IO memory to CMO devices by this chunk size */
+#define VIO_CMO_BALANCE_CHUNK 131072
+
+/**
+ * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
+ *
+ * @vio_dev: struct vio_dev pointer
+ * @list: pointer to other devices on bus that are being tracked
+ */
+struct vio_cmo_dev_entry {
+ struct vio_dev *viodev;
+ struct list_head list;
+};
+
+/**
+ * vio_cmo - VIO bus accounting structure for CMO entitlement
+ *
+ * @lock: spinlock for entire structure
+ * @balance_q: work queue for balancing system entitlement
+ * @device_list: list of CMO-enabled devices requiring entitlement
+ * @entitled: total system entitlement in bytes
+ * @reserve: pool of memory from which devices reserve entitlement, incl. spare
+ * @excess: pool of excess entitlement not needed for device reserves or spare
+ * @spare: IO memory for device hotplug functionality
+ * @min: minimum necessary for system operation
+ * @desired: desired memory for system operation
+ * @curr: bytes currently allocated
+ * @high: high water mark for IO data usage
+ */
+struct vio_cmo {
+ spinlock_t lock;
+ struct delayed_work balance_q;
+ struct list_head device_list;
+ size_t entitled;
+ struct vio_cmo_pool reserve;
+ struct vio_cmo_pool excess;
+ size_t spare;
+ size_t min;
+ size_t desired;
+ size_t curr;
+ size_t high;
+} vio_cmo;
+
+/**
+ * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows
+ */
+static int vio_cmo_num_OF_devs(void)
+{
+ struct device_node *node_vroot;
+ int count = 0;
+
+ /*
+ * Count the number of vdevice entries with an
+ * ibm,my-dma-window OF property
+ */
+ node_vroot = of_find_node_by_name(NULL, "vdevice");
+ if (node_vroot) {
+ struct device_node *of_node;
+ struct property *prop;
+
+ for_each_child_of_node(node_vroot, of_node) {
+ prop = of_find_property(of_node, "ibm,my-dma-window",
+ NULL);
+ if (prop)
+ count++;
+ }
+ }
+ of_node_put(node_vroot);
+ return count;
+}
+
+/**
+ * vio_cmo_alloc - allocate IO memory for CMO-enable devices
+ *
+ * @viodev: VIO device requesting IO memory
+ * @size: size of allocation requested
+ *
+ * Allocations come from memory reserved for the devices and any excess
+ * IO memory available to all devices. The spare pool used to service
+ * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
+ * made available.
+ *
+ * Return codes:
+ * 0 for successful allocation and -ENOMEM for a failure
+ */
+static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
+{
+ unsigned long flags;
+ size_t reserve_free = 0;
+ size_t excess_free = 0;
+ int ret = -ENOMEM;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+
+ /* Determine the amount of free entitlement available in reserve */
+ if (viodev->cmo.entitled > viodev->cmo.allocated)
+ reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
+
+ /* If spare is not fulfilled, the excess pool can not be used. */
+ if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
+ excess_free = vio_cmo.excess.free;
+
+ /* The request can be satisfied */
+ if ((reserve_free + excess_free) >= size) {
+ vio_cmo.curr += size;
+ if (vio_cmo.curr > vio_cmo.high)
+ vio_cmo.high = vio_cmo.curr;
+ viodev->cmo.allocated += size;
+ size -= min(reserve_free, size);
+ vio_cmo.excess.free -= size;
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return ret;
+}
+
+/**
+ * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices
+ * @viodev: VIO device freeing IO memory
+ * @size: size of deallocation
+ *
+ * IO memory is freed by the device back to the correct memory pools.
+ * The spare pool is replenished first from either memory pool, then
+ * the reserve pool is used to reduce device entitlement, the excess
+ * pool is used to increase the reserve pool toward the desired entitlement
+ * target, and then the remaining memory is returned to the pools.
+ *
+ */
+static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
+{
+ unsigned long flags;
+ size_t spare_needed = 0;
+ size_t excess_freed = 0;
+ size_t reserve_freed = size;
+ size_t tmp;
+ int balance = 0;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ vio_cmo.curr -= size;
+
+ /* Amount of memory freed from the excess pool */
+ if (viodev->cmo.allocated > viodev->cmo.entitled) {
+ excess_freed = min(reserve_freed, (viodev->cmo.allocated -
+ viodev->cmo.entitled));
+ reserve_freed -= excess_freed;
+ }
+
+ /* Remove allocation from device */
+ viodev->cmo.allocated -= (reserve_freed + excess_freed);
+
+ /* Spare is a subset of the reserve pool, replenish it first. */
+ spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
+
+ /*
+ * Replenish the spare in the reserve pool from the excess pool.
+ * This moves entitlement into the reserve pool.
+ */
+ if (spare_needed && excess_freed) {
+ tmp = min(excess_freed, spare_needed);
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.reserve.size += tmp;
+ vio_cmo.spare += tmp;
+ excess_freed -= tmp;
+ spare_needed -= tmp;
+ balance = 1;
+ }
+
+ /*
+ * Replenish the spare in the reserve pool from the reserve pool.
+ * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
+ * if needed, and gives it to the spare pool. The amount of used
+ * memory in this pool does not change.
+ */
+ if (spare_needed && reserve_freed) {
+ tmp = min(spare_needed, min(reserve_freed,
+ (viodev->cmo.entitled -
+ VIO_CMO_MIN_ENT)));
+
+ vio_cmo.spare += tmp;
+ viodev->cmo.entitled -= tmp;
+ reserve_freed -= tmp;
+ spare_needed -= tmp;
+ balance = 1;
+ }
+
+ /*
+ * Increase the reserve pool until the desired allocation is met.
+ * Move an allocation freed from the excess pool into the reserve
+ * pool and schedule a balance operation.
+ */
+ if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
+ tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
+
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.reserve.size += tmp;
+ excess_freed -= tmp;
+ balance = 1;
+ }
+
+ /* Return memory from the excess pool to that pool */
+ if (excess_freed)
+ vio_cmo.excess.free += excess_freed;
+
+ if (balance)
+ schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_entitlement_update - Manage system entitlement changes
+ *
+ * @new_entitlement: new system entitlement to attempt to accommodate
+ *
+ * Increases in entitlement will be used to fulfill the spare entitlement
+ * and the rest is given to the excess pool. Decreases, if they are
+ * possible, come from the excess pool and from unused device entitlement
+ *
+ * Returns: 0 on success, -ENOMEM when change can not be made
+ */
+int vio_cmo_entitlement_update(size_t new_entitlement)
+{
+ struct vio_dev *viodev;
+ struct vio_cmo_dev_entry *dev_ent;
+ unsigned long flags;
+ size_t avail, delta, tmp;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+
+ /* Entitlement increases */
+ if (new_entitlement > vio_cmo.entitled) {
+ delta = new_entitlement - vio_cmo.entitled;
+
+ /* Fulfill spare allocation */
+ if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
+ tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
+ vio_cmo.spare += tmp;
+ vio_cmo.reserve.size += tmp;
+ delta -= tmp;
+ }
+
+ /* Remaining new allocation goes to the excess pool */
+ vio_cmo.entitled += delta;
+ vio_cmo.excess.size += delta;
+ vio_cmo.excess.free += delta;
+
+ goto out;
+ }
+
+ /* Entitlement decreases */
+ delta = vio_cmo.entitled - new_entitlement;
+ avail = vio_cmo.excess.free;
+
+ /*
+ * Need to check how much unused entitlement each device can
+ * sacrifice to fulfill entitlement change.
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ if (avail >= delta)
+ break;
+
+ viodev = dev_ent->viodev;
+ if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+ (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+ avail += viodev->cmo.entitled -
+ max_t(size_t, viodev->cmo.allocated,
+ VIO_CMO_MIN_ENT);
+ }
+
+ if (delta <= avail) {
+ vio_cmo.entitled -= delta;
+
+ /* Take entitlement from the excess pool first */
+ tmp = min(vio_cmo.excess.free, delta);
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.excess.free -= tmp;
+ delta -= tmp;
+
+ /*
+ * Remove all but VIO_CMO_MIN_ENT bytes from devices
+ * until entitlement change is served
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ if (!delta)
+ break;
+
+ viodev = dev_ent->viodev;
+ tmp = 0;
+ if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+ (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+ tmp = viodev->cmo.entitled -
+ max_t(size_t, viodev->cmo.allocated,
+ VIO_CMO_MIN_ENT);
+ viodev->cmo.entitled -= min(tmp, delta);
+ delta -= min(tmp, delta);
+ }
+ } else {
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return -ENOMEM;
+ }
+
+out:
+ schedule_delayed_work(&vio_cmo.balance_q, 0);
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return 0;
+}
+
+/**
+ * vio_cmo_balance - Balance entitlement among devices
+ *
+ * @work: work queue structure for this operation
+ *
+ * Any system entitlement above the minimum needed for devices, or
+ * already allocated to devices, can be distributed to the devices.
+ * The list of devices is iterated through to recalculate the desired
+ * entitlement level and to determine how much entitlement above the
+ * minimum entitlement is allocated to devices.
+ *
+ * Small chunks of the available entitlement are given to devices until
+ * their requirements are fulfilled or there is no entitlement left to give.
+ * Upon completion sizes of the reserve and excess pools are calculated.
+ *
+ * The system minimum entitlement level is also recalculated here.
+ * Entitlement will be reserved for devices even after vio_bus_remove to
+ * accommodate reloading the driver. The OF tree is walked to count the
+ * number of devices present and this will remove entitlement for devices
+ * that have actually left the system after having vio_bus_remove called.
+ */
+static void vio_cmo_balance(struct work_struct *work)
+{
+ struct vio_cmo *cmo;
+ struct vio_dev *viodev;
+ struct vio_cmo_dev_entry *dev_ent;
+ unsigned long flags;
+ size_t avail = 0, level, chunk, need;
+ int devcount = 0, fulfilled;
+
+ cmo = container_of(work, struct vio_cmo, balance_q.work);
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+
+ /* Calculate minimum entitlement and fulfill spare */
+ cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
+ BUG_ON(cmo->min > cmo->entitled);
+ cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
+ cmo->min += cmo->spare;
+ cmo->desired = cmo->min;
+
+ /*
+ * Determine how much entitlement is available and reset device
+ * entitlements
+ */
+ avail = cmo->entitled - cmo->spare;
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ viodev = dev_ent->viodev;
+ devcount++;
+ viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+ cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+ avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
+ }
+
+ /*
+ * Having provided each device with the minimum entitlement, loop
+ * over the devices portioning out the remaining entitlement
+ * until there is nothing left.
+ */
+ level = VIO_CMO_MIN_ENT;
+ while (avail) {
+ fulfilled = 0;
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ viodev = dev_ent->viodev;
+
+ if (viodev->cmo.desired <= level) {
+ fulfilled++;
+ continue;
+ }
+
+ /*
+ * Give the device up to VIO_CMO_BALANCE_CHUNK
+ * bytes of entitlement, but do not exceed the
+ * desired level of entitlement for the device.
+ */
+ chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
+ chunk = min(chunk, (viodev->cmo.desired -
+ viodev->cmo.entitled));
+ viodev->cmo.entitled += chunk;
+
+ /*
+ * If the memory for this entitlement increase was
+ * already allocated to the device it does not come
+ * from the available pool being portioned out.
+ */
+ need = max(viodev->cmo.allocated, viodev->cmo.entitled)-
+ max(viodev->cmo.allocated, level);
+ avail -= need;
+
+ }
+ if (fulfilled == devcount)
+ break;
+ level += VIO_CMO_BALANCE_CHUNK;
+ }
+
+ /* Calculate new reserve and excess pool sizes */
+ cmo->reserve.size = cmo->min;
+ cmo->excess.free = 0;
+ cmo->excess.size = 0;
+ need = 0;
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+ viodev = dev_ent->viodev;
+ /* Calculated reserve size above the minimum entitlement */
+ if (viodev->cmo.entitled)
+ cmo->reserve.size += (viodev->cmo.entitled -
+ VIO_CMO_MIN_ENT);
+ /* Calculated used excess entitlement */
+ if (viodev->cmo.allocated > viodev->cmo.entitled)
+ need += viodev->cmo.allocated - viodev->cmo.entitled;
+ }
+ cmo->excess.size = cmo->entitled - cmo->reserve.size;
+ cmo->excess.free = cmo->excess.size - need;
+
+ cancel_delayed_work(container_of(work, struct delayed_work, work));
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ void *ret;
+
+ if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+ atomic_inc(&viodev->cmo.allocs_failed);
+ return NULL;
+ }
+
+ ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag);
+ if (unlikely(ret == NULL)) {
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+ atomic_inc(&viodev->cmo.allocs_failed);
+ }
+
+ return ret;
+}
+
+static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+
+ dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle);
+
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr,
+ size_t size,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ dma_addr_t ret = DMA_ERROR_CODE;
+
+ if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+ atomic_inc(&viodev->cmo.allocs_failed);
+ return ret;
+ }
+
+ ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs);
+ if (unlikely(dma_mapping_error(ret))) {
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+ atomic_inc(&viodev->cmo.allocs_failed);
+ }
+
+ return ret;
+}
+
+static void vio_dma_iommu_unmap_single(struct device *dev,
+ dma_addr_t dma_handle, size_t size,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+
+ dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs);
+
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct scatterlist *sgl;
+ int ret, count = 0;
+ size_t alloc_size = 0;
+
+ for (sgl = sglist; count < nelems; count++, sgl++)
+ alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+
+ if (vio_cmo_alloc(viodev, alloc_size)) {
+ atomic_inc(&viodev->cmo.allocs_failed);
+ return 0;
+ }
+
+ ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
+
+ if (unlikely(!ret)) {
+ vio_cmo_dealloc(viodev, alloc_size);
+ atomic_inc(&viodev->cmo.allocs_failed);
+ }
+
+ for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+ alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+ if (alloc_size)
+ vio_cmo_dealloc(viodev, alloc_size);
+
+ return ret;
+}
+
+static void vio_dma_iommu_unmap_sg(struct device *dev,
+ struct scatterlist *sglist, int nelems,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ struct scatterlist *sgl;
+ size_t alloc_size = 0;
+ int count = 0;
+
+ for (sgl = sglist; count < nelems; count++, sgl++)
+ alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+
+ dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
+
+ vio_cmo_dealloc(viodev, alloc_size);
+}
+
+struct dma_mapping_ops vio_dma_mapping_ops = {
+ .alloc_coherent = vio_dma_iommu_alloc_coherent,
+ .free_coherent = vio_dma_iommu_free_coherent,
+ .map_single = vio_dma_iommu_map_single,
+ .unmap_single = vio_dma_iommu_unmap_single,
+ .map_sg = vio_dma_iommu_map_sg,
+ .unmap_sg = vio_dma_iommu_unmap_sg,
+};
+
+/**
+ * vio_cmo_set_dev_desired - Set desired entitlement for a device
+ *
+ * @viodev: struct vio_dev for device to alter
+ * @new_desired: new desired entitlement level in bytes
+ *
+ * For use by devices to request a change to their entitlement at runtime or
+ * through sysfs. The desired entitlement level is changed and a balancing
+ * of system resources is scheduled to run in the future.
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+ unsigned long flags;
+ struct vio_cmo_dev_entry *dev_ent;
+ int found = 0;
+
+ if (!firmware_has_feature(FW_FEATURE_CMO))
+ return;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ if (desired < VIO_CMO_MIN_ENT)
+ desired = VIO_CMO_MIN_ENT;
+
+ /*
+ * Changes will not be made for devices not in the device list.
+ * If it is not in the device list, then no driver is loaded
+ * for the device and it can not receive entitlement.
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+ if (viodev == dev_ent->viodev) {
+ found = 1;
+ break;
+ }
+ if (!found)
+ return;
+
+ /* Increase/decrease in desired device entitlement */
+ if (desired >= viodev->cmo.desired) {
+ /* Just bump the bus and device values prior to a balance*/
+ vio_cmo.desired += desired - viodev->cmo.desired;
+ viodev->cmo.desired = desired;
+ } else {
+ /* Decrease bus and device values for desired entitlement */
+ vio_cmo.desired -= viodev->cmo.desired - desired;
+ viodev->cmo.desired = desired;
+ /*
+ * If less entitlement is desired than current entitlement, move
+ * any reserve memory in the change region to the excess pool.
+ */
+ if (viodev->cmo.entitled > desired) {
+ vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+ vio_cmo.excess.size += viodev->cmo.entitled - desired;
+ /*
+ * If entitlement moving from the reserve pool to the
+ * excess pool is currently unused, add to the excess
+ * free counter.
+ */
+ if (viodev->cmo.allocated < viodev->cmo.entitled)
+ vio_cmo.excess.free += viodev->cmo.entitled -
+ max(viodev->cmo.allocated, desired);
+ viodev->cmo.entitled = desired;
+ }
+ }
+ schedule_delayed_work(&vio_cmo.balance_q, 0);
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Determine the devices IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ * -ENOMEM when entitlement is not available for device or
+ * device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+ struct vio_cmo_dev_entry *dev_ent;
+ struct device *dev = &viodev->dev;
+ struct vio_driver *viodrv = to_vio_driver(dev->driver);
+ unsigned long flags;
+ size_t size;
+
+ /*
+ * Check to see that device has a DMA window and configure
+ * entitlement for the device.
+ */
+ if (of_get_property(viodev->dev.archdata.of_node,
+ "ibm,my-dma-window", NULL)) {
+ /* Check that the driver is CMO enabled and get desired DMA */
+ if (!viodrv->get_desired_dma) {
+ dev_err(dev, "%s: device driver does not support CMO\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+ if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+ viodev->cmo.desired = VIO_CMO_MIN_ENT;
+ size = VIO_CMO_MIN_ENT;
+
+ dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+ GFP_KERNEL);
+ if (!dev_ent)
+ return -ENOMEM;
+
+ dev_ent->viodev = viodev;
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ list_add(&dev_ent->list, &vio_cmo.device_list);
+ } else {
+ viodev->cmo.desired = 0;
+ size = 0;
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ }
+
+ /*
+ * If the needs for vio_cmo.min have not changed since they
+ * were last set, the number of devices in the OF tree has
+ * been constant and the IO memory for this is already in
+ * the reserve pool.
+ */
+ if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+ VIO_CMO_MIN_ENT)) {
+ /* Updated desired entitlement if device requires it */
+ if (size)
+ vio_cmo.desired += (viodev->cmo.desired -
+ VIO_CMO_MIN_ENT);
+ } else {
+ size_t tmp;
+
+ tmp = vio_cmo.spare + vio_cmo.excess.free;
+ if (tmp < size) {
+ dev_err(dev, "%s: insufficient free "
+ "entitlement to add device. "
+ "Need %lu, have %lu\n", __func__,
+ size, (vio_cmo.spare + tmp));
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return -ENOMEM;
+ }
+
+ /* Use excess pool first to fulfill request */
+ tmp = min(size, vio_cmo.excess.free);
+ vio_cmo.excess.free -= tmp;
+ vio_cmo.excess.size -= tmp;
+ vio_cmo.reserve.size += tmp;
+
+ /* Use spare if excess pool was insufficient */
+ vio_cmo.spare -= size - tmp;
+
+ /* Update bus accounting */
+ vio_cmo.min += size;
+ vio_cmo.desired += viodev->cmo.desired;
+ }
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+ return 0;
+}
+
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list. The minimum entitlement
+ * will be reserved for the device as long as it is in the system. The
+ * rest of the entitlement the device had been allocated will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+ struct vio_cmo_dev_entry *dev_ent;
+ unsigned long flags;
+ size_t tmp;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ if (viodev->cmo.allocated) {
+ dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+ "allocated after remove operation.\n",
+ __func__, viodev->cmo.allocated);
+ BUG();
+ }
+
+ /*
+ * Remove the device from the device list being maintained for
+ * CMO enabled devices.
+ */
+ list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+ if (viodev == dev_ent->viodev) {
+ list_del(&dev_ent->list);
+ kfree(dev_ent);
+ break;
+ }
+
+ /*
+ * Devices may not require any entitlement and they do not need
+ * to be processed. Otherwise, return the device's entitlement
+ * back to the pools.
+ */
+ if (viodev->cmo.entitled) {
+ /*
+ * This device has not yet left the OF tree, it's
+ * minimum entitlement remains in vio_cmo.min and
+ * vio_cmo.desired
+ */
+ vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+ /*
+ * Save min allocation for device in reserve as long
+ * as it exists in OF tree as determined by later
+ * balance operation
+ */
+ viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+ /* Replenish spare from freed reserve pool */
+ if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+ tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+ vio_cmo.spare));
+ vio_cmo.spare += tmp;
+ viodev->cmo.entitled -= tmp;
+ }
+
+ /* Remaining reserve goes to excess pool */
+ vio_cmo.excess.size += viodev->cmo.entitled;
+ vio_cmo.excess.free += viodev->cmo.entitled;
+ vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+ /*
+ * Until the device is removed it will keep a
+ * minimum entitlement; this will guarantee that
+ * a module unload/load will result in a success.
+ */
+ viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+ viodev->cmo.desired = VIO_CMO_MIN_ENT;
+ atomic_set(&viodev->cmo.allocs_failed, 0);
+ }
+
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+ vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported;
+ viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops;
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */
+static void vio_cmo_bus_init(void)
+{
+ struct hvcall_mpp_data mpp_data;
+ int err;
+
+ memset(&vio_cmo, 0, sizeof(struct vio_cmo));
+ spin_lock_init(&vio_cmo.lock);
+ INIT_LIST_HEAD(&vio_cmo.device_list);
+ INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
+
+ /* Get current system entitlement */
+ err = h_get_mpp(&mpp_data);
+
+ /*
+ * On failure, continue with entitlement set to 0, will panic()
+ * later when spare is reserved.
+ */
+ if (err != H_SUCCESS) {
+ printk(KERN_ERR "%s: unable to determine system IO "\
+ "entitlement. (%d)\n", __func__, err);
+ vio_cmo.entitled = 0;
+ } else {
+ vio_cmo.entitled = mpp_data.entitled_mem;
+ }
+
+ /* Set reservation and check against entitlement */
+ vio_cmo.spare = VIO_CMO_MIN_ENT;
+ vio_cmo.reserve.size = vio_cmo.spare;
+ vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
+ VIO_CMO_MIN_ENT);
+ if (vio_cmo.reserve.size > vio_cmo.entitled) {
+ printk(KERN_ERR "%s: insufficient system entitlement\n",
+ __func__);
+ panic("%s: Insufficient system entitlement", __func__);
+ }
+
+ /* Set the remaining accounting variables */
+ vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
+ vio_cmo.excess.free = vio_cmo.excess.size;
+ vio_cmo.min = vio_cmo.reserve.size;
+ vio_cmo.desired = vio_cmo.reserve.size;
+}
+
+/* sysfs device functions and data structures for CMO */
+
+#define viodev_cmo_rd_attr(name) \
+static ssize_t viodev_cmo_##name##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name); \
+}
+
+static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
+}
+
+static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ atomic_set(&viodev->cmo.allocs_failed, 0);
+ return count;
+}
+
+static ssize_t viodev_cmo_desired_set(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct vio_dev *viodev = to_vio_dev(dev);
+ size_t new_desired;
+ int ret;
+
+ ret = strict_strtoul(buf, 10, &new_desired);
+ if (ret)
+ return ret;
+
+ vio_cmo_set_dev_desired(viodev, new_desired);
+ return count;
+}
+
+viodev_cmo_rd_attr(desired);
+viodev_cmo_rd_attr(entitled);
+viodev_cmo_rd_attr(allocated);
+
+static ssize_t name_show(struct device *, struct device_attribute *, char *);
+static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
+static struct device_attribute vio_cmo_dev_attrs[] = {
+ __ATTR_RO(name),
+ __ATTR_RO(devspec),
+ __ATTR(cmo_desired, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+ viodev_cmo_desired_show, viodev_cmo_desired_set),
+ __ATTR(cmo_entitled, S_IRUGO, viodev_cmo_entitled_show, NULL),
+ __ATTR(cmo_allocated, S_IRUGO, viodev_cmo_allocated_show, NULL),
+ __ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+ viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
+ __ATTR_NULL
+};
+
+/* sysfs bus functions and data structures for CMO */
+
+#define viobus_cmo_rd_attr(name) \
+static ssize_t \
+viobus_cmo_##name##_show(struct bus_type *bt, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", vio_cmo.name); \
+}
+
+#define viobus_cmo_pool_rd_attr(name, var) \
+static ssize_t \
+viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", vio_cmo.name.var); \
+}
+
+static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
+ size_t count)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&vio_cmo.lock, flags);
+ vio_cmo.high = vio_cmo.curr;
+ spin_unlock_irqrestore(&vio_cmo.lock, flags);
+
+ return count;
+}
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_rd_attr(high);
+
+static struct bus_attribute vio_cmo_bus_attrs[] = {
+ __ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL),
+ __ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL),
+ __ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL),
+ __ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL),
+ __ATTR(cmo_spare, S_IRUGO, viobus_cmo_spare_show, NULL),
+ __ATTR(cmo_min, S_IRUGO, viobus_cmo_min_show, NULL),
+ __ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL),
+ __ATTR(cmo_curr, S_IRUGO, viobus_cmo_curr_show, NULL),
+ __ATTR(cmo_high, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+ viobus_cmo_high_show, viobus_cmo_high_reset),
+ __ATTR_NULL
+};
+
+static void vio_cmo_sysfs_init(void)
+{
+ vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
+ vio_bus_type.bus_attrs = vio_cmo_bus_attrs;
+}
+#else /* CONFIG_PPC_SMLPAR */
+/* Dummy functions for iSeries platform */
+int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
+static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
+static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
+static void vio_cmo_bus_init() {}
+static void vio_cmo_sysfs_init() { }
+#endif /* CONFIG_PPC_SMLPAR */
+EXPORT_SYMBOL(vio_cmo_entitlement_update);
+EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
{
const unsigned char *dma_window;
@@ -114,8 +1105,17 @@ static int vio_bus_probe(struct device *dev)
return error;
id = vio_match_device(viodrv->id_table, viodev);
- if (id)
+ if (id) {
+ memset(&viodev->cmo, 0, sizeof(viodev->cmo));
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ error = vio_cmo_bus_probe(viodev);
+ if (error)
+ return error;
+ }
error = viodrv->probe(viodev, id);
+ if (error)
+ vio_cmo_bus_remove(viodev);
+ }
return error;
}
@@ -125,12 +1125,23 @@ static int vio_bus_remove(struct device *dev)
{
struct vio_dev *viodev = to_vio_dev(dev);
struct vio_driver *viodrv = to_vio_driver(dev->driver);
+ struct device *devptr;
+ int ret = 1;
+
+ /*
+ * Hold a reference to the device after the remove function is called
+ * to allow for CMO accounting cleanup for the device.
+ */
+ devptr = get_device(dev);
if (viodrv->remove)
- return viodrv->remove(viodev);
+ ret = viodrv->remove(viodev);
+
+ if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_bus_remove(viodev);
- /* driver can't remove */
- return 1;
+ put_device(devptr);
+ return ret;
}
/**
@@ -215,7 +1226,11 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
viodev->unit_address = *unit_address;
}
viodev->dev.archdata.of_node = of_node_get(of_node);
- viodev->dev.archdata.dma_ops = &dma_iommu_ops;
+
+ if (firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_set_dma_ops(viodev);
+ else
+ viodev->dev.archdata.dma_ops = &dma_iommu_ops;
viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev);
viodev->dev.archdata.numa_node = of_node_to_nid(of_node);
@@ -245,6 +1260,9 @@ static int __init vio_bus_init(void)
int err;
struct device_node *node_vroot;
+ if (firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_sysfs_init();
+
err = bus_register(&vio_bus_type);
if (err) {
printk(KERN_ERR "failed to register VIO bus\n");
@@ -262,6 +1280,9 @@ static int __init vio_bus_init(void)
return err;
}
+ if (firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_bus_init();
+
node_vroot = of_find_node_by_name(NULL, "vdevice");
if (node_vroot) {
struct device_node *of_node;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 87a72c66ce2..4a8ce62fe11 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -9,6 +9,25 @@
ENTRY(_stext)
+PHDRS {
+ kernel PT_LOAD FLAGS(7); /* RWX */
+ notes PT_NOTE FLAGS(0);
+ dummy PT_NOTE FLAGS(0);
+
+ /* binutils < 2.18 has a bug that makes it misbehave when taking an
+ ELF file with all segments at load address 0 as input. This
+ happens when running "strip" on vmlinux, because of the AT() magic
+ in this linker script. People using GCC >= 4.2 won't run into
+ this problem, because the "build-id" support will put some data
+ into the "notes" segment (at a non-zero load address).
+
+ To work around this, we force some data into both the "dummy"
+ segment and the kernel segment, so the dummy segment will get a
+ non-zero load address. It's not enough to always create the
+ "notes" segment, since if nothing gets assigned to it, its load
+ address will be zero. */
+}
+
#ifdef CONFIG_PPC64
OUTPUT_ARCH(powerpc:common64)
jiffies = jiffies_64;
@@ -50,7 +69,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
_etext = .;
PROVIDE32 (etext = .);
- }
+ } :kernel
/* Read-only data */
RODATA
@@ -62,7 +81,13 @@ SECTIONS
__stop___ex_table = .;
}
- NOTES
+ NOTES :kernel :notes
+
+ /* The dummy segment contents for the bug workaround mentioned above
+ near PHDRS. */
+ .dummy : AT(ADDR(.dummy) - LOAD_OFFSET) {
+ LONG(0xf177)
+ } :kernel :dummy
/*
* Init sections discarded at runtime
@@ -74,7 +99,7 @@ SECTIONS
_sinittext = .;
INIT_TEXT
_einittext = .;
- }
+ } :kernel
/* .exit.text is discarded at runtime, not link time,
* to deal with references from __bug_table