From 0e4bc95d87394364f408627067238453830bdbf3 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 3 Jun 2012 19:43:02 +0000
Subject: powerpc/iommu: Reduce spinlock coverage in iommu_alloc and iommu_free

We currently hold the IOMMU spinlock around tce_build and tce_flush.
This causes our spinlock hold times to be much higher than required
and can impact multiqueue adapters.

This patch moves tce_build and tce_flush outside of the lock in
iommu_alloc, and tce_flush outside of the lock in iommu_free.

Some performance numbers were obtained with a Chelsio T3 adapter on
two POWER7 boxes, running a 100 session TCP round robin test.

Performance improved 32% with this patch applied.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/iommu.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc/kernel/iommu.c')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 359f078571c..9c8967fa1e6 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -170,13 +170,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	int build_fail;
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 
-	if (unlikely(entry == DMA_ERROR_CODE)) {
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	if (unlikely(entry == DMA_ERROR_CODE))
 		return DMA_ERROR_CODE;
-	}
 
 	entry += tbl->it_offset;	/* Offset into real TCE table */
 	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
@@ -192,9 +190,10 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	 * not altered.
 	 */
 	if (unlikely(build_fail)) {
+		spin_lock_irqsave(&(tbl->it_lock), flags);
 		__iommu_free(tbl, ret, npages);
-
 		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
 		return DMA_ERROR_CODE;
 	}
 
@@ -202,8 +201,6 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	/* Make sure updates are seen by hardware */
 	mb();
@@ -244,8 +241,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	unsigned long flags;
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	__iommu_free(tbl, dma_addr, npages);
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 
 	/* Make sure TLB cache is flushed if the HW needs it. We do
 	 * not do an mb() here on purpose, it is not needed on any of
@@ -253,8 +250,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
 int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
--
cgit v1.2.3-70-g09d2
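The reason the hardware calls can run unlocked is that the lock only
protects the allocation bitmap: once iommu_range_alloc() hands back a
range, those TCE entries belong exclusively to the caller until they
are freed, so no other CPU can race on them. A condensed, illustrative
sketch of the resulting iommu_alloc() flow (abbreviated for clarity,
not the literal post-patch code):

    spin_lock_irqsave(&tbl->it_lock, flags);
    entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
    spin_unlock_irqrestore(&tbl->it_lock, flags);   /* drop the lock early */

    if (unlikely(entry == DMA_ERROR_CODE))
        return DMA_ERROR_CODE;

    /* The entries just allocated are private to this caller until they
     * are freed, so programming and flushing the TCEs needs no lock. */
    build_fail = ppc_md.tce_build(tbl, entry + tbl->it_offset, npages,
                                  (unsigned long)page & IOMMU_PAGE_MASK,
                                  direction, attrs);
    if (ppc_md.tce_flush)
        ppc_md.tce_flush(tbl);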
From 67ca141567519a6b0ec81850a7b6569b6d8c2b52 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 3 Jun 2012 19:43:44 +0000
Subject: powerpc/iommu: Reduce spinlock coverage in iommu_free

This patch moves tce_free outside of the lock in iommu_free.

Some performance numbers were obtained with a Chelsio T3 adapter on
two POWER7 boxes, running a 100 session TCP round robin test.

Performance improved 25% with this patch applied.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/iommu.c | 51 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc/kernel/iommu.c')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 9c8967fa1e6..d855cfc0732 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -190,10 +190,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	 * not altered.
 	 */
 	if (unlikely(build_fail)) {
-		spin_lock_irqsave(&(tbl->it_lock), flags);
 		__iommu_free(tbl, ret, npages);
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 		return DMA_ERROR_CODE;
 	}
 
@@ -207,8 +204,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	return ret;
 }
 
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
-			 unsigned int npages)
+static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
+			     unsigned int npages)
 {
 	unsigned long entry, free_entry;
@@ -228,21 +225,53 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			printk(KERN_INFO "\tindex            = 0x%llx\n", (u64)tbl->it_index);
 			WARN_ON(1);
 		}
-		return;
+
+		return false;
 	}
+
+	return true;
+}
+
+static void __iommu_free_locked(struct iommu_table *tbl, dma_addr_t dma_addr,
+				unsigned int npages)
+{
+	unsigned long entry, free_entry;
+
+	BUG_ON(!spin_is_locked(&tbl->it_lock));
+
+	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	free_entry = entry - tbl->it_offset;
+
+	if (!iommu_free_check(tbl, dma_addr, npages))
+		return;
+
 	ppc_md.tce_free(tbl, entry, npages);
 	bitmap_clear(tbl->it_map, free_entry, npages);
 }
 
-static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
-		unsigned int npages)
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+			 unsigned int npages)
 {
+	unsigned long entry, free_entry;
 	unsigned long flags;
 
+	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	free_entry = entry - tbl->it_offset;
+
+	if (!iommu_free_check(tbl, dma_addr, npages))
+		return;
+
+	ppc_md.tce_free(tbl, entry, npages);
+
 	spin_lock_irqsave(&(tbl->it_lock), flags);
-	__iommu_free(tbl, dma_addr, npages);
+	bitmap_clear(tbl->it_map, free_entry, npages);
 	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+}
+
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+		unsigned int npages)
+{
+	__iommu_free(tbl, dma_addr, npages);
 
 	/* Make sure TLB cache is flushed if the HW needs it. We do
 	 * not do an mb() here on purpose, it is not needed on any of
@@ -390,7 +419,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			vaddr = s->dma_address & IOMMU_PAGE_MASK;
 			npages = iommu_num_pages(s->dma_address, s->dma_length,
 						 IOMMU_PAGE_SIZE);
-			__iommu_free(tbl, vaddr, npages);
+			__iommu_free_locked(tbl, vaddr, npages);
 			s->dma_address = DMA_ERROR_CODE;
 			s->dma_length = 0;
 		}
@@ -425,7 +454,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 			break;
 		npages = iommu_num_pages(dma_handle, sg->dma_length,
 					 IOMMU_PAGE_SIZE);
-		__iommu_free(tbl, dma_handle, npages);
+		__iommu_free_locked(tbl, dma_handle, npages);
 		sg = sg_next(sg);
 	}
--
cgit v1.2.3-70-g09d2
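One detail worth flagging: the BUG_ON(!spin_is_locked(&tbl->it_lock))
in __iommu_free_locked() is only a best-effort assertion, because
spin_is_locked() reports that *some* CPU holds the lock, not that the
current caller does. Where lock debugging is enabled, a lockdep-based
check expresses the intent more precisely (a hypothetical alternative,
not part of the patch):

    lockdep_assert_held(&tbl->it_lock);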
From d362213722c8875b40d712796392682968ce685e Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 3 Jun 2012 19:44:25 +0000
Subject: powerpc/iommu: Push spinlock into iommu_range_alloc and __iommu_free

In preparation for IOMMU pools, push the spinlock into
iommu_range_alloc and __iommu_free.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/iommu.c | 41 ++++++++---------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

(limited to 'arch/powerpc/kernel/iommu.c')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d855cfc0732..70a212cec58 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -71,6 +71,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	int pass = 0;
 	unsigned long align_mask;
 	unsigned long boundary_size;
+	unsigned long flags;
 
 	align_mask = 0xffffffffffffffffl >> (64 - align_order);
 
@@ -83,6 +84,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
 		return DMA_ERROR_CODE;
 	}
 
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
 	if (handle && *handle)
 		start = *handle;
 	else
@@ -136,6 +139,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
 			goto again;
 		} else {
 			/* Third failure, give up */
+			spin_unlock_irqrestore(&(tbl->it_lock), flags);
 			return DMA_ERROR_CODE;
 		}
 	}
@@ -156,6 +160,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	if (handle)
 		*handle = end;
 
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 	return n;
 }
 
@@ -165,13 +170,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 		      unsigned long mask, unsigned int align_order,
 		      struct dma_attrs *attrs)
 {
-	unsigned long entry, flags;
+	unsigned long entry;
 	dma_addr_t ret = DMA_ERROR_CODE;
 	int build_fail;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 
 	if (unlikely(entry == DMA_ERROR_CODE))
 		return DMA_ERROR_CODE;
@@ -232,23 +235,6 @@ static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
 	return true;
 }
 
-static void __iommu_free_locked(struct iommu_table *tbl, dma_addr_t dma_addr,
-				unsigned int npages)
-{
-	unsigned long entry, free_entry;
-
-	BUG_ON(!spin_is_locked(&tbl->it_lock));
-
-	entry = dma_addr >> IOMMU_PAGE_SHIFT;
-	free_entry = entry - tbl->it_offset;
-
-	if (!iommu_free_check(tbl, dma_addr, npages))
-		return;
-
-	ppc_md.tce_free(tbl, entry, npages);
-	bitmap_clear(tbl->it_map, free_entry, npages);
-}
-
 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			 unsigned int npages)
 {
@@ -287,7 +273,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		 struct dma_attrs *attrs)
 {
 	dma_addr_t dma_next = 0, dma_addr;
-	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
 	int outcount, incount, i, build_fail = 0;
 	unsigned int align;
@@ -309,8 +294,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 
 	DBG("sg mapping %d elements:\n", nelems);
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	max_seg_size = dma_get_max_seg_size(dev);
 	for_each_sg(sglist, s, nelems, i) {
 		unsigned long vaddr, npages, entry, slen;
@@ -393,8 +376,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	DBG("mapped %d elements:\n", outcount);
 
 	/* For the sake of iommu_unmap_sg, we clear out the length in the
@@ -419,14 +400,13 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			vaddr = s->dma_address & IOMMU_PAGE_MASK;
 			npages = iommu_num_pages(s->dma_address, s->dma_length,
 						 IOMMU_PAGE_SIZE);
-			__iommu_free_locked(tbl, vaddr, npages);
+			__iommu_free(tbl, vaddr, npages);
 			s->dma_address = DMA_ERROR_CODE;
 			s->dma_length = 0;
 		}
 		if (s == outs)
 			break;
 	}
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 	return 0;
 }
@@ -436,15 +416,12 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
-	unsigned long flags;
 
 	BUG_ON(direction == DMA_NONE);
 
 	if (!tbl)
 		return;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	sg = sglist;
 	while (nelems--) {
 		unsigned int npages;
@@ -454,7 +431,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 			break;
 		npages = iommu_num_pages(dma_handle, sg->dma_length,
 					 IOMMU_PAGE_SIZE);
-		__iommu_free_locked(tbl, dma_handle, npages);
+		__iommu_free(tbl, dma_handle, npages);
 		sg = sg_next(sg);
 	}
 
@@ -464,8 +441,6 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
 static void iommu_table_clear(struct iommu_table *tbl)
--
cgit v1.2.3-70-g09d2
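With the lock pushed down, the locking rules after this patch can be
summarised in one place (a reader's summary, not text from the patch,
as noted in the sketch below):

    iommu_range_alloc()   takes tbl->it_lock around the bitmap search only
    __iommu_free()        checks and calls tce_free unlocked, then takes
                          tbl->it_lock just around bitmap_clear()
    tce_build/tce_flush   always called without the lock; safe because an
                          allocated range is private to its owner until freed

This is what makes the next patch possible: once no caller holds the
lock across a whole map or unmap operation, the single it_lock can be
replaced by per-pool locks.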
From b4c3a8729ae57b4f84d661e16a192f828eca1d03 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 7 Jun 2012 18:14:48 +0000
Subject: powerpc/iommu: Implement IOMMU pools to improve multiqueue adapter
 performance

At the moment all queues in a multiqueue adapter will serialise
against the IOMMU table lock. This is proving to be a big issue,
especially with 10Gbit ethernet.

This patch creates 4 pools and tries to spread the load across them.
If the table is under 1GB in size we revert to the original behaviour
of 1 pool and 1 largealloc pool.

We create a hash to map CPUs to pools. Since we prefer interrupts to
be affinitised to primary CPUs, without some form of hashing we are
very likely to end up using the same pool. As an example, POWER7 has
4 way SMT and with 4 pools all primary threads will map to the same
pool.

The largealloc pool is reduced from 1/2 to 1/4 of the space to
partially offset the overhead of breaking the table up into pools.

Some performance numbers were obtained with a Chelsio T3 adapter on
two POWER7 boxes, running a 100 session TCP round robin test.

Performance improved 69% with this patch applied.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/iommu.h    |  18 ++++-
 arch/powerpc/kernel/iommu.c         | 148 +++++++++++++++++++++++++++---------
 arch/powerpc/platforms/cell/iommu.c |   1 -
 3 files changed, 128 insertions(+), 39 deletions(-)

(limited to 'arch/powerpc/kernel/iommu.c')

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 957a83f4364..cbfe678e3db 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -53,6 +53,16 @@ static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
  */
 #define IOMAP_MAX_ORDER		13
 
+#define IOMMU_POOL_HASHBITS	2
+#define IOMMU_NR_POOLS		(1 << IOMMU_POOL_HASHBITS)
+
+struct iommu_pool {
+	unsigned long start;
+	unsigned long end;
+	unsigned long hint;
+	spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
 struct iommu_table {
 	unsigned long  it_busno;	/* Bus number this table belongs to */
 	unsigned long  it_size;		/* Size of iommu table in entries */
@@ -61,10 +71,10 @@ struct iommu_table {
 	unsigned long  it_index;	/* which iommu table this is */
 	unsigned long  it_type;		/* type: PCI or Virtual Bus */
 	unsigned long  it_blocksize;	/* Entries in each block (cacheline) */
-	unsigned long  it_hint;		/* Hint for next alloc */
-	unsigned long  it_largehint;	/* Hint for large allocs */
-	unsigned long  it_halfpoint;	/* Breaking point for small/large allocs */
-	spinlock_t     it_lock;		/* Protects it_map */
+	unsigned long  poolsize;
+	unsigned long  nr_pools;
+	struct iommu_pool large_pool;
+	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;		/* A simple allocation bitmap for now */
 };
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 70a212cec58..7bc94da1a83 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -33,6 +33,7 @@
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
+#include <linux/hash.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -58,6 +59,26 @@ static int __init setup_iommu(char *str)
 
 __setup("iommu=", setup_iommu);
 
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
+ * with 4 pools all primary threads would map to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+	return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
 static unsigned long iommu_range_alloc(struct device *dev,
 				       struct iommu_table *tbl,
 				       unsigned long npages,
@@ -72,6 +93,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	unsigned long align_mask;
 	unsigned long boundary_size;
 	unsigned long flags;
+	unsigned int pool_nr;
+	struct iommu_pool *pool;
 
 	align_mask = 0xffffffffffffffffl >> (64 - align_order);
 
@@ -84,38 +107,46 @@ static unsigned long iommu_range_alloc(struct device *dev,
 		return DMA_ERROR_CODE;
 	}
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
+	/*
+	 * We don't need to disable preemption here because any CPU can
+	 * safely use any IOMMU pool.
+	 */
+	pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1);
 
-	if (handle && *handle)
-		start = *handle;
+	if (largealloc)
+		pool = &(tbl->large_pool);
 	else
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+		pool = &(tbl->pools[pool_nr]);
 
-	/* Use only half of the table for small allocs (15 pages or less) */
-	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+	spin_lock_irqsave(&(pool->lock), flags);
+
+again:
+	if ((pass == 0) && handle && *handle)
+		start = *handle;
+	else
+		start = pool->hint;
 
-	if (largealloc && start < tbl->it_halfpoint)
-		start = tbl->it_halfpoint;
+	limit = pool->end;
 
 	/* The case below can happen if we have a small segment appended
 	 * to a large, or when the previous alloc was at the very end of
 	 * the available space. If so, go back to the initial start.
 	 */
 	if (start >= limit)
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
+		start = pool->start;
 
 	if (limit + tbl->it_offset > mask) {
 		limit = mask - tbl->it_offset + 1;
 		/* If we're constrained on address range, first try
 		 * at the masked hint to avoid O(n) search complexity,
-		 * but on second pass, start at 0.
+		 * but on second pass, start at 0 in pool 0.
 		 */
-		if ((start & mask) >= limit || pass > 0)
-			start = 0;
-		else
+		if ((start & mask) >= limit || pass > 0) {
+			pool = &(tbl->pools[0]);
+			start = pool->start;
+		} else {
 			start &= mask;
+		}
 	}
 
 	if (dev)
@@ -129,17 +160,25 @@ static unsigned long iommu_range_alloc(struct device *dev,
 			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
 			     align_mask);
 	if (n == -1) {
-		if (likely(pass < 2)) {
-			/* First failure, just rescan the half of the table.
-			 * Second failure, rescan the other half of the table.
-			 */
-			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
-			limit = pass ? tbl->it_size : limit;
+		if (likely(pass == 0)) {
+			/* First try the pool from the start */
+			pool->hint = pool->start;
 			pass++;
 			goto again;
+
+		} else if (pass <= tbl->nr_pools) {
+			/* Now try scanning all the other pools */
+			spin_unlock(&(pool->lock));
+			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+			pool = &tbl->pools[pool_nr];
+			spin_lock(&(pool->lock));
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
 		} else {
-			/* Third failure, give up */
-			spin_unlock_irqrestore(&(tbl->it_lock), flags);
+			/* Give up */
+			spin_unlock_irqrestore(&(pool->lock), flags);
 			return DMA_ERROR_CODE;
 		}
 	}
@@ -149,10 +188,10 @@ static unsigned long iommu_range_alloc(struct device *dev,
 
 	/* Bump the hint to a new block for small allocs. */
 	if (largealloc) {
 		/* Don't bump to new block to avoid fragmentation */
-		tbl->it_largehint = end;
+		pool->hint = end;
 	} else {
 		/* Overflow will be taken care of at the next allocation */
-		tbl->it_hint = (end + tbl->it_blocksize - 1) &
+		pool->hint = (end + tbl->it_blocksize - 1) &
 		                ~(tbl->it_blocksize - 1);
 	}
 
@@ -160,7 +199,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	if (handle)
 		*handle = end;
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	spin_unlock_irqrestore(&(pool->lock), flags);
+
 	return n;
 }
 
@@ -235,23 +275,45 @@ static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
 	return true;
 }
 
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+				   unsigned long entry)
+{
+	struct iommu_pool *p;
+	unsigned long largepool_start = tbl->large_pool.start;
+
+	/* The large pool is the last pool at the top of the table */
+	if (entry >= largepool_start) {
+		p = &tbl->large_pool;
+	} else {
+		unsigned int pool_nr = entry / tbl->poolsize;
+
+		BUG_ON(pool_nr > tbl->nr_pools);
+		p = &tbl->pools[pool_nr];
+	}
+
+	return p;
+}
+
 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			 unsigned int npages)
 {
 	unsigned long entry, free_entry;
 	unsigned long flags;
+	struct iommu_pool *pool;
 
 	entry = dma_addr >> IOMMU_PAGE_SHIFT;
 	free_entry = entry - tbl->it_offset;
 
+	pool = get_pool(tbl, free_entry);
+
 	if (!iommu_free_check(tbl, dma_addr, npages))
 		return;
 
 	ppc_md.tce_free(tbl, entry, npages);
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
+	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	spin_unlock_irqrestore(&(pool->lock), flags);
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -493,9 +555,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	unsigned long sz;
 	static int welcomed = 0;
 	struct page *page;
-
-	/* Set aside 1/4 of the table for large allocations. */
-	tbl->it_halfpoint = tbl->it_size * 3 / 4;
+	unsigned int i;
+	struct iommu_pool *p;
 
 	/* number of bytes needed for the bitmap */
 	sz = (tbl->it_size + 7) >> 3;
@@ -514,9 +575,28 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
-	tbl->it_hint = 0;
-	tbl->it_largehint = tbl->it_halfpoint;
-	spin_lock_init(&tbl->it_lock);
+	/* We only split the IOMMU table if we have 1GB or more of space */
+	if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024))
+		tbl->nr_pools = IOMMU_NR_POOLS;
+	else
+		tbl->nr_pools = 1;
+
+	/* We reserve the top 1/4 of the table for large allocations */
+	tbl->poolsize = (tbl->it_size * 3 / 4) / IOMMU_NR_POOLS;
+
+	for (i = 0; i < IOMMU_NR_POOLS; i++) {
+		p = &tbl->pools[i];
+		spin_lock_init(&(p->lock));
+		p->start = tbl->poolsize * i;
+		p->hint = p->start;
+		p->end = p->start + tbl->poolsize;
+	}
+
+	p = &tbl->large_pool;
+	spin_lock_init(&(p->lock));
+	p->start = tbl->poolsize * i;
+	p->hint = p->start;
+	p->end = tbl->it_size;
 
 	iommu_table_clear(tbl);
 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index b9f509a34c0..c264969c931 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -518,7 +518,6 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	__set_bit(0, window->table.it_map);
 	tce_build_cell(&window->table, window->table.it_offset, 1,
 		       (unsigned long)iommu->pad_page, DMA_TO_DEVICE, NULL);
-	window->table.it_hint = window->table.it_blocksize;
 
 	return window;
 }
--
cgit v1.2.3-70-g09d2
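To see the effect of the hash, consider the CPU numbering the commit
message describes: on a 4-way SMT POWER7 the primary threads are CPUs
0, 4, 8 and 12, so selecting a pool from the low CPU bits
(cpu & (IOMMU_NR_POOLS - 1)) puts every primary thread in pool 0. A
small self-contained userspace demonstration, with hash_32()
re-implemented using the GOLDEN_RATIO_PRIME_32 constant from the
2012-era <linux/hash.h> (the program itself is illustrative, not
kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define IOMMU_POOL_HASHBITS 2
    #define GOLDEN_RATIO_PRIME_32 0x9e370001UL

    /* Same computation as hash_32() in the 2012-era <linux/hash.h> */
    static uint32_t hash_32(uint32_t val, unsigned int bits)
    {
        uint32_t hash = val * GOLDEN_RATIO_PRIME_32;

        return hash >> (32 - bits);
    }

    int main(void)
    {
        /* Primary SMT threads on a 4-way SMT POWER7 */
        unsigned int cpus[] = { 0, 4, 8, 12 };
        unsigned int i;

        for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
            printf("cpu %2u: low-bits pool %u, hashed pool %u\n",
                   cpus[i],
                   cpus[i] & ((1 << IOMMU_POOL_HASHBITS) - 1),
                   hash_32(cpus[i], IOMMU_POOL_HASHBITS));

        return 0;
    }

With these values the hashed mapping lands the four primary threads in
pools 0, 1, 3 and 1 — three distinct pools instead of everyone piling
into pool 0.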
From d6b9a81b2a45786384f5bd3516bd6ddfb4b772c6 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 24 Jun 2012 18:26:17 +0000
Subject: powerpc: IOMMU fault injection

Add the ability to inject IOMMU faults. We enable this per device via
a fail_iommu sysfs property, similar to fault injection on other
subsystems.

An example:

... 0003:01:00.1 Ethernet controller: Emulex Corporation OneConnect
10Gb NIC (be3) (rev 02)

To inject one error into this device:

echo 1 > /sys/bus/pci/devices/0003:01:00.1/fail_iommu
echo 1 > /sys/kernel/debug/fail_iommu/probability
echo 1 > /sys/kernel/debug/fail_iommu/times

As feared, the first failure injected on the be3 results in an
unrecoverable error, taking down both functions of the card
permanently:

be2net 0003:01:00.1: Unrecoverable error in the card

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/Kconfig.debug        |  9 ++++
 arch/powerpc/include/asm/device.h |  3 ++
 arch/powerpc/kernel/iommu.c       | 94 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+)

(limited to 'arch/powerpc/kernel/iommu.c')

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index e5f26890a69..5416e28a753 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -331,4 +331,13 @@ config STRICT_DEVMEM
 
 	  If you are unsure, say Y.
 
+config FAIL_IOMMU
+	bool "Fault-injection capability for IOMMU"
+	depends on FAULT_INJECTION
+	help
+	  Provide fault-injection capability for IOMMU. Each device can
+	  be selectively enabled via the fail_iommu property.
+
+	  If you are unsure, say N.
+
 endmenu
diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 63d5ca49cec..77e97dd0c15 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -34,6 +34,9 @@ struct dev_archdata {
 #ifdef CONFIG_EEH
 	struct eeh_dev *edev;
 #endif
+#ifdef CONFIG_FAIL_IOMMU
+	int fail_iommu;
+#endif
 };
 
 struct pdev_archdata {
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 7bc94da1a83..fbefe729df3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -34,6 +34,8 @@
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
 #include <linux/hash.h>
+#include <linux/fault-inject.h>
+#include <linux/pci.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -41,6 +43,7 @@
 #include <asm/machdep.h>
 #include <asm/kdump.h>
 #include <asm/fadump.h>
+#include <asm/vio.h>
 
 #define DBG(...)
@@ -79,6 +82,94 @@ static int __init setup_iommu_pool_hash(void)
 }
 subsys_initcall(setup_iommu_pool_hash);
 
+#ifdef CONFIG_FAIL_IOMMU
+
+static DECLARE_FAULT_ATTR(fail_iommu);
+
+static int __init setup_fail_iommu(char *str)
+{
+	return setup_fault_attr(&fail_iommu, str);
+}
+__setup("fail_iommu=", setup_fail_iommu);
+
+static bool should_fail_iommu(struct device *dev)
+{
+	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
+}
+
+static int __init fail_iommu_debugfs(void)
+{
+	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
+						       NULL, &fail_iommu);
+
+	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+}
+late_initcall(fail_iommu_debugfs);
+
+static ssize_t fail_iommu_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
+}
+
+static ssize_t fail_iommu_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	int i;
+
+	if (count > 0 && sscanf(buf, "%d", &i) > 0)
+		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
+
+	return count;
+}
+
+static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show,
+		   fail_iommu_store);
+
+static int fail_iommu_bus_notify(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		if (device_create_file(dev, &dev_attr_fail_iommu))
+			pr_warn("Unable to create IOMMU fault injection sysfs "
+				"entries\n");
+	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+		device_remove_file(dev, &dev_attr_fail_iommu);
+	}
+
+	return 0;
+}
+
+static struct notifier_block fail_iommu_bus_notifier = {
+	.notifier_call = fail_iommu_bus_notify
+};
+
+static int __init fail_iommu_setup(void)
+{
+#ifdef CONFIG_PCI
+	bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
+#endif
+#ifdef CONFIG_IBMVIO
+	bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
+#endif
+
+	return 0;
+}
+/*
+ * Must execute after PCI and VIO subsystem have initialised but before
+ * devices are probed.
+ */
+arch_initcall(fail_iommu_setup);
+#else
+static inline bool should_fail_iommu(struct device *dev)
+{
+	return false;
+}
+#endif
+
 static unsigned long iommu_range_alloc(struct device *dev,
 				       struct iommu_table *tbl,
 				       unsigned long npages,
@@ -107,6 +198,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
 		return DMA_ERROR_CODE;
 	}
 
+	if (should_fail_iommu(dev))
+		return DMA_ERROR_CODE;
+
 	/*
 	 * We don't need to disable preemption here because any CPU can
 	 * safely use any IOMMU pool.
--
cgit v1.2.3-70-g09d2
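The be3 failure above is exactly the class of driver bug this option
is designed to expose: an injected fault surfaces as an ordinary DMA
mapping failure, which a driver must detect with dma_mapping_error()
instead of handing the returned handle to hardware. A minimal sketch
of the pattern being exercised in a network driver's transmit path
(generic DMA API usage, not code from be2net):

    dma_addr_t handle;

    handle = dma_map_single(dev, skb->data, len, DMA_TO_DEVICE);
    if (dma_mapping_error(dev, handle)) {
        /* Mapping failed (here, possibly injected): drop the packet
         * rather than passing a bogus DMA address to the hardware. */
        dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
    }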
From dcd261ba58d7a49f4624b34b3bd2d535f63643a2 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 13 Jul 2012 17:45:49 +1000
Subject: powerpc/iommu: Fix iommu pool initialization

The iommu pool patch has a bug that causes a crash when only one pool
is in use (which is the case when the DMA window is under 1GB): both
the pool size calculation and the pool initialization loop used the
compile-time IOMMU_NR_POOLS constant instead of the runtime nr_pools
value.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/kernel/iommu.c')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index fbefe729df3..ff5a6ce027b 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -676,9 +676,9 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 		tbl->nr_pools = 1;
 
 	/* We reserve the top 1/4 of the table for large allocations */
-	tbl->poolsize = (tbl->it_size * 3 / 4) / IOMMU_NR_POOLS;
+	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
 
-	for (i = 0; i < IOMMU_NR_POOLS; i++) {
+	for (i = 0; i < tbl->nr_pools; i++) {
 		p = &tbl->pools[i];
 		spin_lock_init(&(p->lock));
 		p->start = tbl->poolsize * i;
--
cgit v1.2.3-70-g09d2
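To illustrate the fix with concrete numbers (arithmetic derived from
the code above, not from the commit message): take a table of
it_size = 1024 entries with a DMA window under 1GB, so nr_pools = 1.

    before: poolsize   = (1024 * 3 / 4) / IOMMU_NR_POOLS = 192
            pools[0]   = [0, 192)      <- the only pool allocations use
            large_pool = [768, 1024)   <- starts at poolsize * 4
            entries [192, 768) sit in pools that are never selected
            for allocation

    after:  poolsize   = (1024 * 3 / 4) / nr_pools = 768
            pools[0]   = [0, 768)
            large_pool = [768, 1024)

The loop bound fix matters for the same reason: large_pool.start is
set to poolsize * i after the loop, so running the loop exactly
nr_pools times keeps the small pools and the large pool contiguous.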