author	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-18 06:31:43 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-18 06:31:43 -0700
commit	0a95d92c0054e74fb79607ac2df958b7bf295706 (patch)
tree	e2c5f836e799dcfd72904949be47595af91432e7 /arch/powerpc/platforms/pseries
parent	08351fc6a75731226e1112fc7254542bd3a2912e (diff)
parent	831532035b12a5f7b600515a6f4da0b207b82d6e (diff)
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (62 commits)
powerpc/85xx: Fix signedness bug in cache-sram
powerpc/fsl: 85xx: document cache sram bindings
powerpc/fsl: define binding for fsl mpic interrupt controllers
powerpc/fsl_msi: Handle msi-available-ranges better
drivers/serial/ucc_uart.c: Add of_node_put to avoid memory leak
powerpc/85xx: Fix SPE float to integer conversion failure
powerpc/85xx: Update sata controller compatible for p1022ds board
ATA: Add FSL sata v2 controller support
powerpc/mpc8xxx_gpio: simplify searching for 'fsl, qoriq-gpio' compatible
powerpc/8xx: remove obsolete mgsuvd board
powerpc/82xx: rename and update mgcoge board support
powerpc/83xx: rename and update kmeter1
powerpc/85xx: Workaround e500 CPU erratum A005
powerpc/fsl_pci: Add support for FSL PCIe controllers v2.x
powerpc/85xx: Fix writing to spin table 'cpu-release-addr' on ppc64e
powerpc/pseries: Disable MSI using new interface if possible
powerpc: Enable GENERIC_HARDIRQS_NO_DEPRECATED.
powerpc: core irq_data conversion.
powerpc: sysdev/xilinx_intc irq_data conversion.
powerpc: sysdev/uic irq_data conversion.
...
Fix up conflicts in arch/powerpc/sysdev/fsl_msi.c (due to getting rid of
of_platform_driver in arch/powerpc)
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r--	arch/powerpc/platforms/pseries/cmm.c	14
-rw-r--r--	arch/powerpc/platforms/pseries/eeh.c	2
-rw-r--r--	arch/powerpc/platforms/pseries/iommu.c	587
-rw-r--r--	arch/powerpc/platforms/pseries/msi.c	14
-rw-r--r--	arch/powerpc/platforms/pseries/nvram.c	255
-rw-r--r--	arch/powerpc/platforms/pseries/setup.c	5
-rw-r--r--	arch/powerpc/platforms/pseries/xics.c	89
7 files changed, 849 insertions, 117 deletions
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index f4803868642..3cafc306b97 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -508,12 +508,7 @@ static int cmm_memory_isolate_cb(struct notifier_block *self,
 	if (action == MEM_ISOLATE_COUNT)
 		ret = cmm_count_pages(arg);
 
-	if (ret)
-		ret = notifier_from_errno(ret);
-	else
-		ret = NOTIFY_OK;
-
-	return ret;
+	return notifier_from_errno(ret);
 }
 
 static struct notifier_block cmm_mem_isolate_nb = {
@@ -635,12 +630,7 @@ static int cmm_memory_cb(struct notifier_block *self,
 		break;
 	}
 
-	if (ret)
-		ret = notifier_from_errno(ret);
-	else
-		ret = NOTIFY_OK;
-
-	return ret;
+	return notifier_from_errno(ret);
 }
 
 static struct notifier_block cmm_mem_nb = {
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 17a11c82e6f..3cc4d102b1f 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -876,7 +876,7 @@ void eeh_restore_bars(struct pci_dn *pdn)
  *
  * Save the values of the device bars. Unlike the restore
  * routine, this routine is *not* recursive. This is because
- * PCI devices are added individuallly; but, for the restore,
+ * PCI devices are added individually; but, for the restore,
  * an entire slot is reset at a time.
  */
 static void eeh_save_bars(struct pci_dn *pdn)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index edea60b7ee9..154c464cdca 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -33,6 +33,7 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/crash_dump.h>
+#include <linux/memory.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -45,6 +46,7 @@
 #include <asm/tce.h>
 #include <asm/ppc-pci.h>
 #include <asm/udbg.h>
+#include <asm/mmzone.h>
 
 #include "plpar_wrappers.h"
 
@@ -270,6 +272,152 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
 	return tce_ret;
 }
 
+/* this is compatable with cells for the device tree property */
+struct dynamic_dma_window_prop {
+	__be32	liobn;		/* tce table number */
+	__be64	dma_base;	/* address hi,lo */
+	__be32	tce_shift;	/* ilog2(tce_page_size) */
+	__be32	window_shift;	/* ilog2(tce_window_size) */
+};
+
+struct direct_window {
+	struct device_node *device;
+	const struct dynamic_dma_window_prop *prop;
+	struct list_head list;
+};
+
+/* Dynamic DMA Window support */
+struct ddw_query_response {
+	u32 windows_available;
+	u32 largest_available_block;
+	u32 page_size;
+	u32 migration_capable;
+};
+
+struct ddw_create_response {
+	u32 liobn;
+	u32 addr_hi;
+	u32 addr_lo;
+};
+
+static LIST_HEAD(direct_window_list);
+/* prevents races between memory on/offline and window creation */
+static DEFINE_SPINLOCK(direct_window_list_lock);
+/* protects initializing window twice for same device */
+static DEFINE_MUTEX(direct_window_init_mutex);
+#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+
+static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
+					unsigned long num_pfn, const void *arg)
+{
+	const struct dynamic_dma_window_prop *maprange = arg;
+	int rc;
+	u64 tce_size, num_tce, dma_offset, next;
+	u32 tce_shift;
+	long limit;
+
+	tce_shift = be32_to_cpu(maprange->tce_shift);
+	tce_size = 1ULL << tce_shift;
+	next = start_pfn << PAGE_SHIFT;
+	num_tce = num_pfn << PAGE_SHIFT;
+
+	/* round back to the beginning of the tce page size */
+	num_tce += next & (tce_size - 1);
+	next &= ~(tce_size - 1);
+
+	/* covert to number of tces */
+	num_tce |= tce_size - 1;
+	num_tce >>= tce_shift;
+
+	do {
+		/*
+		 * Set up the page with TCE data, looping through and setting
+		 * the values.
+		 */
+		limit = min_t(long, num_tce, 512);
+		dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+		rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
+					     dma_offset,
+					     0, limit);
+		num_tce -= limit;
+	} while (num_tce > 0 && !rc);
+
+	return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
+					unsigned long num_pfn, const void *arg)
+{
+	const struct dynamic_dma_window_prop *maprange = arg;
+	u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn;
+	u32 tce_shift;
+	u64 rc = 0;
+	long l, limit;
+
+	local_irq_disable();	/* to protect tcep and the page behind it */
+	tcep = __get_cpu_var(tce_page);
+
+	if (!tcep) {
+		tcep = (u64 *)__get_free_page(GFP_ATOMIC);
+		if (!tcep) {
+			local_irq_enable();
+			return -ENOMEM;
+		}
+		__get_cpu_var(tce_page) = tcep;
+	}
+
+	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
+
+	liobn = (u64)be32_to_cpu(maprange->liobn);
+	tce_shift = be32_to_cpu(maprange->tce_shift);
+	tce_size = 1ULL << tce_shift;
+	next = start_pfn << PAGE_SHIFT;
+	num_tce = num_pfn << PAGE_SHIFT;
+
+	/* round back to the beginning of the tce page size */
+	num_tce += next & (tce_size - 1);
+	next &= ~(tce_size - 1);
+
+	/* covert to number of tces */
+	num_tce |= tce_size - 1;
+	num_tce >>= tce_shift;
+
+	/* We can map max one pageful of TCEs at a time */
+	do {
+		/*
+		 * Set up the page with TCE data, looping through and setting
+		 * the values.
+		 */
+		limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
+		dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+		for (l = 0; l < limit; l++) {
+			tcep[l] = proto_tce | next;
+			next += tce_size;
+		}
+
+		rc = plpar_tce_put_indirect(liobn,
+					    dma_offset,
+					    (u64)virt_to_abs(tcep),
+					    limit);
+
+		num_tce -= limit;
+	} while (num_tce > 0 && !rc);
+
+	/* error cleanup: caller will clear whole range */
+
+	local_irq_enable();
+	return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
+		unsigned long num_pfn, void *arg)
+{
+	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
+}
+
+
 #ifdef CONFIG_PCI
 static void iommu_table_setparms(struct pci_controller *phb,
 				 struct device_node *dn,
@@ -495,6 +643,329 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		 pci_name(dev));
 }
 
+static int __read_mostly disable_ddw;
+
+static int __init disable_ddw_setup(char *str)
+{
+	disable_ddw = 1;
+	printk(KERN_INFO "ppc iommu: disabling ddw.\n");
+
+	return 0;
+}
+
+early_param("disable_ddw", disable_ddw_setup);
+
+static void remove_ddw(struct device_node *np)
+{
+	struct dynamic_dma_window_prop *dwp;
+	struct property *win64;
+	const u32 *ddr_avail;
+	u64 liobn;
+	int len, ret;
+
+	ddr_avail = of_get_property(np, "ibm,ddw-applicable", &len);
+	win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
+	if (!win64 || !ddr_avail || len < 3 * sizeof(u32))
+		return;
+
+	dwp = win64->value;
+	liobn = (u64)be32_to_cpu(dwp->liobn);
+
+	/* clear the whole window, note the arg is in kernel pages */
+	ret = tce_clearrange_multi_pSeriesLP(0,
+		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
+	if (ret)
+		pr_warning("%s failed to clear tces in window.\n",
+			 np->full_name);
+	else
+		pr_debug("%s successfully cleared tces in window.\n",
+			 np->full_name);
+
+	ret = rtas_call(ddr_avail[2], 1, 1, NULL, liobn);
+	if (ret)
+		pr_warning("%s: failed to remove direct window: rtas returned "
+			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
+			np->full_name, ret, ddr_avail[2], liobn);
+	else
+		pr_debug("%s: successfully removed direct window: rtas returned "
+			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
+			np->full_name, ret, ddr_avail[2], liobn);
+}
+
+
+static int dupe_ddw_if_already_created(struct pci_dev *dev, struct device_node *pdn)
+{
+	struct device_node *dn;
+	struct pci_dn *pcidn;
+	struct direct_window *window;
+	const struct dynamic_dma_window_prop *direct64;
+	u64 dma_addr = 0;
+
+	dn = pci_device_to_OF_node(dev);
+	pcidn = PCI_DN(dn);
+	spin_lock(&direct_window_list_lock);
+	/* check if we already created a window and dupe that config if so */
+	list_for_each_entry(window, &direct_window_list, list) {
+		if (window->device == pdn) {
+			direct64 = window->prop;
+			dma_addr = direct64->dma_base;
+			break;
+		}
+	}
+	spin_unlock(&direct_window_list_lock);
+
+	return dma_addr;
+}
+
+static u64 dupe_ddw_if_kexec(struct pci_dev *dev, struct device_node *pdn)
+{
+	struct device_node *dn;
+	struct pci_dn *pcidn;
+	int len;
+	struct direct_window *window;
+	const struct dynamic_dma_window_prop *direct64;
+	u64 dma_addr = 0;
+
+	dn = pci_device_to_OF_node(dev);
+	pcidn = PCI_DN(dn);
+	direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+	if (direct64) {
+		window = kzalloc(sizeof(*window), GFP_KERNEL);
+		if (!window) {
+			remove_ddw(pdn);
+		} else {
+			window->device = pdn;
+			window->prop = direct64;
+			spin_lock(&direct_window_list_lock);
+			list_add(&window->list, &direct_window_list);
+			spin_unlock(&direct_window_list_lock);
+			dma_addr = direct64->dma_base;
+		}
+	}
+
+	return dma_addr;
+}
+
+static int query_ddw(struct pci_dev *dev, const u32 *ddr_avail,
+			struct ddw_query_response *query)
+{
+	struct device_node *dn;
+	struct pci_dn *pcidn;
+	u32 cfg_addr;
+	u64 buid;
+	int ret;
+
+	/*
+	 * Get the config address and phb buid of the PE window.
+	 * Rely on eeh to retrieve this for us.
+	 * Retrieve them from the pci device, not the node with the
+	 * dma-window property
+	 */
+	dn = pci_device_to_OF_node(dev);
+	pcidn = PCI_DN(dn);
+	cfg_addr = pcidn->eeh_config_addr;
+	if (pcidn->eeh_pe_config_addr)
+		cfg_addr = pcidn->eeh_pe_config_addr;
+	buid = pcidn->phb->buid;
+	ret = rtas_call(ddr_avail[0], 3, 5, (u32 *)query,
+		  cfg_addr, BUID_HI(buid), BUID_LO(buid));
+	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
+		" returned %d\n", ddr_avail[0], cfg_addr, BUID_HI(buid),
+		BUID_LO(buid), ret);
+	return ret;
+}
+
+static int create_ddw(struct pci_dev *dev, const u32 *ddr_avail,
+			struct ddw_create_response *create, int page_shift,
+			int window_shift)
+{
+	struct device_node *dn;
+	struct pci_dn *pcidn;
+	u32 cfg_addr;
+	u64 buid;
+	int ret;
+
+	/*
+	 * Get the config address and phb buid of the PE window.
+	 * Rely on eeh to retrieve this for us.
+	 * Retrieve them from the pci device, not the node with the
+	 * dma-window property
+	 */
+	dn = pci_device_to_OF_node(dev);
+	pcidn = PCI_DN(dn);
+	cfg_addr = pcidn->eeh_config_addr;
+	if (pcidn->eeh_pe_config_addr)
+		cfg_addr = pcidn->eeh_pe_config_addr;
+	buid = pcidn->phb->buid;
+
+	do {
+		/* extra outputs are LIOBN and dma-addr (hi, lo) */
+		ret = rtas_call(ddr_avail[1], 5, 4, (u32 *)create, cfg_addr,
+				BUID_HI(buid), BUID_LO(buid), page_shift, window_shift);
+	} while (rtas_busy_delay(ret));
+	dev_info(&dev->dev,
+		"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
+		"(liobn = 0x%x starting addr = %x %x)\n", ddr_avail[1],
+		 cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
+		 window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
+
+	return ret;
+}
+
+/*
+ * If the PE supports dynamic dma windows, and there is space for a table
+ * that can map all pages in a linear offset, then setup such a table,
+ * and record the dma-offset in the struct device.
+ *
+ * dev: the pci device we are checking
+ * pdn: the parent pe node with the ibm,dma_window property
+ * Future: also check if we can remap the base window for our base page size
+ *
+ * returns the dma offset for use by dma_set_mask
+ */
+static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+{
+	int len, ret;
+	struct ddw_query_response query;
+	struct ddw_create_response create;
+	int page_shift;
+	u64 dma_addr, max_addr;
+	struct device_node *dn;
+	const u32 *uninitialized_var(ddr_avail);
+	struct direct_window *window;
+	struct property *uninitialized_var(win64);
+	struct dynamic_dma_window_prop *ddwprop;
+
+	mutex_lock(&direct_window_init_mutex);
+
+	dma_addr = dupe_ddw_if_already_created(dev, pdn);
+	if (dma_addr != 0)
+		goto out_unlock;
+
+	dma_addr = dupe_ddw_if_kexec(dev, pdn);
+	if (dma_addr != 0)
+		goto out_unlock;
+
+	/*
+	 * the ibm,ddw-applicable property holds the tokens for:
+	 * ibm,query-pe-dma-window
+	 * ibm,create-pe-dma-window
+	 * ibm,remove-pe-dma-window
+	 * for the given node in that order.
+	 * the property is actually in the parent, not the PE
+	 */
+	ddr_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);
+	if (!ddr_avail || len < 3 * sizeof(u32))
+		goto out_unlock;
+
+	/*
+	 * Query if there is a second window of size to map the
+	 * whole partition. Query returns number of windows, largest
+	 * block assigned to PE (partition endpoint), and two bitmasks
+	 * of page sizes: supported and supported for migrate-dma.
+	 */
+	dn = pci_device_to_OF_node(dev);
+	ret = query_ddw(dev, ddr_avail, &query);
+	if (ret != 0)
+		goto out_unlock;
+
+	if (query.windows_available == 0) {
+		/*
+		 * no additional windows are available for this device.
+		 * We might be able to reallocate the existing window,
+		 * trading in for a larger page size.
+		 */
+		dev_dbg(&dev->dev, "no free dynamic windows");
+		goto out_unlock;
+	}
+	if (query.page_size & 4) {
+		page_shift = 24; /* 16MB */
+	} else if (query.page_size & 2) {
+		page_shift = 16; /* 64kB */
+	} else if (query.page_size & 1) {
+		page_shift = 12; /* 4kB */
+	} else {
+		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
+			  query.page_size);
+		goto out_unlock;
+	}
+	/* verify the window * number of ptes will map the partition */
+	/* check largest block * page size > max memory hotplug addr */
+	max_addr = memory_hotplug_max();
+	if (query.largest_available_block < (max_addr >> page_shift)) {
+		dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
+			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
+			  1ULL << page_shift);
+		goto out_unlock;
+	}
+	len = order_base_2(max_addr);
+	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
+	if (!win64) {
+		dev_info(&dev->dev,
+			"couldn't allocate property for 64bit dma window\n");
+		goto out_unlock;
+	}
+	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
+	if (!win64->name || !win64->value) {
+		dev_info(&dev->dev,
+			"couldn't allocate property name and value\n");
+		goto out_free_prop;
+	}
+
+	ret = create_ddw(dev, ddr_avail, &create, page_shift, len);
+	if (ret != 0)
+		goto out_free_prop;
+
+	ddwprop->liobn = cpu_to_be32(create.liobn);
+	ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));
+	ddwprop->tce_shift = cpu_to_be32(page_shift);
+	ddwprop->window_shift = cpu_to_be32(len);
+
+	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",
+		  create.liobn, dn->full_name);
+
+	window = kzalloc(sizeof(*window), GFP_KERNEL);
+	if (!window)
+		goto out_clear_window;
+
+	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
+			win64->value, tce_setrange_multi_pSeriesLP_walk);
+	if (ret) {
+		dev_info(&dev->dev, "failed to map direct window for %s: %d\n",
+			 dn->full_name, ret);
+		goto out_clear_window;
+	}
+
+	ret = prom_add_property(pdn, win64);
+	if (ret) {
+		dev_err(&dev->dev, "unable to add dma window property for %s: %d",
+			 pdn->full_name, ret);
+		goto out_clear_window;
+	}
+
+	window->device = pdn;
+	window->prop = ddwprop;
+	spin_lock(&direct_window_list_lock);
+	list_add(&window->list, &direct_window_list);
+	spin_unlock(&direct_window_list_lock);
+
+	dma_addr = of_read_number(&create.addr_hi, 2);
+	goto out_unlock;
+
+out_clear_window:
+	remove_ddw(pdn);
+
+out_free_prop:
+	kfree(win64->name);
+	kfree(win64->value);
+	kfree(win64);
+
+out_unlock:
+	mutex_unlock(&direct_window_init_mutex);
+	return dma_addr;
+}
+
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 {
 	struct device_node *pdn, *dn;
@@ -541,23 +1012,137 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	set_iommu_table_base(&dev->dev, pci->iommu_table);
 }
 
+static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
+{
+	bool ddw_enabled = false;
+	struct device_node *pdn, *dn;
+	struct pci_dev *pdev;
+	const void *dma_window = NULL;
+	u64 dma_offset;
+
+	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+		return -EIO;
+
+	/* only attempt to use a new window if 64-bit DMA is requested */
+	if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
+		pdev = to_pci_dev(dev);
+
+		dn = pci_device_to_OF_node(pdev);
+		dev_dbg(dev, "node is %s\n", dn->full_name);
+
+		/*
+		 * the device tree might contain the dma-window properties
+		 * per-device and not neccesarily for the bus. So we need to
+		 * search upwards in the tree until we either hit a dma-window
+		 * property, OR find a parent with a table already allocated.
+		 */
+		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+				pdn = pdn->parent) {
+			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+			if (dma_window)
+				break;
+		}
+		if (pdn && PCI_DN(pdn)) {
+			dma_offset = enable_ddw(pdev, pdn);
+			if (dma_offset != 0) {
+				dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
+				set_dma_offset(dev, dma_offset);
+				set_dma_ops(dev, &dma_direct_ops);
+				ddw_enabled = true;
+			}
+		}
+	}
+
+	/* fall-through to iommu ops */
+	if (!ddw_enabled) {
+		dev_info(dev, "Using 32-bit DMA via iommu\n");
+		set_dma_ops(dev, &dma_iommu_ops);
+	}
+
+	*dev->dma_mask = dma_mask;
+	return 0;
+}
+
 #else  /* CONFIG_PCI */
 #define pci_dma_bus_setup_pSeries	NULL
 #define pci_dma_dev_setup_pSeries	NULL
 #define pci_dma_bus_setup_pSeriesLP	NULL
 #define pci_dma_dev_setup_pSeriesLP	NULL
+#define dma_set_mask_pSeriesLP		NULL
 #endif /* !CONFIG_PCI */
 
+static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
+		void *data)
+{
+	struct direct_window *window;
+	struct memory_notify *arg = data;
+	int ret = 0;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		spin_lock(&direct_window_list_lock);
+		list_for_each_entry(window, &direct_window_list, list) {
+			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
+					arg->nr_pages, window->prop);
+			/* XXX log error */
+		}
+		spin_unlock(&direct_window_list_lock);
+		break;
+	case MEM_CANCEL_ONLINE:
+	case MEM_OFFLINE:
+		spin_lock(&direct_window_list_lock);
+		list_for_each_entry(window, &direct_window_list, list) {
+			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
+					arg->nr_pages, window->prop);
+			/* XXX log error */
+		}
+		spin_unlock(&direct_window_list_lock);
+		break;
+	default:
+		break;
+	}
+	if (ret && action != MEM_CANCEL_ONLINE)
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block iommu_mem_nb = {
+	.notifier_call = iommu_mem_notifier,
+};
+
 static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
 {
 	int err = NOTIFY_OK;
 	struct device_node *np = node;
 	struct pci_dn *pci = PCI_DN(np);
+	struct direct_window *window;
 
 	switch (action) {
 	case PSERIES_RECONFIG_REMOVE:
 		if (pci && pci->iommu_table)
 			iommu_free_table(pci->iommu_table, np->full_name);
+
+		spin_lock(&direct_window_list_lock);
+		list_for_each_entry(window, &direct_window_list, list) {
+			if (window->device == np) {
+				list_del(&window->list);
+				kfree(window);
+				break;
+			}
+		}
+		spin_unlock(&direct_window_list_lock);
+
+		/*
+		 * Because the notifier runs after isolation of the
+		 * slot, we are guaranteed any DMA window has already
+		 * been revoked and the TCEs have been marked invalid,
+		 * so we don't need a call to remove_ddw(np). However,
+		 * if an additional notifier action is added before the
+		 * isolate call, we should update this code for
+		 * completeness with such a call.
+		 */
 		break;
 	default:
 		err = NOTIFY_DONE;
@@ -587,6 +1172,7 @@ void iommu_init_early_pSeries(void)
 		ppc_md.tce_get = tce_get_pSeriesLP;
 		ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
+		ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
 	} else {
 		ppc_md.tce_build = tce_build_pSeries;
 		ppc_md.tce_free = tce_free_pSeries;
@@ -597,6 +1183,7 @@ void iommu_init_early_pSeries(void)
 
 	pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
+	register_memory_notifier(&iommu_mem_nb);
 
 	set_pci_dma_ops(&dma_iommu_ops);
 }
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 1164c3430f2..18ac801f8e9 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -93,8 +93,18 @@ static void rtas_disable_msi(struct pci_dev *pdev)
 	if (!pdn)
 		return;
 
-	if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0)
-		pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+	/*
+	 * disabling MSI with the explicit interface also disables MSI-X
+	 */
+	if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) {
+		/*
+		 * may have failed because explicit interface is not
+		 * present
+		 */
+		if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) {
+			pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+		}
+	}
 }
 
 static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 7e828ba29bc..419707b0724 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -16,6 +16,8 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/kmsg_dump.h>
 #include <asm/uaccess.h>
 #include <asm/nvram.h>
 #include <asm/rtas.h>
@@ -30,17 +32,54 @@ static int nvram_fetch, nvram_store;
 static char nvram_buf[NVRW_CNT];	/* assume this is in the first 4GB */
 static DEFINE_SPINLOCK(nvram_lock);
 
-static long nvram_error_log_index = -1;
-static long nvram_error_log_size = 0;
-
 struct err_log_info {
 	int error_type;
 	unsigned int seq_num;
 };
-#define NVRAM_MAX_REQ		2079
-#define NVRAM_MIN_REQ		1055
-#define NVRAM_LOG_PART_NAME	"ibm,rtas-log"
+
+struct nvram_os_partition {
+	const char *name;
+	int req_size;	/* desired size, in bytes */
+	int min_size;	/* minimum acceptable size (0 means req_size) */
+	long size;	/* size of data portion (excluding err_log_info) */
+	long index;	/* offset of data portion of partition */
+};
+
+static struct nvram_os_partition rtas_log_partition = {
+	.name = "ibm,rtas-log",
+	.req_size = 2079,
+	.min_size = 1055,
+	.index = -1
+};
+
+static struct nvram_os_partition oops_log_partition = {
+	.name = "lnx,oops-log",
+	.req_size = 4000,
+	.min_size = 2000,
+	.index = -1
+};
+
+static const char *pseries_nvram_os_partitions[] = {
+	"ibm,rtas-log",
+	"lnx,oops-log",
+	NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+		enum kmsg_dump_reason reason,
+		const char *old_msgs, unsigned long old_len,
+		const char *new_msgs, unsigned long new_len);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+	.dump = oops_to_nvram
+};
+
+/* See clobbering_unread_rtas_event() */
+#define NVRAM_RTAS_READ_TIMEOUT 5		/* seconds */
+static unsigned long last_unread_rtas_event;	/* timestamp */
+
+/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */
+static char *oops_buf;
 
 static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
 {
@@ -134,7 +173,7 @@ static ssize_t pSeries_nvram_get_size(void)
 }
 
 
-/* nvram_write_error_log
+/* nvram_write_os_partition, nvram_write_error_log
  *
  * We need to buffer the error logs into nvram to ensure that we have
  * the failure information to decode.  If we have a severe error there
@@ -156,48 +195,58 @@ static ssize_t pSeries_nvram_get_size(void)
  * The 'data' section would look like (in bytes):
  * +--------------+------------+-----------------------------------+
  * | event_logged | sequence # | error log                         |
- * |0            3|4          7|8            nvram_error_log_size-1|
+ * |0            3|4          7|8                  error_log_size-1|
  * +--------------+------------+-----------------------------------+
  *
  * event_logged: 0 if event has not been logged to syslog, 1 if it has
  * sequence #: The unique sequence # for each event. (until it wraps)
  * error log: The error log from event_scan
  */
-int nvram_write_error_log(char * buff, int length,
-                          unsigned int err_type, unsigned int error_log_cnt)
+int nvram_write_os_partition(struct nvram_os_partition *part, char * buff,
+		int length, unsigned int err_type, unsigned int error_log_cnt)
 {
 	int rc;
 	loff_t tmp_index;
 	struct err_log_info info;
 	
-	if (nvram_error_log_index == -1) {
+	if (part->index == -1) {
 		return -ESPIPE;
 	}
 
-	if (length > nvram_error_log_size) {
-		length = nvram_error_log_size;
+	if (length > part->size) {
+		length = part->size;
 	}
 
 	info.error_type = err_type;
 	info.seq_num = error_log_cnt;
 
-	tmp_index = nvram_error_log_index;
+	tmp_index = part->index;
 
 	rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
 	if (rc <= 0) {
-		printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
+		pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
 		return rc;
 	}
 
 	rc = ppc_md.nvram_write(buff, length, &tmp_index);
 	if (rc <= 0) {
-		printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
+		pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
 		return rc;
 	}
 	
 	return 0;
 }
 
+int nvram_write_error_log(char * buff, int length,
+                          unsigned int err_type, unsigned int error_log_cnt)
+{
+	int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
+						err_type, error_log_cnt);
+	if (!rc)
+		last_unread_rtas_event = get_seconds();
+	return rc;
+}
+
 /* nvram_read_error_log
  *
  * Reads nvram for error log for at most 'length'
@@ -209,13 +258,13 @@ int nvram_read_error_log(char * buff, int length,
 	loff_t tmp_index;
 	struct err_log_info info;
 	
-	if (nvram_error_log_index == -1)
+	if (rtas_log_partition.index == -1)
 		return -1;
 
-	if (length > nvram_error_log_size)
-		length = nvram_error_log_size;
+	if (length > rtas_log_partition.size)
+		length = rtas_log_partition.size;
 
-	tmp_index = nvram_error_log_index;
+	tmp_index = rtas_log_partition.index;
 
 	rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
 	if (rc <= 0) {
@@ -244,37 +293,40 @@ int nvram_clear_error_log(void)
 	int clear_word = ERR_FLAG_ALREADY_LOGGED;
 	int rc;
 
-	if (nvram_error_log_index == -1)
+	if (rtas_log_partition.index == -1)
 		return -1;
 
-	tmp_index = nvram_error_log_index;
+	tmp_index = rtas_log_partition.index;
 	
 	rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
 	if (rc <= 0) {
 		printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
 		return rc;
 	}
+	last_unread_rtas_event = 0;
 
 	return 0;
 }
 
-/* pseries_nvram_init_log_partition
+/* pseries_nvram_init_os_partition
  *
- * This will setup the partition we need for buffering the
- * error logs and cleanup partitions if needed.
+ * This sets up a partition with an "OS" signature.
  *
  * The general strategy is the following:
- * 1.) If there is log partition large enough then use it.
- * 2.) If there is none large enough, search
- * for a free partition that is large enough.
- * 3.) If there is not a free partition large enough remove
- * _all_ OS partitions and consolidate the space.
- * 4.) Will first try getting a chunk that will satisfy the maximum
- * error log size (NVRAM_MAX_REQ).
- * 5.) If the max chunk cannot be allocated then try finding a chunk
- * that will satisfy the minum needed (NVRAM_MIN_REQ).
+ * 1.) If a partition with the indicated name already exists...
+ *	- If it's large enough, use it.
+ *	- Otherwise, recycle it and keep going.
+ * 2.) Search for a free partition that is large enough.
+ * 3.) If there's not a free partition large enough, recycle any obsolete
+ * OS partitions and try again.
+ * 4.) Will first try getting a chunk that will satisfy the requested size.
+ * 5.) If a chunk of the requested size cannot be allocated, then try finding
+ * a chunk that will satisfy the minum needed.
+ *
+ * Returns 0 on success, else -1.
  */
-static int __init pseries_nvram_init_log_partition(void)
+static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
+							*part)
 {
 	loff_t p;
 	int size;
@@ -282,47 +334,76 @@ static int __init pseries_nvram_init_log_partition(void)
 	/* Scan nvram for partitions */
 	nvram_scan_partitions();
 
-	/* Lookg for ours */
-	p = nvram_find_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS, &size);
+	/* Look for ours */
+	p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
 
 	/* Found one but too small, remove it */
-	if (p && size < NVRAM_MIN_REQ) {
-		pr_info("nvram: Found too small "NVRAM_LOG_PART_NAME" partition"
-			",removing it...");
-		nvram_remove_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS);
+	if (p && size < part->min_size) {
+		pr_info("nvram: Found too small %s partition,"
+					" removing it...\n", part->name);
+		nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
 		p = 0;
 	}
 
 	/* Create one if we didn't find */
 	if (!p) {
-		p = nvram_create_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS,
-					   NVRAM_MAX_REQ, NVRAM_MIN_REQ);
-		/* No room for it, try to get rid of any OS partition
-		 * and try again
-		 */
+		p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
 		if (p == -ENOSPC) {
-			pr_info("nvram: No room to create "NVRAM_LOG_PART_NAME
-				" partition, deleting all OS partitions...");
-			nvram_remove_partition(NULL, NVRAM_SIG_OS);
-			p = nvram_create_partition(NVRAM_LOG_PART_NAME,
-						   NVRAM_SIG_OS, NVRAM_MAX_REQ,
-						   NVRAM_MIN_REQ);
+			pr_info("nvram: No room to create %s partition, "
+				"deleting any obsolete OS partitions...\n",
+				part->name);
+			nvram_remove_partition(NULL, NVRAM_SIG_OS,
+					pseries_nvram_os_partitions);
+			p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
 		}
 	}
 
 	if (p <= 0) {
-		pr_err("nvram: Failed to find or create "NVRAM_LOG_PART_NAME
-		       " partition, err %d\n", (int)p);
-		return 0;
+		pr_err("nvram: Failed to find or create %s"
+		       " partition, err %d\n", part->name, (int)p);
+		return -1;
 	}
 
-	nvram_error_log_index = p;
-	nvram_error_log_size = nvram_get_partition_size(p) -
-		sizeof(struct err_log_info);
+	part->index = p;
+	part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
 	
 	return 0;
 }
-machine_arch_initcall(pseries, pseries_nvram_init_log_partition);
+
+static void __init nvram_init_oops_partition(int rtas_partition_exists)
+{
+	int rc;
+
+	rc = pseries_nvram_init_os_partition(&oops_log_partition);
+	if (rc != 0) {
+		if (!rtas_partition_exists)
+			return;
+		pr_notice("nvram: Using %s partition to log both"
+			" RTAS errors and oops/panic reports\n",
+			rtas_log_partition.name);
+		memcpy(&oops_log_partition, &rtas_log_partition,
+			sizeof(rtas_log_partition));
+	}
+	oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+	rc = kmsg_dump_register(&nvram_kmsg_dumper);
+	if (rc != 0) {
+		pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
+		kfree(oops_buf);
+		return;
+	}
+}
+
+static int __init pseries_nvram_init_log_partitions(void)
+{
+	int rc;
+
+	rc = pseries_nvram_init_os_partition(&rtas_log_partition);
+	nvram_init_oops_partition(rc == 0);
+	return 0;
+}
+machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
 
 int __init pSeries_nvram_init(void)
 {
@@ -353,3 +434,59 @@ int __init pSeries_nvram_init(void)
 
 	return 0;
 }
+
+/*
+ * Try to capture the last capture_len bytes of the printk buffer.  Return
+ * the amount actually captured.
+ */
+static size_t capture_last_msgs(const char *old_msgs, size_t old_len,
+				const char *new_msgs, size_t new_len,
+				char *captured, size_t capture_len)
+{
+	if (new_len >= capture_len) {
+		memcpy(captured, new_msgs + (new_len - capture_len),
+								capture_len);
+		return capture_len;
+	} else {
+		/* Grab the end of old_msgs. */
+		size_t old_tail_len = min(old_len, capture_len - new_len);
+		memcpy(captured, old_msgs + (old_len - old_tail_len),
+								old_tail_len);
+		memcpy(captured + old_tail_len, new_msgs, new_len);
+		return old_tail_len + new_len;
+	}
+}
+
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process?  Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+static int clobbering_unread_rtas_event(void)
+{
+	return (oops_log_partition.index == rtas_log_partition.index
+		&& last_unread_rtas_event
+		&& get_seconds() - last_unread_rtas_event <=
+						NVRAM_RTAS_READ_TIMEOUT);
+}
+
+/* our kmsg_dump callback */
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+		enum kmsg_dump_reason reason,
+		const char *old_msgs, unsigned long old_len,
+		const char *new_msgs, unsigned long new_len)
+{
+	static unsigned int oops_count = 0;
+	size_t text_len;
+
+	if (clobbering_unread_rtas_event())
+		return;
+
+	text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len,
+					oops_buf, oops_log_partition.size);
+	(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
+		(int) text_len, ERR_TYPE_KERNEL_PANIC, ++oops_count);
+}
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index d345bfd56bb..2a0089a2c82 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -114,10 +114,13 @@ static void __init fwnmi_init(void)
 
 static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc)
 {
+	struct irq_chip *chip = get_irq_desc_chip(desc);
 	unsigned int cascade_irq = i8259_irq();
+
 	if (cascade_irq != NO_IRQ)
 		generic_handle_irq(cascade_irq);
-	desc->chip->eoi(irq);
+
+	chip->irq_eoi(&desc->irq_data);
 }
 
 static void __init pseries_setup_i8259_cascade(void)
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 7b96e5a270c..01fea46c033 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -202,20 +202,20 @@ static int get_irq_server(unsigned int virq, const struct cpumask *cpumask,
 #define get_irq_server(virq, cpumask, strict_check) (default_server)
 #endif
 
-static void xics_unmask_irq(unsigned int virq)
+static void xics_unmask_irq(struct irq_data *d)
 {
 	unsigned int irq;
 	int call_status;
 	int server;
 
-	pr_devel("xics: unmask virq %d\n", virq);
+	pr_devel("xics: unmask virq %d\n", d->irq);
 
-	irq = (unsigned int)irq_map[virq].hwirq;
+	irq = (unsigned int)irq_map[d->irq].hwirq;
 	pr_devel(" -> map to hwirq 0x%x\n", irq);
 	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
 		return;
 
-	server = get_irq_server(virq, irq_to_desc(virq)->affinity, 0);
+	server = get_irq_server(d->irq, d->affinity, 0);
 
 	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server,
 				DEFAULT_PRIORITY);
@@ -235,61 +235,61 @@ static void xics_unmask_irq(unsigned int virq)
 	}
 }
 
-static unsigned int xics_startup(unsigned int virq)
+static unsigned int xics_startup(struct irq_data *d)
 {
 	/*
 	 * The generic MSI code returns with the interrupt disabled on the
 	 * card, using the MSI mask bits. Firmware doesn't appear to unmask
 	 * at that level, so we do it here by hand.
 	 */
-	if (irq_to_desc(virq)->msi_desc)
-		unmask_msi_irq(irq_get_irq_data(virq));
+	if (d->msi_desc)
+		unmask_msi_irq(d);
 
 	/* unmask it */
-	xics_unmask_irq(virq);
+	xics_unmask_irq(d);
 	return 0;
 }
 
-static void xics_mask_real_irq(unsigned int irq)
+static void xics_mask_real_irq(struct irq_data *d)
 {
 	int call_status;
 
-	if (irq == XICS_IPI)
+	if (d->irq == XICS_IPI)
 		return;
 
-	call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq);
+	call_status = rtas_call(ibm_int_off, 1, 1, NULL, d->irq);
 	if (call_status != 0) {
 		printk(KERN_ERR "%s: ibm_int_off irq=%u returned %d\n",
-			__func__, irq, call_status);
+			__func__, d->irq, call_status);
 		return;
 	}
 
 	/* Have to set XIVE to 0xff to be able to remove a slot */
-	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq,
+	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, d->irq,
				default_server, 0xff);
 	if (call_status != 0) {
 		printk(KERN_ERR "%s: ibm_set_xive(0xff) irq=%u returned %d\n",
-			__func__, irq, call_status);
+			__func__, d->irq, call_status);
 		return;
 	}
 }
 
-static void xics_mask_irq(unsigned int virq)
+static void xics_mask_irq(struct irq_data *d)
 {
 	unsigned int irq;
 
-	pr_devel("xics: mask virq %d\n", virq);
+	pr_devel("xics: mask virq %d\n", d->irq);
 
-	irq = (unsigned int)irq_map[virq].hwirq;
+	irq = (unsigned int)irq_map[d->irq].hwirq;
 	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
 		return;
-	xics_mask_real_irq(irq);
+	xics_mask_real_irq(d);
 }
 
 static void xics_mask_unknown_vec(unsigned int vec)
 {
 	printk(KERN_ERR "Interrupt %u (real) is invalid, disabling it.\n", vec);
-	xics_mask_real_irq(vec);
+	xics_mask_real_irq(irq_get_irq_data(vec));
 }
 
 static inline unsigned int xics_xirr_vector(unsigned int xirr)
@@ -371,30 +371,31 @@ static unsigned char pop_cppr(void)
 	return os_cppr->stack[--os_cppr->index];
 }
 
-static void xics_eoi_direct(unsigned int virq)
+static void xics_eoi_direct(struct irq_data *d)
 {
-	unsigned int irq = (unsigned int)irq_map[virq].hwirq;
+	unsigned int irq = (unsigned int)irq_map[d->irq].hwirq;
 
 	iosync();
 	direct_xirr_info_set((pop_cppr() << 24) | irq);
 }
 
-static void xics_eoi_lpar(unsigned int virq)
+static void xics_eoi_lpar(struct irq_data *d)
 {
-	unsigned int irq = (unsigned int)irq_map[virq].hwirq;
+	unsigned int irq = (unsigned int)irq_map[d->irq].hwirq;
 
 	iosync();
 	lpar_xirr_info_set((pop_cppr() << 24) | irq);
 }
 
-static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
+static int
+xics_set_affinity(struct irq_data *d, const struct cpumask *cpumask, bool force)
 {
 	unsigned int irq;
 	int status;
 	int xics_status[2];
 	int irq_server;
 
-	irq = (unsigned int)irq_map[virq].hwirq;
+	irq = (unsigned int)irq_map[d->irq].hwirq;
 	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
 		return -1;
 
@@ -406,13 +407,13 @@ static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 		return -1;
 	}
 
-	irq_server = get_irq_server(virq, cpumask, 1);
+	irq_server = get_irq_server(d->irq, cpumask, 1);
 	if (irq_server == -1) {
 		char cpulist[128];
 		cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask);
 		printk(KERN_WARNING
			"%s: No online cpus in the mask %s for irq %d\n",
-			__func__, cpulist, virq);
+			__func__, cpulist, d->irq);
 		return -1;
 	}
 
@@ -430,20 +431,20 @@ static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 
 static struct irq_chip xics_pic_direct = {
 	.name = "XICS",
-	.startup = xics_startup,
-	.mask = xics_mask_irq,
-	.unmask = xics_unmask_irq,
-	.eoi = xics_eoi_direct,
-	.set_affinity = xics_set_affinity
+	.irq_startup = xics_startup,
+	.irq_mask = xics_mask_irq,
+	.irq_unmask = xics_unmask_irq,
+	.irq_eoi = xics_eoi_direct,
+	.irq_set_affinity = xics_set_affinity
 };
 
 static struct irq_chip xics_pic_lpar = {
 	.name = "XICS",
-	.startup = xics_startup,
-	.mask = xics_mask_irq,
-	.unmask = xics_unmask_irq,
-	.eoi = xics_eoi_lpar,
-	.set_affinity = xics_set_affinity
+	.irq_startup = xics_startup,
+	.irq_mask = xics_mask_irq,
+	.irq_unmask = xics_unmask_irq,
+	.irq_eoi = xics_eoi_lpar,
+	.irq_set_affinity = xics_set_affinity
 };
 
@@ -890,6 +891,7 @@ void xics_migrate_irqs_away(void)
 	for_each_irq(virq) {
 		struct irq_desc *desc;
+		struct irq_chip *chip;
 		int xics_status[2];
 		int status;
 		unsigned long flags;
@@ -903,12 +905,15 @@ void xics_migrate_irqs_away(void)
 		/* We need to get IPIs still. */
 		if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
 			continue;
+
 		desc = irq_to_desc(virq);
 
 		/* We only need to migrate enabled IRQS */
-		if (desc == NULL || desc->chip == NULL
-		    || desc->action == NULL
-		    || desc->chip->set_affinity == NULL)
+		if (desc == NULL || desc->action == NULL)
+			continue;
+
+		chip = get_irq_desc_chip(desc);
+		if (chip == NULL || chip->irq_set_affinity == NULL)
 			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
@@ -934,8 +939,8 @@ void xics_migrate_irqs_away(void)
 			virq, cpu);
 
 		/* Reset affinity to all cpus */
-		cpumask_setall(irq_to_desc(virq)->affinity);
-		desc->chip->set_affinity(virq, cpu_all_mask);
+		cpumask_setall(desc->irq_data.affinity);
+		chip->irq_set_affinity(&desc->irq_data, cpu_all_mask, true);
 unlock:
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 	}
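
For context: nothing in this merge changes driver code; the new 64-bit direct-window (DDW) path in iommu.c is reached through the generic DMA mask API that dma_set_mask_pSeriesLP() now backs. A minimal, hypothetical driver fragment (example_probe and its device are illustrative, not part of the merge) sketching how the path is triggered:

	/*
	 * Hypothetical illustration only -- not part of the merge above.
	 * A PCI driver requests a full 64-bit DMA mask in its probe routine;
	 * on pSeries this now lands in dma_set_mask_pSeriesLP(), which tries
	 * enable_ddw() and otherwise sets up 32-bit DMA through the iommu.
	 */
	#include <linux/pci.h>
	#include <linux/dma-mapping.h>

	static int example_probe(struct pci_dev *pdev,
				 const struct pci_device_id *id)
	{
		int ret;

		ret = pci_enable_device(pdev);
		if (ret)
			return ret;

		/* Only the exact DMA_BIT_MASK(64) mask makes DDW eligible. */
		if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) {
			/* 64-bit unsupported; fall back to a 32-bit mask. */
			ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
			if (ret)
				return ret;
		}

		return 0;
	}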