diff options
author | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2012-01-27 11:14:02 -0500 |
---|---|---|
committer | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2012-01-27 11:14:02 -0500 |
commit | 6c02b7b1610f873888af20f291c07730889ff0f9 (patch) | |
tree | 1b33e6642cc81605b8d37c0bda0abff0ba64fa2d /arch/powerpc/platforms/powernv | |
parent | 7a7546b377bdaa25ac77f33d9433c59f259b9688 (diff) | |
parent | dcd6c92267155e70a94b3927bce681ce74b80d1f (diff) |
Merge commit 'v3.3-rc1' into stable/for-linus-fixes-3.3
* commit 'v3.3-rc1': (9775 commits)
Linux 3.3-rc1
x86, syscall: Need __ARCH_WANT_SYS_IPC for 32 bits
qnx4: don't leak ->BitMap on late failure exits
qnx4: reduce the insane nesting in qnx4_checkroot()
qnx4: di_fname is an array, for crying out loud...
KEYS: Permit key_serial() to be called with a const key pointer
keys: fix user_defined key sparse messages
ima: fix cred sparse warning
uml: fix compile for x86-64
MPILIB: Add a missing ENOMEM check
tpm: fix (ACPI S3) suspend regression
nvme: fix merge error due to change of 'make_request_fn' fn type
xen: using EXPORT_SYMBOL requires including export.h
gpio: tps65910: Use correct offset for gpio initialization
acpi/apei/einj: Add extensions to EINJ from rev 5.0 of acpi spec
intel_idle: Split up and provide per CPU initialization func
ACPI processor: Remove unneeded variable passed by acpi_processor_hotadd_init V2
tg3: Fix single-vector MSI-X code
openvswitch: Fix multipart datapath dumps.
ipv6: fix per device IP snmp counters
...
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r-- | arch/powerpc/platforms/powernv/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-wrappers.S | 8 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 1330 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 228 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 100 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/smp.c | 2 |
7 files changed, 1644 insertions, 27 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 31853008b41..bcc3cb48a44 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -2,4 +2,4 @@ obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o obj-y += opal-rtc.o opal-nvram.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o +obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 4a3f46d8533..3bb07e5e43c 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -99,3 +99,11 @@ OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c new file mode 100644 index 00000000000..f31162cfdaa --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -0,0 +1,1330 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/abs_addr.h> + +#include "powernv.h" +#include "pci.h" + +struct resource_wrap { + struct list_head link; + resource_size_t size; + resource_size_t align; + struct pci_dev *dev; /* Set if it's a device */ + struct pci_bus *bus; /* Set if it's a bridge */ +}; + +static int __pe_printk(const char *level, const struct pnv_ioda_pe *pe, + struct va_format *vaf) +{ + char pfix[32]; + + if (pe->pdev) + strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); + else + sprintf(pfix, "%04x:%02x ", + pci_domain_nr(pe->pbus), pe->pbus->number); + return printk("pci %s%s: [PE# %.3d] %pV", level, pfix, pe->pe_number, vaf); +} + +#define define_pe_printk_level(func, kern_level) \ +static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + int r; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __pe_printk(kern_level, pe, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ + +define_pe_printk_level(pe_err, KERN_ERR); +define_pe_printk_level(pe_warn, KERN_WARNING); +define_pe_printk_level(pe_info, KERN_INFO); + + +/* Calculate resource usage & alignment requirement of a single + * device. This will also assign all resources within the device + * for a given type starting at 0 for the biggest one and then + * assigning in decreasing order of size. + */ +static void __devinit pnv_ioda_calc_dev(struct pci_dev *dev, unsigned int flags, + resource_size_t *size, + resource_size_t *align) +{ + resource_size_t start; + struct resource *r; + int i; + + pr_devel(" -> CDR %s\n", pci_name(dev)); + + *size = *align = 0; + + /* Clear the resources out and mark them all unset */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (!(r->flags & flags)) + continue; + if (r->start) { + r->end -= r->start; + r->start = 0; + } + r->flags |= IORESOURCE_UNSET; + } + + /* We currently keep all memory resources together, we + * will handle prefetch & 64-bit separately in the future + * but for now we stick everybody in M32 + */ + start = 0; + for (;;) { + resource_size_t max_size = 0; + int max_no = -1; + + /* Find next biggest resource */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (!(r->flags & IORESOURCE_UNSET) || + !(r->flags & flags)) + continue; + if (resource_size(r) > max_size) { + max_size = resource_size(r); + max_no = i; + } + } + if (max_no < 0) + break; + r = &dev->resource[max_no]; + if (max_size > *align) + *align = max_size; + *size += max_size; + r->start = start; + start += max_size; + r->end = r->start + max_size - 1; + r->flags &= ~IORESOURCE_UNSET; + pr_devel(" -> R%d %016llx..%016llx\n", + max_no, r->start, r->end); + } + pr_devel(" <- CDR %s size=%llx align=%llx\n", + pci_name(dev), *size, *align); +} + +/* Allocate a resource "wrap" for a given device or bridge and + * insert it at the right position in the sorted list + */ +static void __devinit pnv_ioda_add_wrap(struct list_head *list, + struct pci_bus *bus, + struct pci_dev *dev, + resource_size_t size, + resource_size_t align) +{ + struct resource_wrap *w1, *w = kzalloc(sizeof(*w), GFP_KERNEL); + + w->size = size; + w->align = align; + w->dev = dev; + w->bus = bus; + + list_for_each_entry(w1, list, link) { + if (w1->align < align) { + list_add_tail(&w->link, &w1->link); + return; + } + } + list_add_tail(&w->link, list); +} + +/* Offset device resources of a given type */ +static void __devinit pnv_ioda_offset_dev(struct pci_dev *dev, + unsigned int flags, + resource_size_t offset) +{ + struct resource *r; + int i; + + pr_devel(" -> ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); + + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + r = &dev->resource[i]; + if (r->flags & flags) { + dev->resource[i].start += offset; + dev->resource[i].end += offset; + } + } + + pr_devel(" <- ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); +} + +/* Offset bus resources (& all children) of a given type */ +static void __devinit pnv_ioda_offset_bus(struct pci_bus *bus, + unsigned int flags, + resource_size_t offset) +{ + struct resource *r; + struct pci_dev *dev; + struct pci_bus *cbus; + int i; + + pr_devel(" -> OBR %s [%x] +%016llx\n", + bus->self ? pci_name(bus->self) : "root", flags, offset); + + for (i = 0; i < 2; i++) { + r = bus->resource[i]; + if (r && (r->flags & flags)) { + bus->resource[i]->start += offset; + bus->resource[i]->end += offset; + } + } + list_for_each_entry(dev, &bus->devices, bus_list) + pnv_ioda_offset_dev(dev, flags, offset); + list_for_each_entry(cbus, &bus->children, node) + pnv_ioda_offset_bus(cbus, flags, offset); + + pr_devel(" <- OBR %s [%x]\n", + bus->self ? pci_name(bus->self) : "root", flags); +} + +/* This is the guts of our IODA resource allocation. This is called + * recursively for each bus in the system. It calculates all the + * necessary size and requirements for children and assign them + * resources such that: + * + * - Each function fits in it's own contiguous set of IO/M32 + * segment + * + * - All segments behind a P2P bridge are contiguous and obey + * alignment constraints of those bridges + */ +static void __devinit pnv_ioda_calc_bus(struct pci_bus *bus, unsigned int flags, + resource_size_t *size, + resource_size_t *align) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + resource_size_t dev_size, dev_align, start; + resource_size_t min_align, min_balign; + struct pci_dev *cdev; + struct pci_bus *cbus; + struct list_head head; + struct resource_wrap *w; + unsigned int bres; + + *size = *align = 0; + + pr_devel("-> CBR %s [%x]\n", + bus->self ? pci_name(bus->self) : "root", flags); + + /* Calculate alignment requirements based on the type + * of resource we are working on + */ + if (flags & IORESOURCE_IO) { + bres = 0; + min_align = phb->ioda.io_segsize; + min_balign = 0x1000; + } else { + bres = 1; + min_align = phb->ioda.m32_segsize; + min_balign = 0x100000; + } + + /* Gather all our children resources ordered by alignment */ + INIT_LIST_HEAD(&head); + + /* - Busses */ + list_for_each_entry(cbus, &bus->children, node) { + pnv_ioda_calc_bus(cbus, flags, &dev_size, &dev_align); + pnv_ioda_add_wrap(&head, cbus, NULL, dev_size, dev_align); + } + + /* - Devices */ + list_for_each_entry(cdev, &bus->devices, bus_list) { + pnv_ioda_calc_dev(cdev, flags, &dev_size, &dev_align); + /* Align them to segment size */ + if (dev_align < min_align) + dev_align = min_align; + pnv_ioda_add_wrap(&head, NULL, cdev, dev_size, dev_align); + } + if (list_empty(&head)) + goto empty; + + /* Now we can do two things: assign offsets to them within that + * level and get our total alignment & size requirements. The + * assignment algorithm is going to be uber-trivial for now, we + * can try to be smarter later at filling out holes. + */ + start = bus->self ? 0 : bus->resource[bres]->start; + + /* Don't hand out IO 0 */ + if ((flags & IORESOURCE_IO) && !bus->self) + start += 0x1000; + + while(!list_empty(&head)) { + w = list_first_entry(&head, struct resource_wrap, link); + list_del(&w->link); + if (w->size) { + if (start) { + start = ALIGN(start, w->align); + if (w->dev) + pnv_ioda_offset_dev(w->dev,flags,start); + else if (w->bus) + pnv_ioda_offset_bus(w->bus,flags,start); + } + if (w->align > *align) + *align = w->align; + } + start += w->size; + kfree(w); + } + *size = start; + + /* Align and setup bridge resources */ + *align = max_t(resource_size_t, *align, + max_t(resource_size_t, min_align, min_balign)); + *size = ALIGN(*size, + max_t(resource_size_t, min_align, min_balign)); + empty: + /* Only setup P2P's, not the PHB itself */ + if (bus->self) { + WARN_ON(bus->resource[bres] == NULL); + bus->resource[bres]->start = 0; + bus->resource[bres]->flags = (*size) ? flags : 0; + bus->resource[bres]->end = (*size) ? (*size - 1) : 0; + + /* Clear prefetch bus resources for now */ + bus->resource[2]->flags = 0; + } + + pr_devel("<- CBR %s [%x] *size=%016llx *align=%016llx\n", + bus->self ? pci_name(bus->self) : "root", flags,*size,*align); +} + +static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev) +{ + struct device_node *np; + + np = pci_device_to_OF_node(dev); + if (!np) + return NULL; + return PCI_DN(np); +} + +static void __devinit pnv_ioda_setup_pe_segments(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + unsigned int pe, i; + resource_size_t pos; + struct resource io_res; + struct resource m32_res; + struct pci_bus_region region; + int rc; + + /* Anything not referenced in the device-tree gets PE#0 */ + pe = pdn ? pdn->pe_number : 0; + + /* Calculate the device min/max */ + io_res.start = m32_res.start = (resource_size_t)-1; + io_res.end = m32_res.end = 0; + io_res.flags = IORESOURCE_IO; + m32_res.flags = IORESOURCE_MEM; + + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *r = NULL; + if (dev->resource[i].flags & IORESOURCE_IO) + r = &io_res; + if (dev->resource[i].flags & IORESOURCE_MEM) + r = &m32_res; + if (!r) + continue; + if (dev->resource[i].start < r->start) + r->start = dev->resource[i].start; + if (dev->resource[i].end > r->end) + r->end = dev->resource[i].end; + } + + /* Setup IO segments */ + if (io_res.start < io_res.end) { + pcibios_resource_to_bus(dev, ®ion, &io_res); + pos = region.start; + i = pos / phb->ioda.io_segsize; + while(i < phb->ioda.total_pe && pos <= region.end) { + if (phb->ioda.io_segmap[i]) { + pr_err("%s: Trying to use IO seg #%d which is" + " already used by PE# %d\n", + pci_name(dev), i, + phb->ioda.io_segmap[i]); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + phb->ioda.io_segmap[i] = pe; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, + OPAL_IO_WINDOW_TYPE, + 0, i); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d setting up mapping" + " for IO seg# %d\n", + pci_name(dev), rc, i); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + pos += phb->ioda.io_segsize; + i++; + }; + } + + /* Setup M32 segments */ + if (m32_res.start < m32_res.end) { + pcibios_resource_to_bus(dev, ®ion, &m32_res); + pos = region.start; + i = pos / phb->ioda.m32_segsize; + while(i < phb->ioda.total_pe && pos <= region.end) { + if (phb->ioda.m32_segmap[i]) { + pr_err("%s: Trying to use M32 seg #%d which is" + " already used by PE# %d\n", + pci_name(dev), i, + phb->ioda.m32_segmap[i]); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + phb->ioda.m32_segmap[i] = pe; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, + OPAL_M32_WINDOW_TYPE, + 0, i); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d setting up mapping" + " for M32 seg# %d\n", + pci_name(dev), rc, i); + /* XXX DO SOMETHING TO DISABLE DEVICE ? */ + break; + } + pos += phb->ioda.m32_segsize; + i++; + } + } +} + +/* Check if a resource still fits in the total IO or M32 range + * for a given PHB + */ +static int __devinit pnv_ioda_resource_fit(struct pci_controller *hose, + struct resource *r) +{ + struct resource *bounds; + + if (r->flags & IORESOURCE_IO) + bounds = &hose->io_resource; + else if (r->flags & IORESOURCE_MEM) + bounds = &hose->mem_resources[0]; + else + return 1; + + if (r->start >= bounds->start && r->end <= bounds->end) + return 1; + r->flags = 0; + return 0; +} + +static void __devinit pnv_ioda_update_resources(struct pci_bus *bus) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pci_bus *cbus; + struct pci_dev *cdev; + unsigned int i; + + /* We used to clear all device enables here. However it looks like + * clearing MEM enable causes Obsidian (IPR SCS) to go bonkers, + * and shoot fatal errors to the PHB which in turns fences itself + * and we can't recover from that ... yet. So for now, let's leave + * the enables as-is and hope for the best. + */ + + /* Check if bus resources fit in our IO or M32 range */ + for (i = 0; bus->self && (i < 2); i++) { + struct resource *r = bus->resource[i]; + if (r && !pnv_ioda_resource_fit(hose, r)) + pr_err("%s: Bus %d resource %d disabled, no room\n", + pci_name(bus->self), bus->number, i); + } + + /* Update self if it's not a PHB */ + if (bus->self) + pci_setup_bridge(bus); + + /* Update child devices */ + list_for_each_entry(cdev, &bus->devices, bus_list) { + /* Check if resource fits, if not, disabled it */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *r = &cdev->resource[i]; + if (!pnv_ioda_resource_fit(hose, r)) + pr_err("%s: Resource %d disabled, no room\n", + pci_name(cdev), i); + } + + /* Assign segments */ + pnv_ioda_setup_pe_segments(cdev); + + /* Update HW BARs */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) + pci_update_resource(cdev, i); + } + + /* Update child busses */ + list_for_each_entry(cbus, &bus->children, node) + pnv_ioda_update_resources(cbus); +} + +static int __devinit pnv_ioda_alloc_pe(struct pnv_phb *phb) +{ + unsigned long pe; + + do { + pe = find_next_zero_bit(phb->ioda.pe_alloc, + phb->ioda.total_pe, 0); + if (pe >= phb->ioda.total_pe) + return IODA_INVALID_PE; + } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); + + phb->ioda.pe_array[pe].pe_number = pe; + return pe; +} + +static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +{ + WARN_ON(phb->ioda.pe_array[pe].pdev); + + memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); + clear_bit(pe, phb->ioda.pe_alloc); +} + +/* Currently those 2 are only used when MSIs are enabled, this will change + * but in the meantime, we need to protect them to avoid warnings + */ +#ifdef CONFIG_PCI_MSI +static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (!pdn) + return NULL; + if (pdn->pe_number == IODA_INVALID_PE) + return NULL; + return &phb->ioda.pe_array[pdn->pe_number]; +} + +static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev) +{ + struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev); + + while (!pe && dev->bus->self) { + dev = dev->bus->self; + pe = __pnv_ioda_get_one_pe(dev); + if (pe) + pe = pe->bus_pe; + } + return pe; +} +#endif /* CONFIG_PCI_MSI */ + +static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct pci_dev *parent; + uint8_t bcomp, dcomp, fcomp; + long rc, rid_end, rid; + + /* Bus validation ? */ + if (pe->pbus) { + int count; + + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; + parent = pe->pbus->self; + count = pe->pbus->subordinate - pe->pbus->secondary + 1; + switch(count) { + case 1: bcomp = OpalPciBusAll; break; + case 2: bcomp = OpalPciBus7Bits; break; + case 4: bcomp = OpalPciBus6Bits; break; + case 8: bcomp = OpalPciBus5Bits; break; + case 16: bcomp = OpalPciBus4Bits; break; + case 32: bcomp = OpalPciBus3Bits; break; + default: + pr_err("%s: Number of subordinate busses %d" + " unsupported\n", + pci_name(pe->pbus->self), count); + /* Do an exact match only */ + bcomp = OpalPciBusAll; + } + rid_end = pe->rid + (count << 8); + } else { + parent = pe->pdev->bus->self; + bcomp = OpalPciBusAll; + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; + rid_end = pe->rid + 1; + } + + /* Associate PE in PELT */ + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, + bcomp, dcomp, fcomp, OPAL_MAP_PE); + if (rc) { + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + return -ENXIO; + } + opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Add to all parents PELT-V */ + while (parent) { + struct pci_dn *pdn = pnv_ioda_get_pdn(parent); + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); + /* XXX What to do in case of error ? */ + } + parent = parent->bus->self; + } + /* Setup reverse map */ + for (rid = pe->rid; rid < rid_end; rid++) + phb->ioda.pe_rmap[rid] = pe->pe_number; + + /* Setup one MVTs on IODA1 */ + if (phb->type == PNV_PHB_IODA1) { + pe->mve_number = pe->pe_number; + rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, + pe->pe_number); + if (rc) { + pe_err(pe, "OPAL error %ld setting up MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } else { + rc = opal_pci_set_mve_enable(phb->opal_id, + pe->mve_number, OPAL_ENABLE_MVE); + if (rc) { + pe_err(pe, "OPAL error %ld enabling MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } + } + } else if (phb->type == PNV_PHB_IODA2) + pe->mve_number = 0; + + return 0; +} + +static void __devinit pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct pnv_ioda_pe *lpe; + + list_for_each_entry(lpe, &phb->ioda.pe_list, link) { + if (lpe->dma_weight < pe->dma_weight) { + list_add_tail(&pe->link, &lpe->link); + return; + } + } + list_add_tail(&pe->link, &phb->ioda.pe_list); +} + +static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) +{ + /* This is quite simplistic. The "base" weight of a device + * is 10. 0 means no DMA is to be accounted for it. + */ + + /* If it's a bridge, no DMA */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return 0; + + /* Reduce the weight of slow USB controllers */ + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || + dev->class == PCI_CLASS_SERIAL_USB_OHCI || + dev->class == PCI_CLASS_SERIAL_USB_EHCI) + return 3; + + /* Increase the weight of RAID (includes Obsidian) */ + if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) + return 15; + + /* Default */ + return 10; +} + +static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + struct pnv_ioda_pe *pe; + int pe_num; + + if (!pdn) { + pr_err("%s: Device tree node not associated properly\n", + pci_name(dev)); + return NULL; + } + if (pdn->pe_number != IODA_INVALID_PE) + return NULL; + + /* PE#0 has been pre-set */ + if (dev->bus->number == 0) + pe_num = 0; + else + pe_num = pnv_ioda_alloc_pe(phb); + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available, disabling device\n", + pci_name(dev)); + return NULL; + } + + /* NOTE: We get only one ref to the pci_dev for the pdn, not for the + * pointer in the PE data structure, both should be destroyed at the + * same time. However, this needs to be looked at more closely again + * once we actually start removing things (Hotplug, SR-IOV, ...) + * + * At some point we want to remove the PDN completely anyways + */ + pe = &phb->ioda.pe_array[pe_num]; + pci_dev_get(dev); + pdn->pcidev = dev; + pdn->pe_number = pe_num; + pe->pdev = dev; + pe->pbus = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = dev->bus->number << 8 | pdn->devfn; + + pe_info(pe, "Associated device to PE\n"); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pdn->pe_number = IODA_INVALID_PE; + pe->pdev = NULL; + pci_dev_put(dev); + return NULL; + } + + /* Assign a DMA weight to the device */ + pe->dma_weight = pnv_ioda_dma_weight(dev); + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); + + return pe; +} + +static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (pdn == NULL) { + pr_warn("%s: No device node associated with device !\n", + pci_name(dev)); + continue; + } + pci_dev_get(dev); + pdn->pcidev = dev; + pdn->pe_number = pe->pe_number; + pe->dma_weight += pnv_ioda_dma_weight(dev); + if (dev->subordinate) + pnv_ioda_setup_same_PE(dev->subordinate, pe); + } +} + +static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, + struct pnv_ioda_pe *ppe) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_bus *bus = dev->subordinate; + struct pnv_ioda_pe *pe; + int pe_num; + + if (!bus) { + pr_warning("%s: Bridge without a subordinate bus !\n", + pci_name(dev)); + return; + } + pe_num = pnv_ioda_alloc_pe(phb); + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available, disabling bus\n", + pci_name(dev)); + return; + } + + pe = &phb->ioda.pe_array[pe_num]; + ppe->bus_pe = pe; + pe->pbus = bus; + pe->pdev = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = bus->secondary << 8; + pe->dma_weight = 0; + + pe_info(pe, "Secondary busses %d..%d associated with PE\n", + bus->secondary, bus->subordinate); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pe->pbus = NULL; + return; + } + + /* Associate it with all child devices */ + pnv_ioda_setup_same_PE(bus, pe); + + /* Account for one DMA PE if at least one DMA capable device exist + * below the bridge + */ + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); +} + +static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus) +{ + struct pci_dev *dev; + struct pnv_ioda_pe *pe; + + list_for_each_entry(dev, &bus->devices, bus_list) { + pe = pnv_ioda_setup_dev_PE(dev); + if (pe == NULL) + continue; + /* Leaving the PCIe domain ... single PE# */ + if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + pnv_ioda_setup_bus_PE(dev, pe); + else if (dev->subordinate) + pnv_ioda_setup_PEs(dev->subordinate); + } +} + +static void __devinit pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, + struct pci_dev *dev) +{ + /* We delay DMA setup after we have assigned all PE# */ +} + +static void __devinit pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, + struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, &pe->tce32_table); + if (dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + +static void __devinit pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + unsigned int base, + unsigned int segs) +{ + + struct page *tce_mem = NULL; + const __be64 *swinvp; + struct iommu_table *tbl; + unsigned int i; + int64_t rc; + void *addr; + + /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ +#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) + + /* XXX FIXME: Handle 64-bit only DMA devices */ + /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ + /* XXX FIXME: Allocate multi-level tables on PHB3 */ + + /* We shouldn't already have a 32-bit DMA associated */ + if (WARN_ON(pe->tce32_seg >= 0)) + return; + + /* Grab a 32-bit TCE table */ + pe->tce32_seg = base; + pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", + (base << 28), ((base + segs) << 28) - 1); + + /* XXX Currently, we allocate one big contiguous table for the + * TCEs. We only really need one chunk per 256M of TCE space + * (ie per segment) but that's an optimization for later, it + * requires some added smarts with our get/put_tce implementation + */ + tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, + get_order(TCE32_TABLE_SIZE * segs)); + if (!tce_mem) { + pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, TCE32_TABLE_SIZE * segs); + + /* Configure HW */ + for (i = 0; i < segs; i++) { + rc = opal_pci_map_pe_dma_window(phb->opal_id, + pe->pe_number, + base + i, 1, + __pa(addr) + TCE32_TABLE_SIZE * i, + TCE32_TABLE_SIZE, 0x1000); + if (rc) { + pe_err(pe, " Failed to configure 32-bit TCE table," + " err %ld\n", rc); + goto fail; + } + } + + /* Setup linux iommu table */ + tbl = &pe->tce32_table; + pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, + base << 28); + + /* OPAL variant of P7IOC SW invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data + * to or. Since the bus is only printed out on table free + * errors, and on the first pass the data will be a relative + * bus number, print that out instead. + */ + tbl->it_busno = 0; + tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE + | TCE_PCI_SWINV_PAIR; + } + iommu_init_table(tbl, phb->hose->node); + + if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe->pbus); + + return; + fail: + /* XXX Failure: Try to fallback to 64-bit only ? */ + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + if (tce_mem) + __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); +} + +static void __devinit pnv_ioda_setup_dma(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + unsigned int residual, remaining, segs, tw, base; + struct pnv_ioda_pe *pe; + + /* If we have more PE# than segments available, hand out one + * per PE until we run out and let the rest fail. If not, + * then we assign at least one segment per PE, plus more based + * on the amount of devices under that PE + */ + if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) + residual = 0; + else + residual = phb->ioda.tce32_count - + phb->ioda.dma_pe_count; + + pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", + hose->global_number, phb->ioda.tce32_count); + pr_info("PCI: %d PE# for a total weight of %d\n", + phb->ioda.dma_pe_count, phb->ioda.dma_weight); + + /* Walk our PE list and configure their DMA segments, hand them + * out one base segment plus any residual segments based on + * weight + */ + remaining = phb->ioda.tce32_count; + tw = phb->ioda.dma_weight; + base = 0; + list_for_each_entry(pe, &phb->ioda.pe_list, link) { + if (!pe->dma_weight) + continue; + if (!remaining) { + pe_warn(pe, "No DMA32 resources available\n"); + continue; + } + segs = 1; + if (residual) { + segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; + if (segs > remaining) + segs = remaining; + } + pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", + pe->dma_weight, segs); + pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + remaining -= segs; + base += segs; + } +} + +#ifdef CONFIG_PCI_MSI +static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int is_64, + struct msi_msg *msg) +{ + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); + unsigned int xive_num = hwirq - phb->msi_base; + uint64_t addr64; + uint32_t addr32, data; + int rc; + + /* No PE assigned ? bail out ... no MSI for you ! */ + if (pe == NULL) + return -ENXIO; + + /* Check if we have an MVE */ + if (pe->mve_number < 0) + return -ENXIO; + + /* Assign XIVE to PE */ + rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); + if (rc) { + pr_warn("%s: OPAL error %d setting XIVE %d PE\n", + pci_name(dev), rc, xive_num); + return -EIO; + } + + if (is_64) { + rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, + &addr64, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = addr64 >> 32; + msg->address_lo = addr64 & 0xfffffffful; + } else { + rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, + &addr32, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = 0; + msg->address_lo = addr32; + } + msg->data = data; + + pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," + " address=%x_%08x data=%x PE# %d\n", + pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, + msg->address_hi, msg->address_lo, data, pe->pe_number); + + return 0; +} + +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +{ + unsigned int bmap_size; + const __be32 *prop = of_get_property(phb->hose->dn, + "ibm,opal-msi-ranges", NULL); + if (!prop) { + /* BML Fallback */ + prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); + } + if (!prop) + return; + + phb->msi_base = be32_to_cpup(prop); + phb->msi_count = be32_to_cpup(prop + 1); + bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long); + phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL); + if (!phb->msi_map) { + pr_err("PCI %d: Failed to allocate MSI bitmap !\n", + phb->hose->global_number); + return; + } + phb->msi_setup = pnv_pci_ioda_msi_setup; + phb->msi32_support = 1; + pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", + phb->msi_count, phb->msi_base); +} +#else +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +/* This is the starting point of our IODA specific resource + * allocation process + */ +static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose) +{ + resource_size_t size, align; + struct pci_bus *child; + + /* Associate PEs per functions */ + pnv_ioda_setup_PEs(hose->bus); + + /* Calculate all resources */ + pnv_ioda_calc_bus(hose->bus, IORESOURCE_IO, &size, &align); + pnv_ioda_calc_bus(hose->bus, IORESOURCE_MEM, &size, &align); + + /* Apply then to HW */ + pnv_ioda_update_resources(hose->bus); + + /* Setup DMA */ + pnv_ioda_setup_dma(hose->private_data); + + /* Configure PCI Express settings */ + list_for_each_entry(child, &hose->bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + pcie_bus_configure_settings(child, self->pcie_mpss); + } +} + +/* Prevent enabling devices for which we couldn't properly + * assign a PE + */ +static int __devinit pnv_pci_enable_device_hook(struct pci_dev *dev) +{ + struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return -EINVAL; + return 0; +} + +static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, + u32 devfn) +{ + return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; +} + +void __init pnv_pci_init_ioda1_phb(struct device_node *np) +{ + struct pci_controller *hose; + static int primary = 1; + struct pnv_phb *phb; + unsigned long size, m32map_off, iomap_off, pemap_off; + const u64 *prop64; + u64 phb_id; + void *aux; + long rc; + + pr_info(" Initializing IODA OPAL PHB %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-phbid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-phbid\" property !\n"); + return; + } + phb_id = be64_to_cpup(prop64); + pr_debug(" PHB-ID : 0x%016llx\n", phb_id); + + phb = alloc_bootmem(sizeof(struct pnv_phb)); + if (phb) { + memset(phb, 0, sizeof(struct pnv_phb)); + phb->hose = hose = pcibios_alloc_controller(np); + } + if (!phb || !phb->hose) { + pr_err("PCI: Failed to allocate PCI controller for %s\n", + np->full_name); + return; + } + + spin_lock_init(&phb->lock); + /* XXX Use device-tree */ + hose->first_busno = 0; + hose->last_busno = 0xff; + hose->private_data = phb; + phb->opal_id = phb_id; + phb->type = PNV_PHB_IODA1; + + /* Detect specific models for error handling */ + if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) + phb->model = PNV_PHB_MODEL_P7IOC; + else + phb->model = PNV_PHB_MODEL_UNKNOWN; + + /* We parse "ranges" now since we need to deduce the register base + * from the IO base + */ + pci_process_bridge_OF_ranges(phb->hose, np, primary); + primary = 0; + + /* Magic formula from Milton */ + phb->regs = of_iomap(np, 0); + if (phb->regs == NULL) + pr_err(" Failed to map registers !\n"); + + + /* XXX This is hack-a-thon. This needs to be changed so that: + * - we obtain stuff like PE# etc... from device-tree + * - we properly re-allocate M32 ourselves + * (the OFW one isn't very good) + */ + + /* Initialize more IODA stuff */ + phb->ioda.total_pe = 128; + + phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); + /* OFW Has already off top 64k of M32 space (MSI space) */ + phb->ioda.m32_size += 0x10000; + + phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; + phb->ioda.m32_pci_base = hose->mem_resources[0].start - + hose->pci_mem_offset; + phb->ioda.io_size = hose->pci_io_size; + phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; + phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + + /* Allocate aux data & arrays */ + size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + m32map_off = size; + size += phb->ioda.total_pe; + iomap_off = size; + size += phb->ioda.total_pe; + pemap_off = size; + size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); + aux = alloc_bootmem(size); + memset(aux, 0, size); + phb->ioda.pe_alloc = aux; + phb->ioda.m32_segmap = aux + m32map_off; + phb->ioda.io_segmap = aux + iomap_off; + phb->ioda.pe_array = aux + pemap_off; + set_bit(0, phb->ioda.pe_alloc); + + INIT_LIST_HEAD(&phb->ioda.pe_list); + + /* Calculate how many 32-bit TCE segments we have */ + phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + + /* Clear unusable m64 */ + hose->mem_resources[1].flags = 0; + hose->mem_resources[1].start = 0; + hose->mem_resources[1].end = 0; + hose->mem_resources[2].flags = 0; + hose->mem_resources[2].start = 0; + hose->mem_resources[2].end = 0; + +#if 0 + rc = opal_pci_set_phb_mem_window(opal->phb_id, + window_type, + window_num, + starting_real_address, + starting_pci_address, + segment_size); +#endif + + pr_info(" %d PE's M32: 0x%x [segment=0x%x] IO: 0x%x [segment=0x%x]\n", + phb->ioda.total_pe, + phb->ioda.m32_size, phb->ioda.m32_segsize, + phb->ioda.io_size, phb->ioda.io_segsize); + + if (phb->regs) { + pr_devel(" BUID = 0x%016llx\n", in_be64(phb->regs + 0x100)); + pr_devel(" PHB2_CR = 0x%016llx\n", in_be64(phb->regs + 0x160)); + pr_devel(" IO_BAR = 0x%016llx\n", in_be64(phb->regs + 0x170)); + pr_devel(" IO_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x178)); + pr_devel(" IO_SAR = 0x%016llx\n", in_be64(phb->regs + 0x180)); + pr_devel(" M32_BAR = 0x%016llx\n", in_be64(phb->regs + 0x190)); + pr_devel(" M32_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x198)); + pr_devel(" M32_SAR = 0x%016llx\n", in_be64(phb->regs + 0x1a0)); + } + phb->hose->ops = &pnv_pci_ops; + + /* Setup RID -> PE mapping function */ + phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; + + /* Setup TCEs */ + phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; + + /* Setup MSI support */ + pnv_pci_init_ioda_msis(phb); + + /* We set both probe_only and PCI_REASSIGN_ALL_RSRC. This is an + * odd combination which essentially means that we skip all resource + * fixups and assignments in the generic code, and do it all + * ourselves here + */ + pci_probe_only = 1; + ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb; + ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; + pci_add_flags(PCI_REASSIGN_ALL_RSRC); + + /* Reset IODA tables to a clean state */ + rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); + if (rc) + pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); + opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE); +} + +void __init pnv_pci_init_ioda_hub(struct device_node *np) +{ + struct device_node *phbn; + const u64 *prop64; + u64 hub_id; + + pr_info("Probing IODA IO-Hub %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-hubid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-hubid\" property !\n"); + return; + } + hub_id = be64_to_cpup(prop64); + pr_devel(" HUB-ID : 0x%016llx\n", hub_id); + + /* Count child PHBs */ + for_each_child_of_node(np, phbn) { + /* Look for IODA1 PHBs */ + if (of_device_is_compatible(phbn, "ibm,ioda-phb")) + pnv_pci_init_ioda1_phb(phbn); + } +} diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 4c80f7c77d5..264967770c3 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -137,6 +137,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, phb->hose->private_data = phb; phb->opal_id = phb_id; phb->type = PNV_PHB_P5IOC2; + phb->model = PNV_PHB_MODEL_P5IOC2; phb->regs = of_iomap(np, 0); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 85bb66d7f93..a70bc1e385e 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -144,6 +144,112 @@ static void pnv_teardown_msi_irqs(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI */ +static void pnv_pci_dump_p7ioc_diag_data(struct pnv_phb *phb) +{ + struct OpalIoP7IOCPhbErrorData *data = &phb->diag.p7ioc; + int i; + + pr_info("PHB %d diagnostic data:\n", phb->hose->global_number); + + pr_info(" brdgCtl = 0x%08x\n", data->brdgCtl); + + pr_info(" portStatusReg = 0x%08x\n", data->portStatusReg); + pr_info(" rootCmplxStatus = 0x%08x\n", data->rootCmplxStatus); + pr_info(" busAgentStatus = 0x%08x\n", data->busAgentStatus); + + pr_info(" deviceStatus = 0x%08x\n", data->deviceStatus); + pr_info(" slotStatus = 0x%08x\n", data->slotStatus); + pr_info(" linkStatus = 0x%08x\n", data->linkStatus); + pr_info(" devCmdStatus = 0x%08x\n", data->devCmdStatus); + pr_info(" devSecStatus = 0x%08x\n", data->devSecStatus); + + pr_info(" rootErrorStatus = 0x%08x\n", data->rootErrorStatus); + pr_info(" uncorrErrorStatus = 0x%08x\n", data->uncorrErrorStatus); + pr_info(" corrErrorStatus = 0x%08x\n", data->corrErrorStatus); + pr_info(" tlpHdr1 = 0x%08x\n", data->tlpHdr1); + pr_info(" tlpHdr2 = 0x%08x\n", data->tlpHdr2); + pr_info(" tlpHdr3 = 0x%08x\n", data->tlpHdr3); + pr_info(" tlpHdr4 = 0x%08x\n", data->tlpHdr4); + pr_info(" sourceId = 0x%08x\n", data->sourceId); + + pr_info(" errorClass = 0x%016llx\n", data->errorClass); + pr_info(" correlator = 0x%016llx\n", data->correlator); + + pr_info(" p7iocPlssr = 0x%016llx\n", data->p7iocPlssr); + pr_info(" p7iocCsr = 0x%016llx\n", data->p7iocCsr); + pr_info(" lemFir = 0x%016llx\n", data->lemFir); + pr_info(" lemErrorMask = 0x%016llx\n", data->lemErrorMask); + pr_info(" lemWOF = 0x%016llx\n", data->lemWOF); + pr_info(" phbErrorStatus = 0x%016llx\n", data->phbErrorStatus); + pr_info(" phbFirstErrorStatus = 0x%016llx\n", data->phbFirstErrorStatus); + pr_info(" phbErrorLog0 = 0x%016llx\n", data->phbErrorLog0); + pr_info(" phbErrorLog1 = 0x%016llx\n", data->phbErrorLog1); + pr_info(" mmioErrorStatus = 0x%016llx\n", data->mmioErrorStatus); + pr_info(" mmioFirstErrorStatus = 0x%016llx\n", data->mmioFirstErrorStatus); + pr_info(" mmioErrorLog0 = 0x%016llx\n", data->mmioErrorLog0); + pr_info(" mmioErrorLog1 = 0x%016llx\n", data->mmioErrorLog1); + pr_info(" dma0ErrorStatus = 0x%016llx\n", data->dma0ErrorStatus); + pr_info(" dma0FirstErrorStatus = 0x%016llx\n", data->dma0FirstErrorStatus); + pr_info(" dma0ErrorLog0 = 0x%016llx\n", data->dma0ErrorLog0); + pr_info(" dma0ErrorLog1 = 0x%016llx\n", data->dma0ErrorLog1); + pr_info(" dma1ErrorStatus = 0x%016llx\n", data->dma1ErrorStatus); + pr_info(" dma1FirstErrorStatus = 0x%016llx\n", data->dma1FirstErrorStatus); + pr_info(" dma1ErrorLog0 = 0x%016llx\n", data->dma1ErrorLog0); + pr_info(" dma1ErrorLog1 = 0x%016llx\n", data->dma1ErrorLog1); + + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { + if ((data->pestA[i] >> 63) == 0 && + (data->pestB[i] >> 63) == 0) + continue; + pr_info(" PE[%3d] PESTA = 0x%016llx\n", i, data->pestA[i]); + pr_info(" PESTB = 0x%016llx\n", data->pestB[i]); + } +} + +static void pnv_pci_dump_phb_diag_data(struct pnv_phb *phb) +{ + switch(phb->model) { + case PNV_PHB_MODEL_P7IOC: + pnv_pci_dump_p7ioc_diag_data(phb); + break; + default: + pr_warning("PCI %d: Can't decode this PHB diag data\n", + phb->hose->global_number); + } +} + +static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) +{ + unsigned long flags, rc; + int has_diag; + + spin_lock_irqsave(&phb->lock, flags); + + rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + has_diag = (rc == OPAL_SUCCESS); + + rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + if (rc) { + pr_warning("PCI %d: Failed to clear EEH freeze state" + " for PE#%d, err %ld\n", + phb->hose->global_number, pe_no, rc); + + /* For now, let's only display the diag buffer when we fail to clear + * the EEH status. We'll do more sensible things later when we have + * proper EEH support. We need to make sure we don't pollute ourselves + * with the normal errors generated when probing empty slots + */ + if (has_diag) + pnv_pci_dump_phb_diag_data(phb); + else + pr_warning("PCI %d: No diag data available\n", + phb->hose->global_number); + } + + spin_unlock_irqrestore(&phb->lock, flags); +} + static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus, u32 bdfn) { @@ -165,15 +271,8 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus, } cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n", bdfn, pe_no, fstate); - if (fstate != 0) { - rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); - if (rc) { - pr_warning("PCI %d: Failed to clear EEH freeze state" - " for PE#%d, err %lld\n", - phb->hose->global_number, pe_no, rc); - } - } + if (fstate != 0) + pnv_pci_handle_eeh_config(phb, pe_no); } static int pnv_pci_read_config(struct pci_bus *bus, @@ -257,12 +356,54 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; + +static void pnv_tce_invalidate(struct iommu_table *tbl, + u64 *startp, u64 *endp) +{ + u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; + unsigned long start, end, inc; + + start = __pa(startp); + end = __pa(endp); + + + /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ + if (tbl->it_busno) { + start <<= 12; + end <<= 12; + inc = 128 << 12; + start |= tbl->it_busno; + end |= tbl->it_busno; + } + /* p7ioc-style invalidation, 2 TCEs per write */ + else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { + start |= (1ull << 63); + end |= (1ull << 63); + inc = 16; + } + /* Default (older HW) */ + else + inc = 128; + + end |= inc - 1; /* round up end to be different than start */ + + mb(); /* Ensure above stores are visible */ + while (start <= end) { + __raw_writeq(start, invalidate); + start += inc; + } + /* The iommu layer will do another mb() for us on build() and + * we don't care on free() + */ +} + + static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 proto_tce; - u64 *tcep; + u64 *tcep, *tces; u64 rpn; proto_tce = TCE_PCI_READ; // Read allowed @@ -270,25 +411,33 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; - tcep = ((u64 *)tbl->it_base) + index; + tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; + rpn = __pa(uaddr) >> TCE_SHIFT; - while (npages--) { - /* can't move this out since we might cross LMB boundary */ - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; - *tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + while (npages--) + *(tcep++) = proto_tce | (rpn++ << TCE_RPN_SHIFT); + + /* Some implementations won't cache invalid TCEs and thus may not + * need that flush. We'll probably turn it_type into a bit mask + * of flags if that becomes the case + */ + if (tbl->it_type & TCE_PCI_SWINV_CREATE) + pnv_tce_invalidate(tbl, tces, tcep - 1); - uaddr += TCE_PAGE_SIZE; - tcep++; - } return 0; } static void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { - u64 *tcep = ((u64 *)tbl->it_base) + index; + u64 *tcep, *tces; + + tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; while (npages--) *(tcep++) = 0; + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_tce_invalidate(tbl, tces, tcep - 1); } void pnv_pci_setup_iommu_table(struct iommu_table *tbl, @@ -308,13 +457,14 @@ static struct iommu_table * __devinit pnv_pci_setup_bml_iommu(struct pci_controller *hose) { struct iommu_table *tbl; - const __be64 *basep; + const __be64 *basep, *swinvp; const __be32 *sizep; basep = of_get_property(hose->dn, "linux,tce-base", NULL); sizep = of_get_property(hose->dn, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) { - pr_err("PCI: %s has missing tce entries !\n", hose->dn->full_name); + pr_err("PCI: %s has missing tce entries !\n", + hose->dn->full_name); return NULL; } tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node); @@ -323,6 +473,15 @@ pnv_pci_setup_bml_iommu(struct pci_controller *hose) pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), be32_to_cpup(sizep), 0); iommu_init_table(tbl, hose->node); + + /* Deal with SW invalidated TCEs when needed (BML way) */ + swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info", + NULL); + if (swinvp) { + tbl->it_busno = swinvp[1]; + tbl->it_index = (unsigned long)ioremap(swinvp[0], 8); + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; + } return tbl; } @@ -356,6 +515,13 @@ static void __devinit pnv_pci_dma_dev_setup(struct pci_dev *pdev) pnv_pci_dma_fallback_setup(hose, pdev); } +/* Fixup wrong class code in p7ioc root complex */ +static void __devinit pnv_p7ioc_rc_quirk(struct pci_dev *dev) +{ + dev->class = PCI_CLASS_BRIDGE_PCI << 8; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk); + static int pnv_pci_probe_mode(struct pci_bus *bus) { struct pci_controller *hose = pci_bus_to_host(bus); @@ -400,12 +566,24 @@ void __init pnv_pci_init(void) init_pci_config_tokens(); find_and_init_phbs(); #endif /* CONFIG_PPC_POWERNV_RTAS */ - } else { - /* OPAL is here, do our normal stuff */ + } + /* OPAL is here, do our normal stuff */ + else { + int found_ioda = 0; + + /* Look for IODA IO-Hubs. We don't support mixing IODA + * and p5ioc2 due to the need to change some global + * probing flags + */ + for_each_compatible_node(np, NULL, "ibm,ioda-hub") { + pnv_pci_init_ioda_hub(np); + found_ioda = 1; + } /* Look for p5ioc2 IO-Hubs */ - for_each_compatible_node(np, NULL, "ibm,p5ioc2") - pnv_pci_init_p5ioc2_hub(np); + if (!found_ioda) + for_each_compatible_node(np, NULL, "ibm,p5ioc2") + pnv_pci_init_p5ioc2_hub(np); } /* Setup the linkage between OF nodes and PHBs */ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index d4dbc495093..8bc47963464 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -9,9 +9,63 @@ enum pnv_phb_type { PNV_PHB_IODA2, }; +/* Precise PHB model for error management */ +enum pnv_phb_model { + PNV_PHB_MODEL_UNKNOWN, + PNV_PHB_MODEL_P5IOC2, + PNV_PHB_MODEL_P7IOC, +}; + +#define PNV_PCI_DIAG_BUF_SIZE 4096 + +/* Data associated with a PE, including IOMMU tracking etc.. */ +struct pnv_ioda_pe { + /* A PE can be associated with a single device or an + * entire bus (& children). In the former case, pdev + * is populated, in the later case, pbus is. + */ + struct pci_dev *pdev; + struct pci_bus *pbus; + + /* Effective RID (device RID for a device PE and base bus + * RID with devfn 0 for a bus PE) + */ + unsigned int rid; + + /* PE number */ + unsigned int pe_number; + + /* "Weight" assigned to the PE for the sake of DMA resource + * allocations + */ + unsigned int dma_weight; + + /* This is a PCI-E -> PCI-X bridge, this points to the + * corresponding bus PE + */ + struct pnv_ioda_pe *bus_pe; + + /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ + int tce32_seg; + int tce32_segcount; + struct iommu_table tce32_table; + + /* XXX TODO: Add support for additional 64-bit iommus */ + + /* MSIs. MVE index is identical for for 32 and 64 bit MSI + * and -1 if not supported. (It's actually identical to the + * PE number) + */ + int mve_number; + + /* Link in list of PE#s */ + struct list_head link; +}; + struct pnv_phb { struct pci_controller *hose; enum pnv_phb_type type; + enum pnv_phb_model model; u64 opal_id; void __iomem *regs; spinlock_t lock; @@ -34,7 +88,52 @@ struct pnv_phb { struct { struct iommu_table iommu_table; } p5ioc2; + + struct { + /* Global bridge info */ + unsigned int total_pe; + unsigned int m32_size; + unsigned int m32_segsize; + unsigned int m32_pci_base; + unsigned int io_size; + unsigned int io_segsize; + unsigned int io_pci_base; + + /* PE allocation bitmap */ + unsigned long *pe_alloc; + + /* M32 & IO segment maps */ + unsigned int *m32_segmap; + unsigned int *io_segmap; + struct pnv_ioda_pe *pe_array; + + /* Reverse map of PEs, will have to extend if + * we are to support more than 256 PEs, indexed + * bus { bus, devfn } + */ + unsigned char pe_rmap[0x10000]; + + /* 32-bit TCE tables allocation */ + unsigned long tce32_count; + + /* Total "weight" for the sake of DMA resources + * allocation + */ + unsigned int dma_weight; + unsigned int dma_pe_count; + + /* Sorted list of used PE's, sorted at + * boot for resource allocation purposes + */ + struct list_head pe_list; + } ioda; }; + + /* PHB status structure */ + union { + unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; + struct OpalIoP7IOCPhbErrorData p7ioc; + } diag; }; extern struct pci_ops pnv_pci_ops; @@ -43,6 +142,7 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset); extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); +extern void pnv_pci_init_ioda_hub(struct device_node *np); #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index e8773668524..17210c526c5 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -75,7 +75,7 @@ int __devinit pnv_smp_kick_cpu(int nr) /* On OPAL v2 the CPU are still spinning inside OPAL itself, * get them back now */ - if (firmware_has_feature(FW_FEATURE_OPALv2)) { + if (!paca[nr].cpu_start && firmware_has_feature(FW_FEATURE_OPALv2)) { pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu); rc = opal_start_cpu(pcpu, start_here); if (rc != OPAL_SUCCESS) |