From 674bfa485554156aa90ce17288712fcb568a42c3 Mon Sep 17 00:00:00 2001 From: Suzuki Poulose Date: Mon, 18 Jul 2011 03:29:20 +0000 Subject: powerpc/44x: Kexec support for PPC440X chipsets This patch adds kexec support for PPC440 based chipsets. This work is based on the KEXEC patches for FSL BookE. The FSL BookE patch and the code flow could be found at the link below: http://patchwork.ozlabs.org/patch/49359/ Steps: 1) Invalidate all the TLB entries except the one this code is run from 2) Create a tmp mapping for our code in the other address space and jump to it 3) Invalidate the entry we used 4) Create a 1:1 mapping for 0-2GiB in blocks of 256M 5) Jump to the new 1:1 mapping and invalidate the tmp mapping I have tested this patches on Ebony, Sequoia boards and Virtex on QEMU. You need kexec-tools commit e8b7939b1e or newer for ppc440x support, available at: git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git Signed-off-by: Suzuki Poulose Cc: Sebastian Andrzej Siewior Signed-off-by: Josh Boyer --- arch/powerpc/include/asm/kexec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 8a33698c61b..f921eb121d3 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -2,7 +2,7 @@ #define _ASM_POWERPC_KEXEC_H #ifdef __KERNEL__ -#ifdef CONFIG_FSL_BOOKE +#if defined(CONFIG_FSL_BOOKE) || defined(CONFIG_44x) /* * On FSL-BookE we setup a 1:1 mapping which covers the first 2GiB of memory -- cgit v1.2.3-70-g09d2 From 6a5c7be5e484bda5b2639fedf7dbe3f25c15c962 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Fri, 24 Jun 2011 09:05:22 +0000 Subject: powerpc: Override dma_get_required_mask by platform hook and ops The hook dma_get_required_mask is supposed to return the mask required by the platform to operate efficently. The generic version of dma_get_required_mask in driver/base/platform.c returns a mask based only on max_pfn. However, this is likely too big for iommu systems and could be too small for platforms that require a dma offset or have a secondary window at a high offset. Override the default, provide a hook in ppc_md used by pseries lpar and cell, and provide the default answer based on memblock_end_of_DRAM(), with hooks for get_dma_offset, and provide an implementation for iommu that looks at the defined table size. Coverting from the end address to the required bit mask is based on the generic implementation. The need for this was discovered when the qla2xxx driver switched to 64 bit dma then reverted to 32 bit when dma_get_required_mask said 32 bits was sufficient. Signed-off-by: Milton Miller Signed-off-by: Nishanth Aravamudan Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Cc: benh@kernel.crashing.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/dma-mapping.h | 3 +++ arch/powerpc/include/asm/machdep.h | 3 ++- arch/powerpc/kernel/dma-iommu.c | 13 ++++++++++++ arch/powerpc/kernel/dma.c | 39 ++++++++++++++++++++++++++++++++++ arch/powerpc/platforms/cell/iommu.c | 12 +++++++++++ arch/powerpc/platforms/pseries/iommu.c | 27 +++++++++++++++++++++++ 6 files changed, 96 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index dd70fac57ec..8135e66a4bb 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -20,6 +20,8 @@ #define DMA_ERROR_CODE (~(dma_addr_t)0x0) +#define ARCH_HAS_DMA_GET_REQUIRED_MASK + /* Some dma direct funcs must be visible for use in other dma_ops */ extern void *dma_direct_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag); @@ -69,6 +71,7 @@ static inline unsigned long device_to_mask(struct device *dev) */ #ifdef CONFIG_PPC64 extern struct dma_map_ops dma_iommu_ops; +extern u64 dma_iommu_get_required_mask(struct device *dev); #endif extern struct dma_map_ops dma_direct_ops; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 47cacddb14c..58fc2162301 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -85,8 +85,9 @@ struct machdep_calls { void (*pci_dma_dev_setup)(struct pci_dev *dev); void (*pci_dma_bus_setup)(struct pci_bus *bus); - /* Platform set_dma_mask override */ + /* Platform set_dma_mask and dma_get_required_mask overrides */ int (*dma_set_mask)(struct device *dev, u64 dma_mask); + u64 (*dma_get_required_mask)(struct device *dev); int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index e7554154a6d..1f2a711a261 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -90,6 +90,19 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 1; } +u64 dma_iommu_get_required_mask(struct device *dev) +{ + struct iommu_table *tbl = get_iommu_table_base(dev); + u64 mask; + if (!tbl) + return 0; + + mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1); + mask += mask - 1; + + return mask; +} + struct dma_map_ops dma_iommu_ops = { .alloc_coherent = dma_iommu_alloc_coherent, .free_coherent = dma_iommu_free_coherent, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 4f0959fbfbe..503093efa20 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -170,6 +170,45 @@ int dma_set_mask(struct device *dev, u64 dma_mask) } EXPORT_SYMBOL(dma_set_mask); +u64 dma_get_required_mask(struct device *dev) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + u64 mask, end = 0; + + if (ppc_md.dma_get_required_mask) + return ppc_md.dma_get_required_mask(dev); + + if (unlikely(dma_ops == NULL)) + return 0; + +#ifdef CONFIG_PPC64 + else if (dma_ops == &dma_iommu_ops) + return dma_iommu_get_required_mask(dev); +#endif +#ifdef CONFIG_SWIOTLB + else if (dma_ops == &swiotlb_dma_ops) { + u64 max_direct_dma_addr = dev->archdata.max_direct_dma_addr; + + end = memblock_end_of_DRAM(); + if (max_direct_dma_addr && end > max_direct_dma_addr) + end = max_direct_dma_addr; + end += get_dma_offset(dev); + } +#endif + else if (dma_ops == &dma_direct_ops) + end = memblock_end_of_DRAM() + get_dma_offset(dev); + else { + WARN_ONCE(1, "%s: unknown ops %p\n", __func__, dma_ops); + end = memblock_end_of_DRAM(); + } + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} +EXPORT_SYMBOL_GPL(dma_get_required_mask); + static int __init dma_init(void) { dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 26a067122a5..5ef55f3b098 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -1159,6 +1159,17 @@ static int __init setup_iommu_fixed(char *str) } __setup("iommu_fixed=", setup_iommu_fixed); +static u64 cell_dma_get_required_mask(struct device *dev) +{ + if (!dev->dma_mask) + return 0; + + if (iommu_fixed_disabled && get_dma_ops(dev) == &dma_iommu_ops) + return dma_iommu_get_required_mask(dev); + + return DMA_BIT_MASK(64); +} + static int __init cell_iommu_init(void) { struct device_node *np; @@ -1175,6 +1186,7 @@ static int __init cell_iommu_init(void) /* Setup various ppc_md. callbacks */ ppc_md.pci_dma_dev_setup = cell_pci_dma_dev_setup; + ppc_md.dma_get_required_mask = cell_dma_get_required_mask; ppc_md.tce_build = tce_build_cell; ppc_md.tce_free = tce_free_cell; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 01faab9456c..fe5ededf0d6 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1077,12 +1077,38 @@ check_mask: return 0; } +static u64 dma_get_required_mask_pSeriesLP(struct device *dev) +{ + if (!dev->dma_mask) + return 0; + + if (!disable_ddw && dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + struct device_node *dn; + + dn = pci_device_to_OF_node(pdev); + + /* search upwards for ibm,dma-window */ + for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table; + dn = dn->parent) + if (of_get_property(dn, "ibm,dma-window", NULL)) + break; + /* if there is a ibm,ddw-applicable property require 64 bits */ + if (dn && PCI_DN(dn) && + of_get_property(dn, "ibm,ddw-applicable", NULL)) + return DMA_BIT_MASK(64); + } + + return dma_iommu_get_required_mask(dev); +} + #else /* CONFIG_PCI */ #define pci_dma_bus_setup_pSeries NULL #define pci_dma_dev_setup_pSeries NULL #define pci_dma_bus_setup_pSeriesLP NULL #define pci_dma_dev_setup_pSeriesLP NULL #define dma_set_mask_pSeriesLP NULL +#define dma_get_required_mask_pSeriesLP NULL #endif /* !CONFIG_PCI */ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, @@ -1186,6 +1212,7 @@ void iommu_init_early_pSeries(void) ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP; ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP; ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; + ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; } else { ppc_md.tce_build = tce_build_pSeries; ppc_md.tce_free = tce_free_pSeries; -- cgit v1.2.3-70-g09d2 From d24f9c6999eacd3a7bc2b289e49fcb2bf2fafef2 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Fri, 24 Jun 2011 09:05:24 +0000 Subject: powerpc: Use the newly added get_required_mask dma_map_ops hook Now that the generic code has dma_map_ops set, instead of having a messy ifdef & if block in the base dma_get_required_mask hook push the computation into the dma ops. If the ops fails to set the get_required_mask hook default to the width of dma_addr_t. This also corrects ibmbus ibmebus_dma_supported to require a 64 bit mask. I doubt anything is checking or setting the dma mask on that bus. Signed-off-by: Milton Miller Signed-off-by: Nishanth Aravamudan Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Cc: benh@kernel.crashing.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/device.h | 2 ++ arch/powerpc/include/asm/dma-mapping.h | 3 --- arch/powerpc/kernel/dma-iommu.c | 3 ++- arch/powerpc/kernel/dma-swiotlb.c | 16 +++++++++++++ arch/powerpc/kernel/dma.c | 41 +++++++++++++-------------------- arch/powerpc/kernel/ibmebus.c | 8 ++++++- arch/powerpc/kernel/vio.c | 7 +++++- arch/powerpc/platforms/cell/iommu.c | 13 +++++++++-- arch/powerpc/platforms/ps3/system-bus.c | 7 ++++++ arch/powerpc/platforms/pseries/iommu.c | 2 +- 10 files changed, 68 insertions(+), 34 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 16d25c0974b..d57c08acedf 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -37,4 +37,6 @@ struct pdev_archdata { u64 dma_mask; }; +#define ARCH_HAS_DMA_GET_REQUIRED_MASK + #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 8135e66a4bb..dd70fac57ec 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -20,8 +20,6 @@ #define DMA_ERROR_CODE (~(dma_addr_t)0x0) -#define ARCH_HAS_DMA_GET_REQUIRED_MASK - /* Some dma direct funcs must be visible for use in other dma_ops */ extern void *dma_direct_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag); @@ -71,7 +69,6 @@ static inline unsigned long device_to_mask(struct device *dev) */ #ifdef CONFIG_PPC64 extern struct dma_map_ops dma_iommu_ops; -extern u64 dma_iommu_get_required_mask(struct device *dev); #endif extern struct dma_map_ops dma_direct_ops; diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 1f2a711a261..c1ad9db934f 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -90,7 +90,7 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 1; } -u64 dma_iommu_get_required_mask(struct device *dev) +static u64 dma_iommu_get_required_mask(struct device *dev) { struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; @@ -111,5 +111,6 @@ struct dma_map_ops dma_iommu_ops = { .dma_supported = dma_iommu_dma_supported, .map_page = dma_iommu_map_page, .unmap_page = dma_iommu_unmap_page, + .get_required_mask = dma_iommu_get_required_mask, }; EXPORT_SYMBOL(dma_iommu_ops); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 4295e0b94b2..1ebc9189aad 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -24,6 +24,21 @@ unsigned int ppc_swiotlb_enable; +static u64 swiotlb_powerpc_get_required(struct device *dev) +{ + u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr; + + end = memblock_end_of_DRAM(); + if (max_direct_dma_addr && end > max_direct_dma_addr) + end = max_direct_dma_addr; + end += get_dma_offset(dev); + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} + /* * At the moment, all platforms that use this code only require * swiotlb to be used if we're operating on HIGHMEM. Since @@ -44,6 +59,7 @@ struct dma_map_ops swiotlb_dma_ops = { .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, .mapping_error = swiotlb_dma_mapping_error, + .get_required_mask = swiotlb_powerpc_get_required, }; void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 503093efa20..10b136afbf5 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -96,6 +96,18 @@ static int dma_direct_dma_supported(struct device *dev, u64 mask) #endif } +static u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 end, mask; + + end = memblock_end_of_DRAM() + get_dma_offset(dev); + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} + static inline dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, @@ -144,6 +156,7 @@ struct dma_map_ops dma_direct_ops = { .dma_supported = dma_direct_dma_supported, .map_page = dma_direct_map_page, .unmap_page = dma_direct_unmap_page, + .get_required_mask = dma_direct_get_required_mask, #ifdef CONFIG_NOT_COHERENT_CACHE .sync_single_for_cpu = dma_direct_sync_single, .sync_single_for_device = dma_direct_sync_single, @@ -173,7 +186,6 @@ EXPORT_SYMBOL(dma_set_mask); u64 dma_get_required_mask(struct device *dev) { struct dma_map_ops *dma_ops = get_dma_ops(dev); - u64 mask, end = 0; if (ppc_md.dma_get_required_mask) return ppc_md.dma_get_required_mask(dev); @@ -181,31 +193,10 @@ u64 dma_get_required_mask(struct device *dev) if (unlikely(dma_ops == NULL)) return 0; -#ifdef CONFIG_PPC64 - else if (dma_ops == &dma_iommu_ops) - return dma_iommu_get_required_mask(dev); -#endif -#ifdef CONFIG_SWIOTLB - else if (dma_ops == &swiotlb_dma_ops) { - u64 max_direct_dma_addr = dev->archdata.max_direct_dma_addr; - - end = memblock_end_of_DRAM(); - if (max_direct_dma_addr && end > max_direct_dma_addr) - end = max_direct_dma_addr; - end += get_dma_offset(dev); - } -#endif - else if (dma_ops == &dma_direct_ops) - end = memblock_end_of_DRAM() + get_dma_offset(dev); - else { - WARN_ONCE(1, "%s: unknown ops %p\n", __func__, dma_ops); - end = memblock_end_of_DRAM(); - } + if (dma_ops->get_required_mask) + return dma_ops->get_required_mask(dev); - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; + return DMA_BIT_MASK(8 * sizeof(dma_addr_t)); } EXPORT_SYMBOL_GPL(dma_get_required_mask); diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index 28581f1ad2c..90ef2a44613 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -125,7 +125,12 @@ static void ibmebus_unmap_sg(struct device *dev, static int ibmebus_dma_supported(struct device *dev, u64 mask) { - return 1; + return mask == DMA_BIT_MASK(64); +} + +static u64 ibmebus_dma_get_required_mask(struct device *dev) +{ + return DMA_BIT_MASK(64); } static struct dma_map_ops ibmebus_dma_ops = { @@ -134,6 +139,7 @@ static struct dma_map_ops ibmebus_dma_ops = { .map_sg = ibmebus_map_sg, .unmap_sg = ibmebus_unmap_sg, .dma_supported = ibmebus_dma_supported, + .get_required_mask = ibmebus_dma_get_required_mask, .map_page = ibmebus_map_page, .unmap_page = ibmebus_unmap_page, }; diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 1b695fdc362..c0493259d13 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -605,6 +605,11 @@ static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask) return dma_iommu_ops.dma_supported(dev, mask); } +static u64 vio_dma_get_required_mask(struct device *dev) +{ + return dma_iommu_ops.get_required_mask(dev); +} + struct dma_map_ops vio_dma_mapping_ops = { .alloc_coherent = vio_dma_iommu_alloc_coherent, .free_coherent = vio_dma_iommu_free_coherent, @@ -613,7 +618,7 @@ struct dma_map_ops vio_dma_mapping_ops = { .map_page = vio_dma_iommu_map_page, .unmap_page = vio_dma_iommu_unmap_page, .dma_supported = vio_dma_iommu_dma_supported, - + .get_required_mask = vio_dma_get_required_mask, }; /** diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 5ef55f3b098..fc46fcac392 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -1161,11 +1161,20 @@ __setup("iommu_fixed=", setup_iommu_fixed); static u64 cell_dma_get_required_mask(struct device *dev) { + struct dma_map_ops *dma_ops; + if (!dev->dma_mask) return 0; - if (iommu_fixed_disabled && get_dma_ops(dev) == &dma_iommu_ops) - return dma_iommu_get_required_mask(dev); + if (!iommu_fixed_disabled && + cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) + return DMA_BIT_MASK(64); + + dma_ops = get_dma_ops(dev); + if (dma_ops->get_required_mask) + return dma_ops->get_required_mask(dev); + + WARN_ONCE(1, "no get_required_mask in %p ops", dma_ops); return DMA_BIT_MASK(64); } diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 23083c39752..688141c76e0 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -695,12 +695,18 @@ static int ps3_dma_supported(struct device *_dev, u64 mask) return mask >= DMA_BIT_MASK(32); } +static u64 ps3_dma_get_required_mask(struct device *_dev) +{ + return DMA_BIT_MASK(32); +} + static struct dma_map_ops ps3_sb_dma_ops = { .alloc_coherent = ps3_alloc_coherent, .free_coherent = ps3_free_coherent, .map_sg = ps3_sb_map_sg, .unmap_sg = ps3_sb_unmap_sg, .dma_supported = ps3_dma_supported, + .get_required_mask = ps3_dma_get_required_mask, .map_page = ps3_sb_map_page, .unmap_page = ps3_unmap_page, }; @@ -711,6 +717,7 @@ static struct dma_map_ops ps3_ioc0_dma_ops = { .map_sg = ps3_ioc0_map_sg, .unmap_sg = ps3_ioc0_unmap_sg, .dma_supported = ps3_dma_supported, + .get_required_mask = ps3_dma_get_required_mask, .map_page = ps3_ioc0_map_page, .unmap_page = ps3_unmap_page, }; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index fe5ededf0d6..9f121a37eb5 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1099,7 +1099,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev) return DMA_BIT_MASK(64); } - return dma_iommu_get_required_mask(dev); + return dma_iommu_ops.get_required_mask(dev); } #else /* CONFIG_PCI */ -- cgit v1.2.3-70-g09d2 From 41151e77a4d96ea138cede6d84c955aa4769ce74 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Tue, 28 Jun 2011 09:54:48 +0000 Subject: powerpc: Hugetlb for BookE Enable hugepages on Freescale BookE processors. This allows the kernel to use huge TLB entries to map pages, which can greatly reduce the number of TLB misses and the amount of TLB thrashing experienced by applications with large memory footprints. Care should be taken when using this on FSL processors, as the number of large TLB entries supported by the core is low (16-64) on current processors. The supported set of hugepage sizes include 4m, 16m, 64m, 256m, and 1g. Page sizes larger than the max zone size are called "gigantic" pages and must be allocated on the command line (and cannot be deallocated). This is currently only fully implemented for Freescale 32-bit BookE processors, but there is some infrastructure in the code for 64-bit BooKE. Signed-off-by: Becky Bruce Signed-off-by: David Gibson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 3 +- arch/powerpc/include/asm/hugetlb.h | 63 +++++- arch/powerpc/include/asm/mmu-book3e.h | 7 + arch/powerpc/include/asm/mmu-hash64.h | 3 +- arch/powerpc/include/asm/mmu.h | 18 +- arch/powerpc/include/asm/page.h | 31 ++- arch/powerpc/include/asm/page_64.h | 11 - arch/powerpc/include/asm/pte-book3e.h | 3 + arch/powerpc/kernel/head_fsl_booke.S | 133 ++++++++++-- arch/powerpc/mm/Makefile | 1 + arch/powerpc/mm/hash_utils_64.c | 3 - arch/powerpc/mm/hugetlbpage-book3e.c | 121 +++++++++++ arch/powerpc/mm/hugetlbpage.c | 379 +++++++++++++++++++++++++++++---- arch/powerpc/mm/init_32.c | 9 + arch/powerpc/mm/mem.c | 5 + arch/powerpc/mm/mmu_context_nohash.c | 5 + arch/powerpc/mm/pgtable.c | 3 +- arch/powerpc/mm/tlb_low_64e.S | 24 +-- arch/powerpc/mm/tlb_nohash.c | 46 +++- arch/powerpc/platforms/Kconfig.cputype | 4 +- 20 files changed, 766 insertions(+), 106 deletions(-) create mode 100644 arch/powerpc/mm/hugetlbpage-book3e.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0a3d5560c9b..62711421cd6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -429,8 +429,7 @@ config ARCH_POPULATES_NODE_MAP def_bool y config SYS_SUPPORTS_HUGETLBFS - def_bool y - depends on PPC_BOOK3S_64 + bool source "mm/Kconfig" diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 5856a66ab40..86004930a78 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -1,15 +1,60 @@ #ifndef _ASM_POWERPC_HUGETLB_H #define _ASM_POWERPC_HUGETLB_H +#ifdef CONFIG_HUGETLB_PAGE #include +extern struct kmem_cache *hugepte_cache; +extern void __init reserve_hugetlb_gpages(void); + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + BUG_ON(!hugepd_ok(hpd)); + return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | PD_HUGE); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return hpd.pd & HUGEPD_SHIFT_MASK; +} + +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, + unsigned pdshift) +{ + /* + * On 32-bit, we have multiple higher-level table entries that point to + * the same hugepte. Just use the first one since they're all + * identical. So for that case, idx=0. + */ + unsigned long idx = 0; + + pte_t *dir = hugepd_page(*hpdp); +#ifdef CONFIG_PPC64 + idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); +#endif + + return dir + idx; +} + pte_t *huge_pte_offset_and_shift(struct mm_struct *mm, unsigned long addr, unsigned *shift); void flush_dcache_icache_hugepage(struct page *page); +#if defined(CONFIG_PPC_MM_SLICES) || defined(CONFIG_PPC_SUBPAGE_PROT) int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len); +#else +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + return 0; +} +#endif + +void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte); +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, @@ -50,8 +95,11 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); - return __pte(old); +#ifdef CONFIG_PPC64 + return __pte(pte_update(mm, addr, ptep, ~0UL, 1)); +#else + return __pte(pte_update(ptep, ~0UL, 0)); +#endif } static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -93,4 +141,15 @@ static inline void arch_release_hugepage(struct page *page) { } +#else /* ! CONFIG_HUGETLB_PAGE */ +static inline void reserve_hugetlb_gpages(void) +{ + pr_err("Cannot reserve gpages without hugetlb enabled\n"); +} +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ +} +#endif + #endif /* _ASM_POWERPC_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 3ea0f9a259d..0260ea5ec3c 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -66,6 +66,7 @@ #define MAS2_M 0x00000004 #define MAS2_G 0x00000002 #define MAS2_E 0x00000001 +#define MAS2_WIMGE_MASK 0x0000001f #define MAS2_EPN_MASK(size) (~0 << (size + 10)) #define MAS2_VAL(addr, size, flags) ((addr) & MAS2_EPN_MASK(size) | (flags)) @@ -80,6 +81,7 @@ #define MAS3_SW 0x00000004 #define MAS3_UR 0x00000002 #define MAS3_SR 0x00000001 +#define MAS3_BAP_MASK 0x0000003f #define MAS3_SPSIZE 0x0000003e #define MAS3_SPSIZE_SHIFT 1 @@ -212,6 +214,11 @@ typedef struct { unsigned int id; unsigned int active; unsigned long vdso_base; +#ifdef CONFIG_PPC_MM_SLICES + u64 low_slices_psize; /* SLB page size encodings */ + u64 high_slices_psize; /* 4 bits per slice for now */ + u16 user_psize; /* page size index */ +#endif } mm_context_t; /* Page size definitions, common between 32 and 64-bit diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index b445e0af4c2..db645ec842b 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -262,8 +262,7 @@ extern void hash_failure_debug(unsigned long ea, unsigned long access, extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long pstart, unsigned long prot, int psize, int ssize); -extern void add_gpage(unsigned long addr, unsigned long page_size, - unsigned long number_of_pages); +extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); extern void hpte_init_native(void); diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 698b3063868..f0145522cfb 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -175,14 +175,16 @@ extern u64 ppc64_rma_size; #define MMU_PAGE_64K_AP 3 /* "Admixed pages" (hash64 only) */ #define MMU_PAGE_256K 4 #define MMU_PAGE_1M 5 -#define MMU_PAGE_8M 6 -#define MMU_PAGE_16M 7 -#define MMU_PAGE_256M 8 -#define MMU_PAGE_1G 9 -#define MMU_PAGE_16G 10 -#define MMU_PAGE_64G 11 -#define MMU_PAGE_COUNT 12 - +#define MMU_PAGE_4M 6 +#define MMU_PAGE_8M 7 +#define MMU_PAGE_16M 8 +#define MMU_PAGE_64M 9 +#define MMU_PAGE_256M 10 +#define MMU_PAGE_1G 11 +#define MMU_PAGE_16G 12 +#define MMU_PAGE_64G 13 + +#define MMU_PAGE_COUNT 14 #if defined(CONFIG_PPC_STD_MMU_64) /* 64-bit classic hash table MMU */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 2cd664ef0a5..dd9c4fd038e 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -36,6 +36,18 @@ #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +extern unsigned int HPAGE_SHIFT; +#else +#define HPAGE_SHIFT PAGE_SHIFT +#endif +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1) +#endif + /* We do define AT_SYSINFO_EHDR but don't use the gate mechanism */ #define __HAVE_ARCH_GATE_AREA 1 @@ -158,6 +170,24 @@ extern phys_addr_t kernstart_addr; #define is_kernel_addr(x) ((x) >= PAGE_OFFSET) #endif +/* + * Use the top bit of the higher-level page table entries to indicate whether + * the entries we point to contain hugepages. This works because we know that + * the page tables live in kernel space. If we ever decide to support having + * page tables at arbitrary addresses, this breaks and will have to change. + */ +#ifdef CONFIG_PPC64 +#define PD_HUGE 0x8000000000000000 +#else +#define PD_HUGE 0x80000000 +#endif + +/* + * Some number of bits at the level of the page table that points to + * a hugepte are used to encode the size. This masks those bits. + */ +#define HUGEPD_SHIFT_MASK 0x3f + #ifndef __ASSEMBLY__ #undef STRICT_MM_TYPECHECKS @@ -243,7 +273,6 @@ typedef unsigned long pgprot_t; #endif typedef struct { signed long pd; } hugepd_t; -#define HUGEPD_SHIFT_MASK 0x3f #ifdef CONFIG_HUGETLB_PAGE static inline int hugepd_ok(hugepd_t hpd) diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 9356262fd3c..fb40ede6bc0 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -64,17 +64,6 @@ extern void copy_page(void *to, void *from); /* Log 2 of page table size */ extern u64 ppc64_pft_size; -/* Large pages size */ -#ifdef CONFIG_HUGETLB_PAGE -extern unsigned int HPAGE_SHIFT; -#else -#define HPAGE_SHIFT PAGE_SHIFT -#endif -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) -#define HPAGE_MASK (~(HPAGE_SIZE - 1)) -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1) - #endif /* __ASSEMBLY__ */ #ifdef CONFIG_PPC_MM_SLICES diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 082d515930a..0156702ba24 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -72,6 +72,9 @@ #define PTE_RPN_SHIFT (24) #endif +#define PTE_WIMGE_SHIFT (19) +#define PTE_BAP_SHIFT (2) + /* On 32-bit, we never clear the top part of the PTE */ #ifdef CONFIG_PPC32 #define _PTE_NONE_MASK 0xffffffff00000000ULL diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 50845924b7d..4ea9bfbf67e 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -236,8 +236,24 @@ _ENTRY(__early_start) * if we find the pte (fall through): * r11 is low pte word * r12 is pointer to the pte + * r10 is the pshift from the PGD, if we're a hugepage */ #ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_HUGETLB_PAGE +#define FIND_PTE \ + rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ + lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ + rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ + blt 1000f; /* Normal non-huge page */ \ + beq 2f; /* Bail if no table */ \ + oris r11, r11, PD_HUGE@h; /* Put back address bit */ \ + andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \ + xor r12, r10, r11; /* drop size bits from pointer */ \ + b 1001f; \ +1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ + li r10, 0; /* clear r10 */ \ +1001: lwz r11, 4(r12); /* Get pte entry */ +#else #define FIND_PTE \ rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ @@ -245,7 +261,8 @@ _ENTRY(__early_start) beq 2f; /* Bail if no table */ \ rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ lwz r11, 4(r12); /* Get pte entry */ -#else +#endif /* HUGEPAGE */ +#else /* !PTE_64BIT */ #define FIND_PTE \ rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \ lwz r11, 0(r11); /* Get L1 entry */ \ @@ -402,8 +419,8 @@ interrupt_base: #ifdef CONFIG_PTE_64BIT #ifdef CONFIG_SMP - subf r10,r11,r12 /* create false data dep */ - lwzx r13,r11,r10 /* Get upper pte bits */ + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ #else lwz r13,0(r12) /* Get upper pte bits */ #endif @@ -483,8 +500,8 @@ interrupt_base: #ifdef CONFIG_PTE_64BIT #ifdef CONFIG_SMP - subf r10,r11,r12 /* create false data dep */ - lwzx r13,r11,r10 /* Get upper pte bits */ + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ #else lwz r13,0(r12) /* Get upper pte bits */ #endif @@ -548,7 +565,7 @@ interrupt_base: /* * Both the instruction and data TLB miss get to this * point to load the TLB. - * r10 - available to use + * r10 - tsize encoding (if HUGETLB_PAGE) or available to use * r11 - TLB (info from Linux PTE) * r12 - available to use * r13 - upper bits of PTE (if PTE_64BIT) or available to use @@ -558,21 +575,73 @@ interrupt_base: * Upon exit, we reload everything and RFI. */ finish_tlb_load: +#ifdef CONFIG_HUGETLB_PAGE + cmpwi 6, r10, 0 /* check for huge page */ + beq 6, finish_tlb_load_cont /* !huge */ + + /* Alas, we need more scratch registers for hugepages */ + mfspr r12, SPRN_SPRG_THREAD + stw r14, THREAD_NORMSAVE(4)(r12) + stw r15, THREAD_NORMSAVE(5)(r12) + stw r16, THREAD_NORMSAVE(6)(r12) + stw r17, THREAD_NORMSAVE(7)(r12) + + /* Get the next_tlbcam_idx percpu var */ +#ifdef CONFIG_SMP + lwz r12, THREAD_INFO-THREAD(r12) + lwz r15, TI_CPU(r12) + lis r14, __per_cpu_offset@h + ori r14, r14, __per_cpu_offset@l + rlwinm r15, r15, 2, 0, 29 + lwzx r16, r14, r15 +#else + li r16, 0 +#endif + lis r17, next_tlbcam_idx@h + ori r17, r17, next_tlbcam_idx@l + add r17, r17, r16 /* r17 = *next_tlbcam_idx */ + lwz r15, 0(r17) /* r15 = next_tlbcam_idx */ + + lis r14, MAS0_TLBSEL(1)@h /* select TLB1 (TLBCAM) */ + rlwimi r14, r15, 16, 4, 15 /* next_tlbcam_idx entry */ + mtspr SPRN_MAS0, r14 + + /* Extract TLB1CFG(NENTRY) */ + mfspr r16, SPRN_TLB1CFG + andi. r16, r16, 0xfff + + /* Update next_tlbcam_idx, wrapping when necessary */ + addi r15, r15, 1 + cmpw r15, r16 + blt 100f + lis r14, tlbcam_index@h + ori r14, r14, tlbcam_index@l + lwz r15, 0(r14) +100: stw r15, 0(r17) + + /* + * Calc MAS1_TSIZE from r10 (which has pshift encoded) + * tlb_enc = (pshift - 10). + */ + subi r15, r10, 10 + mfspr r16, SPRN_MAS1 + rlwimi r16, r15, 7, 20, 24 + mtspr SPRN_MAS1, r16 + + /* copy the pshift for use later */ + mr r14, r10 + + /* fall through */ + +#endif /* CONFIG_HUGETLB_PAGE */ + /* * We set execute, because we don't have the granularity to * properly set this at the page level (Linux problem). * Many of these bits are software only. Bits we don't set * here we (properly should) assume have the appropriate value. */ - - mfspr r12, SPRN_MAS2 -#ifdef CONFIG_PTE_64BIT - rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ -#else - rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ -#endif - mtspr SPRN_MAS2, r12 - +finish_tlb_load_cont: #ifdef CONFIG_PTE_64BIT rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */ andi. r10, r11, _PAGE_DIRTY @@ -581,22 +650,40 @@ finish_tlb_load: andc r12, r12, r10 1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */ rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */ - mtspr SPRN_MAS3, r12 +2: mtspr SPRN_MAS3, r12 BEGIN_MMU_FTR_SECTION srwi r10, r13, 12 /* grab RPN[12:31] */ mtspr SPRN_MAS7, r10 END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) #else li r10, (_PAGE_EXEC | _PAGE_PRESENT) + mr r13, r11 rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ and r12, r11, r10 andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ slwi r10, r12, 1 or r10, r10, r12 iseleq r12, r12, r10 - rlwimi r11, r12, 0, 20, 31 /* Extract RPN from PTE and merge with perms */ - mtspr SPRN_MAS3, r11 + rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */ + mtspr SPRN_MAS3, r13 #endif + + mfspr r12, SPRN_MAS2 +#ifdef CONFIG_PTE_64BIT + rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ +#else + rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ +#endif +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 3f /* don't mask if page isn't huge */ + li r13, 1 + slw r13, r13, r14 + subi r13, r13, 1 + rlwinm r13, r13, 0, 0, 19 /* bottom bits used for WIMGE/etc */ + andc r12, r12, r13 /* mask off ea bits within the page */ +#endif +3: mtspr SPRN_MAS2, r12 + #ifdef CONFIG_E200 /* Round robin TLB1 entries assignment */ mfspr r12, SPRN_MAS0 @@ -622,11 +709,19 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) mtspr SPRN_MAS0,r12 #endif /* CONFIG_E200 */ +tlb_write_entry: tlbwe /* Done...restore registers and get out of here. */ mfspr r10, SPRN_SPRG_THREAD - lwz r11, THREAD_NORMSAVE(3)(r10) +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 8f /* skip restore for 4k page faults */ + lwz r14, THREAD_NORMSAVE(4)(r10) + lwz r15, THREAD_NORMSAVE(5)(r10) + lwz r16, THREAD_NORMSAVE(6)(r10) + lwz r17, THREAD_NORMSAVE(7)(r10) +#endif +8: lwz r11, THREAD_NORMSAVE(3)(r10) mtcr r11 lwz r13, THREAD_NORMSAVE(2)(r10) lwz r12, THREAD_NORMSAVE(1)(r10) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index bdca46e0838..991ee813d2a 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-y += hugetlbpage.o obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 26b2872b3d0..1f8b2a05e3d 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -105,9 +105,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M; int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; EXPORT_SYMBOL_GPL(mmu_slb_size); -#ifdef CONFIG_HUGETLB_PAGE -unsigned int HPAGE_SHIFT; -#endif #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c new file mode 100644 index 00000000000..1295b7c1cda --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -0,0 +1,121 @@ +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include +#include + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { + asm volatile( + "li %0,0\n" + "tlbsx. 0,%1\n" + "bne 1f\n" + "li %0,1\n" + "1:\n" + : "=&r"(found) : "r"(ea)); + } else { + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + } + + return found; +} + +void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + +#ifdef CONFIG_PPC_FSL_BOOK3E + int index, lz, ncams; + struct vm_area_struct *vma; +#endif + + if (unlikely(is_kernel_addr(ea))) + return; + +#ifdef CONFIG_MM_SLICES + psize = mmu_get_tsize(get_slice_psize(mm, ea)); + tsize = mmu_get_psize(psize); + shift = mmu_psize_defs[psize].shift; +#else + vma = find_vma(mm, ea); + psize = vma_mmu_pagesize(vma); /* returns actual size in bytes */ + asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (psize)); + shift = 31 - lz; + tsize = 21 - lz; +#endif + + /* + * We can't be interrupted while we're setting up the MAS + * regusters or after we've confirmed that no tlb exists. + */ + local_irq_save(flags); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + local_irq_restore(flags); + return; + } + +#ifdef CONFIG_PPC_FSL_BOOK3E + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = __get_cpu_var(next_tlbcam_idx); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __get_cpu_var(next_tlbcam_idx) = tlbcam_index; + else + __get_cpu_var(next_tlbcam_idx)++; +#endif + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { + mtspr(SPRN_MAS7_MAS3, mas7_3); + } else { + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + } + + asm volatile ("tlbwe"); + + local_irq_restore(flags); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0); + +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0b9a5c1901b..3a5f59dcbb3 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1,7 +1,8 @@ /* - * PPC64 (POWER4) Huge TLB Page Support for Kernel. + * PPC Huge TLB Page Support for Kernel. * * Copyright (C) 2003 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor * * Based on the IA-32 version: * Copyright (C) 2002, Rohit Seth @@ -11,24 +12,39 @@ #include #include #include +#include +#include +#include #include #include #include +#include #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_16M 24 #define PAGE_SHIFT_16G 34 -#define MAX_NUMBER_GPAGES 1024 +unsigned int HPAGE_SHIFT; -/* Tracks the 16G pages after the device tree is scanned and before the - * huge_boot_pages list is ready. */ -static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; +/* + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready. On 64-bit implementations, this is + * just used to track 16G pages and so is a single array. 32-bit + * implementations may have more than one gpage size due to limitations + * of the memory allocators, so we need multiple arrays + */ +#ifdef CONFIG_PPC64 +#define MAX_NUMBER_GPAGES 1024 +static u64 gpage_freearray[MAX_NUMBER_GPAGES]; static unsigned nr_gpages; - -/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() - * will choke on pointers to hugepte tables, which is handy for - * catching screwups early. */ +#else +#define MAX_NUMBER_GPAGES 128 +struct psize_gpages { + u64 gpage_list[MAX_NUMBER_GPAGES]; + unsigned int nr_gpages; +}; +static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; +#endif static inline int shift_to_mmu_psize(unsigned int shift) { @@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) #define hugepd_none(hpd) ((hpd).pd == 0) -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); - return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return hpd.pd & HUGEPD_SHIFT_MASK; -} - -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) -{ - unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); - pte_t *dir = hugepd_page(*hpdp); - - return dir + idx; -} - pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) { pgd_t *pg; @@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift if (is_hugepd(pm)) hpdp = (hugepd_t *)pm; else if (!pmd_none(*pm)) { - return pte_offset_map(pm, ea); + return pte_offset_kernel(pm, ea); } } } @@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, unsigned long address, unsigned pdshift, unsigned pshift) { - pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), - GFP_KERNEL|__GFP_REPEAT); + struct kmem_cache *cachep; + pte_t *new; + +#ifdef CONFIG_PPC64 + cachep = PGT_CACHE(pdshift - pshift); +#else + int i; + int num_hugepd = 1 << (pshift - pdshift); + cachep = hugepte_cache; +#endif + + new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); @@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, return -ENOMEM; spin_lock(&mm->page_table_lock); +#ifdef CONFIG_PPC64 if (!hugepd_none(*hpdp)) - kmem_cache_free(PGT_CACHE(pdshift - pshift), new); + kmem_cache_free(cachep, new); else - hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; +#else + /* + * We have multiple higher-level entries that point to the same + * actual pte location. Fill in each as we go and backtrack on error. + * We need all of these so the DTLB pgtable walk code can find the + * right higher-level entry without knowing if it's a hugepage or not. + */ + for (i = 0; i < num_hugepd; i++, hpdp++) { + if (unlikely(!hugepd_none(*hpdp))) + break; + else + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; + } + /* If we bailed from the for loop early, an error occurred, clean up */ + if (i < num_hugepd) { + for (i = i - 1 ; i >= 0; i--, hpdp--) + hpdp->pd = 0; + kmem_cache_free(cachep, new); + } +#endif spin_unlock(&mm->page_table_lock); return 0; } @@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz return hugepte_offset(hpdp, addr, pdshift); } +#ifdef CONFIG_PPC32 /* Build list of addresses of gigantic pages. This function is used in early * boot before the buddy or bootmem allocator is setup. */ -void add_gpage(unsigned long addr, unsigned long page_size, - unsigned long number_of_pages) +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +{ + unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); + int i; + + if (addr == 0) + return; + + gpage_freearray[idx].nr_gpages = number_of_pages; + + for (i = 0; i < number_of_pages; i++) { + gpage_freearray[idx].gpage_list[i] = addr; + addr += page_size; + } +} + +/* + * Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT); + int nr_gpages = gpage_freearray[idx].nr_gpages; + + if (nr_gpages == 0) + return 0; + +#ifdef CONFIG_HIGHMEM + /* + * If gpages can be in highmem we can't use the trick of storing the + * data structure in the page; allocate space for this + */ + m = alloc_bootmem(sizeof(struct huge_bootmem_page)); + m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; +#else + m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); +#endif + + list_add(&m->list, &huge_boot_pages); + gpage_freearray[idx].nr_gpages = nr_gpages; + gpage_freearray[idx].gpage_list[nr_gpages] = 0; + m->hstate = hstate; + + return 1; +} +/* + * Scan the command line hugepagesz= options for gigantic pages; store those in + * a list that we use to allocate the memory once all options are parsed. + */ + +unsigned long gpage_npages[MMU_PAGE_COUNT]; + +static int __init do_gpage_early_setup(char *param, char *val) +{ + static phys_addr_t size; + unsigned long npages; + + /* + * The hugepagesz and hugepages cmdline options are interleaved. We + * use the size variable to keep track of whether or not this was done + * properly and skip over instances where it is incorrect. Other + * command-line parsing code will issue warnings, so we don't need to. + * + */ + if ((strcmp(param, "default_hugepagesz") == 0) || + (strcmp(param, "hugepagesz") == 0)) { + size = memparse(val, NULL); + } else if (strcmp(param, "hugepages") == 0) { + if (size != 0) { + if (sscanf(val, "%lu", &npages) <= 0) + npages = 0; + gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; + size = 0; + } + } + return 0; +} + + +/* + * This function allocates physical space for pages that are larger than the + * buddy allocator can handle. We want to allocate these in highmem because + * the amount of lowmem is limited. This means that this function MUST be + * called before lowmem_end_addr is set up in MMU_init() in order for the lmb + * allocate to grab highmem. + */ +void __init reserve_hugetlb_gpages(void) +{ + static __initdata char cmdline[COMMAND_LINE_SIZE]; + phys_addr_t size, base; + int i; + + strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup); + + /* + * Walk gpage list in reverse, allocating larger page sizes first. + * Skip over unsupported sizes, or sizes that have 0 gpages allocated. + * When we reach the point in the list where pages are no longer + * considered gpages, we're done. + */ + for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { + if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) + continue; + else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) + break; + + size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); + base = memblock_alloc_base(size * gpage_npages[i], size, + MEMBLOCK_ALLOC_ANYWHERE); + add_gpage(base, size, gpage_npages[i]); + } +} + +#else /* PPC64 */ + +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy or bootmem allocator is setup. + */ +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate) m->hstate = hstate; return 1; } +#endif int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } +#ifdef CONFIG_PPC32 +#define HUGEPD_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) + +struct hugepd_freelist { + struct rcu_head rcu; + unsigned int index; + void *ptes[0]; +}; + +static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); + +static void hugepd_free_rcu_callback(struct rcu_head *head) +{ + struct hugepd_freelist *batch = + container_of(head, struct hugepd_freelist, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + kmem_cache_free(hugepte_cache, batch->ptes[i]); + + free_page((unsigned long)batch); +} + +static void hugepd_free(struct mmu_gather *tlb, void *hugepte) +{ + struct hugepd_freelist **batchp; + + batchp = &__get_cpu_var(hugepd_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpumask_equal(mm_cpumask(tlb->mm), + cpumask_of(smp_processor_id()))) { + kmem_cache_free(hugepte_cache, hugepte); + return; + } + + if (*batchp == NULL) { + *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); + (*batchp)->index = 0; + } + + (*batchp)->ptes[(*batchp)->index++] = hugepte; + if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { + call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); + *batchp = NULL; + } +} +#endif + static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, unsigned long start, unsigned long end, unsigned long floor, unsigned long ceiling) { pte_t *hugepte = hugepd_page(*hpdp); - unsigned shift = hugepd_shift(*hpdp); + int i; + unsigned long pdmask = ~((1UL << pdshift) - 1); + unsigned int num_hugepd = 1; + +#ifdef CONFIG_PPC64 + unsigned int shift = hugepd_shift(*hpdp); +#else + /* Note: On 32-bit the hpdp may be the first of several */ + num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift)); +#endif start &= pdmask; if (start < floor) @@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif if (end - 1 > ceiling - 1) return; - hpdp->pd = 0; + for (i = 0; i < num_hugepd; i++, hpdp++) + hpdp->pd = 0; + tlb->need_flush = 1; +#ifdef CONFIG_PPC64 pgtable_free_tlb(tlb, hugepte, pdshift - shift); +#else + hugepd_free(tlb, hugepte); +#endif } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, * too. */ - pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); + pgd = pgd_offset(tlb->mm, addr); if (!is_hugepd(pgd)) { if (pgd_none_or_clear_bad(pgd)) continue; hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); } else { +#ifdef CONFIG_PPC32 + /* + * Increment next by the size of the huge mapping since + * on 32-bit there may be more than one entry at the pgd + * level for a single hugepage, but all of them point to + * the same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); +#endif free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, addr, next, floor, ceiling); } - } while (pgd++, addr = next, addr != end); + } while (addr = next, addr != end); } struct page * @@ -466,17 +690,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { +#ifdef CONFIG_MM_SLICES struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); +#else + return get_unmapped_area(file, addr, len, pgoff, flags); +#endif } unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { +#ifdef CONFIG_MM_SLICES unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); return 1UL << mmu_psize_to_shift(psize); +#else + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + return huge_page_size(hstate_vma(vma)); +#endif +} + +static inline bool is_power_of_4(unsigned long x) +{ + if (is_power_of_2(x)) + return (__ilog2(x) % 2) ? false : true; + return false; } static int __init add_huge_page_size(unsigned long long size) @@ -486,9 +728,14 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if ((size < PAGE_SIZE) || !is_power_of_4(size)) + return -EINVAL; +#else if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) return -EINVAL; +#endif if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) return -EINVAL; @@ -525,6 +772,46 @@ static int __init hugepage_setup_sz(char *str) } __setup("hugepagesz=", hugepage_setup_sz); +#ifdef CONFIG_FSL_BOOKE +struct kmem_cache *hugepte_cache; +static int __init hugetlbpage_init(void) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + + if (!mmu_psize_defs[psize].shift) + continue; + + shift = mmu_psize_to_shift(psize); + + /* Don't treat normal page sizes as huge... */ + if (shift != PAGE_SHIFT) + if (add_huge_page_size(1ULL << shift) < 0) + continue; + } + + /* + * Create a kmem cache for hugeptes. The bottom bits in the pte have + * size information encoded in them, so align them to allow this + */ + hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t), + HUGEPD_SHIFT_MASK + 1, 0, NULL); + if (hugepte_cache == NULL) + panic("%s: Unable to create kmem cache for hugeptes\n", + __func__); + + /* Default hpage size = 4M */ + if (mmu_psize_defs[MMU_PAGE_4M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; + else + panic("%s: Unable to set default huge page size\n", __func__); + + + return 0; +} +#else static int __init hugetlbpage_init(void) { int psize; @@ -567,15 +854,23 @@ static int __init hugetlbpage_init(void) return 0; } - +#endif module_init(hugetlbpage_init); void flush_dcache_icache_hugepage(struct page *page) { int i; + void *start; BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) - __flush_dcache_icache(page_address(page+i)); + for (i = 0; i < (1UL << compound_order(page)); i++) { + if (!PageHighMem(page)) { + __flush_dcache_icache(page_address(page+i)); + } else { + start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE); + __flush_dcache_icache(start); + kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + } + } } diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index c77fef56dad..161cefde5c1 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -123,6 +126,12 @@ void __init MMU_init(void) /* parse args from command line */ MMU_setup(); + /* + * Reserve gigantic pages for hugetlb. This MUST occur before + * lowmem_end_addr is initialized below. + */ + reserve_hugetlb_gpages(); + if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock.memory.cnt = 1; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index c781bbcf733..ad9cf49dfb8 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -548,4 +548,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, return; hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ +#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ + && defined(CONFIG_HUGETLB_PAGE) + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma->vm_mm, address, *ptep); +#endif } diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 336807de550..5b63bd3da4a 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -292,6 +292,11 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; +#ifdef CONFIG_PPC_MM_SLICES + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); +#endif + return 0; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index af40c8768a7..214130a4edc 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -212,7 +213,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { - if (!(vma->vm_flags & VM_HUGETLB)) + if (!is_vm_hugetlb_page(vma)) assert_pte_locked(vma->vm_mm, address); __ptep_set_access_flags(ptep, entry); flush_tlb_page_nohash(vma, address); diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 4ebb34bc01d..dc4a5f385e4 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -553,24 +553,24 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault #ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault #endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault /* Ok, we're all right, we can now create a kernel translation for * a 4K or 64K page from r16 -> r15. @@ -802,24 +802,24 @@ htw_tlb_miss: rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault #ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault #endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault /* Ok, we're all right, we can now create an indirect entry for * a 1M or 256M page. diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d32ec643c23..afc95c7304a 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -36,14 +36,49 @@ #include #include #include +#include #include #include #include +#include #include "mmu_decl.h" -#ifdef CONFIG_PPC_BOOK3E +/* + * This struct lists the sw-supported page sizes. The hardawre MMU may support + * other sizes not listed here. The .ind field is only used on MMUs that have + * indirect page table entries. + */ +#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_FSL_BOOKE +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_4M] = { + .shift = 22, + .enc = BOOK3E_PAGESZ_4M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_64M] = { + .shift = 26, + .enc = BOOK3E_PAGESZ_64M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +#else struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -77,6 +112,8 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .enc = BOOK3E_PAGESZ_1GB, }, }; +#endif /* CONFIG_FSL_BOOKE */ + static inline int mmu_get_tsize(int psize) { return mmu_psize_defs[psize].enc; @@ -87,7 +124,7 @@ static inline int mmu_get_tsize(int psize) /* This isn't used on !Book3E for now */ return 0; } -#endif +#endif /* CONFIG_PPC_BOOK3E_MMU */ /* The variables below are currently only used on 64-bit Book3E * though this will probably be made common with other nohash @@ -266,6 +303,11 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + flush_hugetlb_page(vma, vmaddr); +#endif + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, mmu_get_tsize(mmu_virtual_psize), 0); } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index e06e39589a0..a85990c886e 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -69,6 +69,7 @@ config PPC_BOOK3S_64 bool "Server processors" select PPC_FPU select PPC_HAVE_PMU_SUPPORT + select SYS_SUPPORTS_HUGETLBFS config PPC_BOOK3E_64 bool "Embedded processors" @@ -173,6 +174,7 @@ config BOOKE config FSL_BOOKE bool depends on (E200 || E500) && PPC32 + select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT default y # this is for common code between PPC32 & PPC64 FSL BOOKE @@ -296,7 +298,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if HUGETLB_PAGE || (PPC_STD_MMU_64 && PPC_64K_PAGES) + default y if (PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) default n config VIRT_CPU_ACCOUNTING -- cgit v1.2.3-70-g09d2 From 14b9247019432fc25e606b78262eb16a4a33b8ed Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Fri, 8 Jul 2011 11:12:42 +0000 Subject: powerpc/mpic: Add support for discontiguous cores There is one place in the MPIC driver that assumes that the cores are numbered from 0 to n-1. However, this is not true if the CPUs are not numbered sequentially. This can happen on a eight-core SOC where cores two and three are removed in the device tree. So instead of blindly looping, we iterate over the discovered CPUs and use the SMP ID as the index. This means that we no longer ask the MPIC how many CPUs there are, so we also delete mpic->num_cpus. We also catch if the number of CPUs in the SOC exceeds the number that the MPIC supports. This should never happen, of course, but it's good to be sure. Signed-off-by: Timur Tabi Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mpic.h | 2 -- arch/powerpc/sysdev/mpic.c | 24 +++++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index df18989e78d..e6fae49e0b7 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -273,8 +273,6 @@ struct mpic unsigned int irq_count; /* Number of sources */ unsigned int num_sources; - /* Number of CPUs */ - unsigned int num_cpus; /* default senses array */ unsigned char *senses; unsigned int senses_count; diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index d5d3ff3d757..9678081dc4e 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1285,13 +1285,11 @@ struct mpic * __init mpic_alloc(struct device_node *node, mpic_read(mpic->gregs, MPIC_INFO(GREG_GLOBAL_CONF_0)) | MPIC_GREG_GCONF_MCK); - /* Read feature register, calculate num CPUs and, for non-ISU - * MPICs, num sources as well. On ISU MPICs, sources are counted - * as ISUs are added + /* + * Read feature register. For non-ISU MPICs, num sources as well. On + * ISU MPICs, sources are counted as ISUs are added */ greg_feature = mpic_read(mpic->gregs, MPIC_INFO(GREG_FEATURE_0)); - mpic->num_cpus = ((greg_feature & MPIC_GREG_FEATURE_LAST_CPU_MASK) - >> MPIC_GREG_FEATURE_LAST_CPU_SHIFT) + 1; if (isu_size == 0) { if (flags & MPIC_BROKEN_FRR_NIRQS) mpic->num_sources = mpic->irq_count; @@ -1301,10 +1299,18 @@ struct mpic * __init mpic_alloc(struct device_node *node, >> MPIC_GREG_FEATURE_LAST_SRC_SHIFT) + 1; } + /* + * The MPIC driver will crash if there are more cores than we + * can initialize, so we may as well catch that problem here. + */ + BUG_ON(num_possible_cpus() > MPIC_MAX_CPUS); + /* Map the per-CPU registers */ - for (i = 0; i < mpic->num_cpus; i++) { - mpic_map(mpic, node, paddr, &mpic->cpuregs[i], - MPIC_INFO(CPU_BASE) + i * MPIC_INFO(CPU_STRIDE), + for_each_possible_cpu(i) { + unsigned int cpu = get_hard_smp_processor_id(i); + + mpic_map(mpic, node, paddr, &mpic->cpuregs[cpu], + MPIC_INFO(CPU_BASE) + cpu * MPIC_INFO(CPU_STRIDE), 0x1000); } @@ -1343,7 +1349,7 @@ struct mpic * __init mpic_alloc(struct device_node *node, } printk(KERN_INFO "mpic: Setting up MPIC \"%s\" version %s at %llx," " max %d CPUs\n", - name, vers, (unsigned long long)paddr, mpic->num_cpus); + name, vers, (unsigned long long)paddr, num_possible_cpus()); printk(KERN_INFO "mpic: ISU size: %d, shift: %d, mask: %x\n", mpic->isu_size, mpic->isu_shift, mpic->isu_mask); -- cgit v1.2.3-70-g09d2 From 6c493685f1b209dd4ae41eb52c818cf12da20def Mon Sep 17 00:00:00 2001 From: Jim Keniston Date: Mon, 25 Jul 2011 07:54:50 +0000 Subject: powerpc/nvram: Add compression to fit more oops output into NVRAM Capture more than twice as much text from the printk buffer, and compress it to fit it in the lnx,oops-log NVRAM partition. You can view the compressed text using the new (as of July 20) --unzip option of the nvram command in the powerpc-utils package. [BenH: Added select of ZLIB_DEFLATE] Signed-off-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/rtas.h | 6 +- arch/powerpc/platforms/pseries/Kconfig | 1 + arch/powerpc/platforms/pseries/nvram.c | 171 +++++++++++++++++++++++++++++++-- 3 files changed, 169 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 58625d1e780..41f69ae79d4 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -249,10 +249,12 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #define ERR_FLAG_ALREADY_LOGGED 0x0 #define ERR_FLAG_BOOT 0x1 /* log was pulled from NVRAM on boot */ #define ERR_TYPE_RTAS_LOG 0x2 /* from rtas event-scan */ -#define ERR_TYPE_KERNEL_PANIC 0x4 /* from panic() */ +#define ERR_TYPE_KERNEL_PANIC 0x4 /* from die()/panic() */ +#define ERR_TYPE_KERNEL_PANIC_GZ 0x8 /* ditto, compressed */ /* All the types and not flags */ -#define ERR_TYPE_MASK (ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC) +#define ERR_TYPE_MASK \ + (ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC | ERR_TYPE_KERNEL_PANIC_GZ) #define RTAS_DEBUG KERN_DEBUG "RTAS: " diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 05cf4769b88..c81f6bb9c10 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -15,6 +15,7 @@ config PPC_PSERIES select PPC_UDBG_16550 select PPC_NATIVE select PPC_PCI_CHOICE if EXPERT + select ZLIB_DEFLATE default y config PPC_SPLPAR diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 00cc3a09488..a76b22844d1 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -78,8 +80,41 @@ static struct kmsg_dumper nvram_kmsg_dumper = { #define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ static unsigned long last_unread_rtas_event; /* timestamp */ -/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */ -static char *oops_buf; +/* + * For capturing and compressing an oops or panic report... + + * big_oops_buf[] holds the uncompressed text we're capturing. + * + * oops_buf[] holds the compressed text, preceded by a prefix. + * The prefix is just a u16 holding the length of the compressed* text. + * (*Or uncompressed, if compression fails.) oops_buf[] gets written + * to NVRAM. + * + * oops_len points to the prefix. oops_data points to the compressed text. + * + * +- oops_buf + * | +- oops_data + * v v + * +------------+-----------------------------------------------+ + * | length | text | + * | (2 bytes) | (oops_data_sz bytes) | + * +------------+-----------------------------------------------+ + * ^ + * +- oops_len + * + * We preallocate these buffers during init to avoid kmalloc during oops/panic. + */ +static size_t big_oops_buf_sz; +static char *big_oops_buf, *oops_buf; +static u16 *oops_len; +static char *oops_data; +static size_t oops_data_sz; + +/* Compression parameters */ +#define COMPR_LEVEL 6 +#define WINDOW_BITS 12 +#define MEM_LEVEL 4 +static struct z_stream_s stream; static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) { @@ -387,11 +422,44 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists) sizeof(rtas_log_partition)); } oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL); + if (!oops_buf) { + pr_err("nvram: No memory for %s partition\n", + oops_log_partition.name); + return; + } + oops_len = (u16*) oops_buf; + oops_data = oops_buf + sizeof(u16); + oops_data_sz = oops_log_partition.size - sizeof(u16); + + /* + * Figure compression (preceded by elimination of each line's + * severity prefix) will reduce the oops/panic report to at most + * 45% of its original size. + */ + big_oops_buf_sz = (oops_data_sz * 100) / 45; + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + stream.workspace = kmalloc(zlib_deflate_workspacesize( + WINDOW_BITS, MEM_LEVEL), GFP_KERNEL); + if (!stream.workspace) { + pr_err("nvram: No memory for compression workspace; " + "skipping compression of %s partition data\n", + oops_log_partition.name); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed %s data; " + "skipping compression\n", oops_log_partition.name); + stream.workspace = NULL; + } + rc = kmsg_dump_register(&nvram_kmsg_dumper); if (rc != 0) { pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); kfree(oops_buf); - return; + kfree(big_oops_buf); + kfree(stream.workspace); } } @@ -473,7 +541,83 @@ static int clobbering_unread_rtas_event(void) NVRAM_RTAS_READ_TIMEOUT); } -/* our kmsg_dump callback */ +/* Squeeze out each line's severity prefix. */ +static size_t elide_severities(char *buf, size_t len) +{ + char *in, *out, *buf_end = buf + len; + /* Assume a at the very beginning marks the start of a line. */ + int newline = 1; + + in = out = buf; + while (in < buf_end) { + if (newline && in+3 <= buf_end && + *in == '<' && isdigit(in[1]) && in[2] == '>') { + in += 3; + newline = 0; + } else { + newline = (*in == '\n'); + *out++ = *in++; + } + } + return out - buf; +} + +/* Derived from logfs_compress() */ +static int nvram_compress(const void *in, void *out, size_t inlen, + size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, + MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +/* Compress the text from big_oops_buf into oops_buf. */ +static int zip_oops(size_t text_len) +{ + int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len, + oops_data_sz); + if (zipped_len < 0) { + pr_err("nvram: compression failed; returned %d\n", zipped_len); + pr_err("nvram: logging uncompressed oops/panic report\n"); + return -1; + } + *oops_len = (u16) zipped_len; + return 0; +} + +/* + * This is our kmsg_dump callback, called after an oops or panic report + * has been written to the printk buffer. We want to capture as much + * of the printk buffer as possible. First, capture as much as we can + * that we think will compress sufficiently to fit in the lnx,oops-log + * partition. If that's too much, go back and capture uncompressed text. + */ static void oops_to_nvram(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, const char *old_msgs, unsigned long old_len, @@ -482,6 +626,8 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, static unsigned int oops_count = 0; static bool panicking = false; size_t text_len; + unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ; + int rc = -1; switch (reason) { case KMSG_DUMP_RESTART: @@ -509,8 +655,19 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, if (clobbering_unread_rtas_event()) return; - text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len, - oops_buf, oops_log_partition.size); + if (big_oops_buf) { + text_len = capture_last_msgs(old_msgs, old_len, + new_msgs, new_len, big_oops_buf, big_oops_buf_sz); + text_len = elide_severities(big_oops_buf, text_len); + rc = zip_oops(text_len); + } + if (rc != 0) { + text_len = capture_last_msgs(old_msgs, old_len, + new_msgs, new_len, oops_data, oops_data_sz); + err_type = ERR_TYPE_KERNEL_PANIC; + *oops_len = (u16) text_len; + } + (void) nvram_write_os_partition(&oops_log_partition, oops_buf, - (int) text_len, ERR_TYPE_KERNEL_PANIC, ++oops_count); + (int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count); } -- cgit v1.2.3-70-g09d2 From c26afe9e8591f306d79aab8071f1d34e4f60b700 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Wed, 31 Aug 2011 06:32:26 +0000 Subject: powerpc/ps3: Add gelic udbg driver Add a new udbg driver for the PS3 gelic Ehthernet device. This driver shares only a few stucture and constant definitions with the gelic Ethernet device driver, so is implemented as a stand-alone driver with no dependencies on the gelic Ethernet device driver. Signed-off-by: Hector Martin Signed-off-by: Andre Heider Signed-off-by: Geoff Levand Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig.debug | 8 + arch/powerpc/include/asm/udbg.h | 1 + arch/powerpc/kernel/udbg.c | 2 + arch/powerpc/platforms/ps3/Kconfig | 12 ++ arch/powerpc/platforms/ps3/Makefile | 1 + arch/powerpc/platforms/ps3/gelic_udbg.c | 273 ++++++++++++++++++++++++++++++++ drivers/net/ps3_gelic_net.c | 3 + drivers/net/ps3_gelic_net.h | 6 + 8 files changed, 306 insertions(+) create mode 100644 arch/powerpc/platforms/ps3/gelic_udbg.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 067cb848074..ab2335f160a 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -258,6 +258,14 @@ config PPC_EARLY_DEBUG_WSP depends on PPC_WSP select PPC_UDBG_16550 +config PPC_EARLY_DEBUG_PS3GELIC + bool "Early debugging through the PS3 Ethernet port" + depends on PPC_PS3 + select PS3GELIC_UDBG + help + Select this to enable early debugging for the PlayStation3 via + UDP broadcasts sent out through the Ethernet port. + endchoice config PPC_EARLY_DEBUG_HVSI_VTERMNO diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 93e05d1b34b..7cf796fa03f 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -54,6 +54,7 @@ extern void __init udbg_init_40x_realmode(void); extern void __init udbg_init_cpm(void); extern void __init udbg_init_usbgecko(void); extern void __init udbg_init_wsp(void); +extern void __init udbg_init_ps3gelic(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_UDBG_H */ diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index faa82c1f3f6..5b3e98e0315 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -67,6 +67,8 @@ void __init udbg_early_init(void) udbg_init_usbgecko(); #elif defined(CONFIG_PPC_EARLY_DEBUG_WSP) udbg_init_wsp(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC) + udbg_init_ps3gelic(); #endif #ifdef CONFIG_PPC_EARLY_DEBUG diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig index dfe316b161a..476d9d9b240 100644 --- a/arch/powerpc/platforms/ps3/Kconfig +++ b/arch/powerpc/platforms/ps3/Kconfig @@ -148,4 +148,16 @@ config PS3_LPM profiling support of the Cell processor with programs like oprofile and perfmon2, then say Y or M, otherwise say N. +config PS3GELIC_UDBG + bool "PS3 udbg output via UDP broadcasts on Ethernet" + depends on PPC_PS3 + help + Enables udbg early debugging output by sending broadcast UDP + via the Ethernet port (UDP port number 18194). + + This driver uses a trivial implementation and is independent + from the main network driver. + + If in doubt, say N here. + endmenu diff --git a/arch/powerpc/platforms/ps3/Makefile b/arch/powerpc/platforms/ps3/Makefile index ac1bdf844ec..02b9e636dab 100644 --- a/arch/powerpc/platforms/ps3/Makefile +++ b/arch/powerpc/platforms/ps3/Makefile @@ -2,6 +2,7 @@ obj-y += setup.o mm.o time.o hvcall.o htab.o repository.o obj-y += interrupt.o exports.o os-area.o obj-y += system-bus.o +obj-$(CONFIG_PS3GELIC_UDBG) += gelic_udbg.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SPU_BASE) += spu.o obj-y += device-init.o diff --git a/arch/powerpc/platforms/ps3/gelic_udbg.c b/arch/powerpc/platforms/ps3/gelic_udbg.c new file mode 100644 index 00000000000..20b46a19a48 --- /dev/null +++ b/arch/powerpc/platforms/ps3/gelic_udbg.c @@ -0,0 +1,273 @@ +/* + * udbg debug output routine via GELIC UDP broadcasts + * + * Copyright (C) 2007 Sony Computer Entertainment Inc. + * Copyright 2006, 2007 Sony Corporation + * Copyright (C) 2010 Hector Martin + * Copyright (C) 2011 Andre Heider + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + */ + +#include +#include +#include + +#define GELIC_BUS_ID 1 +#define GELIC_DEVICE_ID 0 +#define GELIC_DEBUG_PORT 18194 +#define GELIC_MAX_MESSAGE_SIZE 1000 + +#define GELIC_LV1_GET_MAC_ADDRESS 1 +#define GELIC_LV1_GET_VLAN_ID 4 +#define GELIC_LV1_VLAN_TX_ETHERNET_0 2 + +#define GELIC_DESCR_DMA_STAT_MASK 0xf0000000 +#define GELIC_DESCR_DMA_CARDOWNED 0xa0000000 + +#define GELIC_DESCR_TX_DMA_IKE 0x00080000 +#define GELIC_DESCR_TX_DMA_NO_CHKSUM 0x00000000 +#define GELIC_DESCR_TX_DMA_FRAME_TAIL 0x00040000 + +#define GELIC_DESCR_DMA_CMD_NO_CHKSUM (GELIC_DESCR_DMA_CARDOWNED | \ + GELIC_DESCR_TX_DMA_IKE | \ + GELIC_DESCR_TX_DMA_NO_CHKSUM) + +static u64 bus_addr; + +struct gelic_descr { + /* as defined by the hardware */ + __be32 buf_addr; + __be32 buf_size; + __be32 next_descr_addr; + __be32 dmac_cmd_status; + __be32 result_size; + __be32 valid_size; /* all zeroes for tx */ + __be32 data_status; + __be32 data_error; /* all zeroes for tx */ +} __attribute__((aligned(32))); + +struct debug_block { + struct gelic_descr descr; + u8 pkt[1520]; +} __packed; + +struct ethhdr { + u8 dest[6]; + u8 src[6]; + u16 type; +} __packed; + +struct vlantag { + u16 vlan; + u16 subtype; +} __packed; + +struct iphdr { + u8 ver_len; + u8 dscp_ecn; + u16 total_length; + u16 ident; + u16 frag_off_flags; + u8 ttl; + u8 proto; + u16 checksum; + u32 src; + u32 dest; +} __packed; + +struct udphdr { + u16 src; + u16 dest; + u16 len; + u16 checksum; +} __packed; + +static __iomem struct ethhdr *h_eth; +static __iomem struct vlantag *h_vlan; +static __iomem struct iphdr *h_ip; +static __iomem struct udphdr *h_udp; + +static __iomem char *pmsg; +static __iomem char *pmsgc; + +static __iomem struct debug_block dbg __attribute__((aligned(32))); + +static int header_size; + +static void map_dma_mem(int bus_id, int dev_id, void *start, size_t len, + u64 *real_bus_addr) +{ + s64 result; + u64 real_addr = ((u64)start) & 0x0fffffffffffffffUL; + u64 real_end = real_addr + len; + u64 map_start = real_addr & ~0xfff; + u64 map_end = (real_end + 0xfff) & ~0xfff; + u64 bus_addr = 0; + + u64 flags = 0xf800000000000000UL; + + result = lv1_allocate_device_dma_region(bus_id, dev_id, + map_end - map_start, 12, 0, + &bus_addr); + if (result) + lv1_panic(0); + + result = lv1_map_device_dma_region(bus_id, dev_id, map_start, + bus_addr, map_end - map_start, + flags); + if (result) + lv1_panic(0); + + *real_bus_addr = bus_addr + real_addr - map_start; +} + +static int unmap_dma_mem(int bus_id, int dev_id, u64 bus_addr, size_t len) +{ + s64 result; + u64 real_bus_addr; + + real_bus_addr = bus_addr & ~0xfff; + len += bus_addr - real_bus_addr; + len = (len + 0xfff) & ~0xfff; + + result = lv1_unmap_device_dma_region(bus_id, dev_id, real_bus_addr, + len); + if (result) + return result; + + return lv1_free_device_dma_region(bus_id, dev_id, real_bus_addr); +} + +static void gelic_debug_init(void) +{ + s64 result; + u64 v2; + u64 mac; + u64 vlan_id; + + result = lv1_open_device(GELIC_BUS_ID, GELIC_DEVICE_ID, 0); + if (result) + lv1_panic(0); + + map_dma_mem(GELIC_BUS_ID, GELIC_DEVICE_ID, &dbg, sizeof(dbg), + &bus_addr); + + memset(&dbg, 0, sizeof(dbg)); + + dbg.descr.buf_addr = bus_addr + offsetof(struct debug_block, pkt); + + wmb(); + + result = lv1_net_control(GELIC_BUS_ID, GELIC_DEVICE_ID, + GELIC_LV1_GET_MAC_ADDRESS, 0, 0, 0, + &mac, &v2); + if (result) + lv1_panic(0); + + mac <<= 16; + + h_eth = (struct ethhdr *)dbg.pkt; + + memset(&h_eth->dest, 0xff, 6); + memcpy(&h_eth->src, &mac, 6); + + header_size = sizeof(struct ethhdr); + + result = lv1_net_control(GELIC_BUS_ID, GELIC_DEVICE_ID, + GELIC_LV1_GET_VLAN_ID, + GELIC_LV1_VLAN_TX_ETHERNET_0, 0, 0, + &vlan_id, &v2); + if (!result) { + h_eth->type = 0x8100; + + header_size += sizeof(struct vlantag); + h_vlan = (struct vlantag *)(h_eth + 1); + h_vlan->vlan = vlan_id; + h_vlan->subtype = 0x0800; + h_ip = (struct iphdr *)(h_vlan + 1); + } else { + h_eth->type = 0x0800; + h_ip = (struct iphdr *)(h_eth + 1); + } + + header_size += sizeof(struct iphdr); + h_ip->ver_len = 0x45; + h_ip->ttl = 10; + h_ip->proto = 0x11; + h_ip->src = 0x00000000; + h_ip->dest = 0xffffffff; + + header_size += sizeof(struct udphdr); + h_udp = (struct udphdr *)(h_ip + 1); + h_udp->src = GELIC_DEBUG_PORT; + h_udp->dest = GELIC_DEBUG_PORT; + + pmsgc = pmsg = (char *)(h_udp + 1); +} + +static void gelic_debug_shutdown(void) +{ + if (bus_addr) + unmap_dma_mem(GELIC_BUS_ID, GELIC_DEVICE_ID, + bus_addr, sizeof(dbg)); + lv1_close_device(GELIC_BUS_ID, GELIC_DEVICE_ID); +} + +static void gelic_sendbuf(int msgsize) +{ + u16 *p; + u32 sum; + int i; + + dbg.descr.buf_size = header_size + msgsize; + h_ip->total_length = msgsize + sizeof(struct udphdr) + + sizeof(struct iphdr); + h_udp->len = msgsize + sizeof(struct udphdr); + + h_ip->checksum = 0; + sum = 0; + p = (u16 *)h_ip; + for (i = 0; i < 5; i++) + sum += *p++; + h_ip->checksum = ~(sum + (sum >> 16)); + + dbg.descr.dmac_cmd_status = GELIC_DESCR_DMA_CMD_NO_CHKSUM | + GELIC_DESCR_TX_DMA_FRAME_TAIL; + dbg.descr.result_size = 0; + dbg.descr.data_status = 0; + + wmb(); + + lv1_net_start_tx_dma(GELIC_BUS_ID, GELIC_DEVICE_ID, bus_addr, 0); + + while ((dbg.descr.dmac_cmd_status & GELIC_DESCR_DMA_STAT_MASK) == + GELIC_DESCR_DMA_CARDOWNED) + cpu_relax(); +} + +static void ps3gelic_udbg_putc(char ch) +{ + *pmsgc++ = ch; + if (ch == '\n' || (pmsgc-pmsg) >= GELIC_MAX_MESSAGE_SIZE) { + gelic_sendbuf(pmsgc-pmsg); + pmsgc = pmsg; + } +} + +void __init udbg_init_ps3gelic(void) +{ + gelic_debug_init(); + udbg_putc = ps3gelic_udbg_putc; +} + +void udbg_shutdown_ps3gelic(void) +{ + udbg_putc = NULL; + gelic_debug_shutdown(); +} +EXPORT_SYMBOL(udbg_shutdown_ps3gelic); diff --git a/drivers/net/ps3_gelic_net.c b/drivers/net/ps3_gelic_net.c index d82a82d9870..e743c9418ac 100644 --- a/drivers/net/ps3_gelic_net.c +++ b/drivers/net/ps3_gelic_net.c @@ -1674,6 +1674,9 @@ static int __devinit ps3_gelic_driver_probe(struct ps3_system_bus_device *dev) int result; pr_debug("%s: called\n", __func__); + + udbg_shutdown_ps3gelic(); + result = ps3_open_hv_device(dev); if (result) { diff --git a/drivers/net/ps3_gelic_net.h b/drivers/net/ps3_gelic_net.h index d3fadfbc3bc..a93df6ac190 100644 --- a/drivers/net/ps3_gelic_net.h +++ b/drivers/net/ps3_gelic_net.h @@ -359,6 +359,12 @@ static inline void *port_priv(struct gelic_port *port) return port->priv; } +#ifdef CONFIG_PPC_EARLY_DEBUG_PS3GELIC +extern void udbg_shutdown_ps3gelic(void); +#else +static inline void udbg_shutdown_ps3gelic(void) {} +#endif + extern int gelic_card_set_irq_mask(struct gelic_card *card, u64 mask); /* shared netdev ops */ extern void gelic_card_up(struct gelic_card *card); -- cgit v1.2.3-70-g09d2 From a200d8e44649de2cbb39de95f42ad4ef5dc8dc22 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 24 Jul 2011 16:33:12 +0000 Subject: powerpc/numa: Enable SD_WAKE_AFFINE in node definition When chasing a performance issue on ppc64, I noticed tasks communicating via a pipe would often end up on different nodes. It turns out SD_WAKE_AFFINE is not set in our node defition. Commit 9fcd18c9e63e (sched: re-tune balancing) enabled SD_WAKE_AFFINE in the node definition for x86 and we need a similar change for ppc64. I used lmbench lat_ctx and perf bench pipe to verify this fix. Each benchmark was run 10 times and the average taken. lmbench lat_ctx: before: 66565 ops/sec after: 204700 ops/sec 3.1x faster perf bench pipe: before: 5.6570 usecs after: 1.3470 usecs 4.2x faster Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 7ef0d90defc..6a7e7251cc0 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -73,7 +73,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 0*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ + | 1*SD_WAKE_AFFINE \ | 0*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ -- cgit v1.2.3-70-g09d2 From d4761ad2ef18ec2c9a0037d6649c0afc4a7b907d Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 24 Jul 2011 16:33:14 +0000 Subject: powerpc/numa: Increase SD_NODES_PER_DOMAIN to 32. The largest POWER7 boxes have 32 nodes. SD_NODES_PER_DOMAIN groups nodes into chunks of 16 and adds a global balancing domain (SD_ALLNODES) above it. If we bump SD_NODES_PER_DOMAIN to 32, then we avoid this extra level of balancing on our largest boxes. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 6a7e7251cc0..d1c1d312f80 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -28,6 +28,12 @@ struct device_node; */ #define RECLAIM_DISTANCE 10 +/* + * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest + * POWER7 boxes which have a maximum of 32 nodes. + */ +#define SD_NODES_PER_DOMAIN 32 + #include static inline int cpu_to_node(int cpu) -- cgit v1.2.3-70-g09d2 From 7bebcf0925f09224393a8992af706fa39aa10395 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 24 Jul 2011 16:33:15 +0000 Subject: powerpc/numa: Disable NEWIDLE balancing at node level On big POWER7 boxes we see large amounts of CPU time in system processes like workqueue and watchdog kernel threads. We currently rebalance the entire machine each time a task goes idle and this is very expensive on large machines. Disable newidle balancing at the node level and rely on the scheduler tick to rebalance across nodes. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index d1c1d312f80..13efa4d95e7 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -75,7 +75,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ + | 0*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 0*SD_BALANCE_WAKE \ -- cgit v1.2.3-70-g09d2 From e377bc5d49fdbcb5f0e559b644d806a15454d407 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 24 Jul 2011 16:33:16 +0000 Subject: powerpc/numa: Remove duplicate RECLAIM_DISTANCE definition We have two identical definitions of RECLAIM_DISTANCE, looks like the patch got applied twice. Remove one. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/topology.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 13efa4d95e7..1e104af0848 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -18,16 +18,6 @@ struct device_node; */ #define RECLAIM_DISTANCE 10 -/* - * Before going off node we want the VM to try and reclaim from the local - * node. It does this if the remote distance is larger than RECLAIM_DISTANCE. - * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of - * 20, we never reclaim and go off node straight away. - * - * To fix this we choose a smaller value of RECLAIM_DISTANCE. - */ -#define RECLAIM_DISTANCE 10 - /* * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest * POWER7 boxes which have a maximum of 32 nodes. -- cgit v1.2.3-70-g09d2 From a11940978bd598e65996b4f807cf4904793f7025 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 10 Aug 2011 20:44:24 +0000 Subject: powerpc: Fix oops when echoing bad values to /sys/devices/system/memory/probe If we echo an address the hypervisor doesn't like to /sys/devices/system/memory/probe we oops the box: # echo 0x10000000000 > /sys/devices/system/memory/probe kernel BUG at arch/powerpc/mm/hash_utils_64.c:541! The backtrace is: create_section_mapping arch_add_memory add_memory memory_probe_store sysdev_class_store sysfs_write_file vfs_write SyS_write In create_section_mapping we BUG if htab_bolt_mapping returned an error. A better approach is to return an error which will propagate back to userspace. Rerunning the test with this patch applied: # echo 0x10000000000 > /sys/devices/system/memory/probe -bash: echo: write error: Invalid argument Signed-off-by: Anton Blanchard Cc: stable@kernel.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/sparsemem.h | 2 +- arch/powerpc/mm/hash_utils_64.c | 6 +++--- arch/powerpc/mm/mem.c | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 54a47ea2c3a..0c5fa314561 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -16,7 +16,7 @@ #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_MEMORY_HOTPLUG -extern void create_section_mapping(unsigned long start, unsigned long end); +extern int create_section_mapping(unsigned long start, unsigned long end); extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_NUMA extern int hot_add_scn_to_nid(unsigned long scn_addr); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 1f8b2a05e3d..1628201c8ce 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -531,11 +531,11 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -void create_section_mapping(unsigned long start, unsigned long end) +int create_section_mapping(unsigned long start, unsigned long end) { - BUG_ON(htab_bolt_mapping(start, end, __pa(start), + return htab_bolt_mapping(start, end, __pa(start), pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize)); + mmu_kernel_ssize); } int remove_section_mapping(unsigned long start, unsigned long end) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ad9cf49dfb8..5db316cad47 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -123,7 +123,8 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); - create_section_mapping(start, start + size); + if (create_section_mapping(start, start + size)) + return -EINVAL; /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; -- cgit v1.2.3-70-g09d2 From fb82b83970a32263698e54a8779d2ce88cd3b060 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:49 +0000 Subject: powerpc/smp: More generic support for "soft hotplug" This adds more generic support for doing CPU hotplug with a simple idle loop and no actual reset of the processors. The generic smp_generic_kick_cpu() does the hotplug bringup trick if the PACA shows that the CPU has already been started at boot and we provide an accessor for the CPU state. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/smp.h | 1 + arch/powerpc/kernel/smp.c | 30 +++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 15a70b7f638..adba970ce91 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -65,6 +65,7 @@ int generic_cpu_disable(void); void generic_cpu_die(unsigned int cpu); void generic_mach_cpu_die(void); void generic_set_cpu_dead(unsigned int cpu); +int generic_check_cpu_restart(unsigned int cpu); #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 7bf2187dfd9..af7e7722eca 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -70,6 +70,10 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) + +/* State of each CPU during hotplug phases */ +static DEFINE_PER_CPU(int, cpu_state) = { 0 }; + #else static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) @@ -104,12 +108,25 @@ int __devinit smp_generic_kick_cpu(int nr) * cpu_start field to become non-zero After we set cpu_start, * the processor will continue on to secondary_start */ - paca[nr].cpu_start = 1; - smp_mb(); + if (!paca[nr].cpu_start) { + paca[nr].cpu_start = 1; + smp_mb(); + return 0; + } + +#ifdef CONFIG_HOTPLUG_CPU + /* + * Ok it's not there, so it might be soft-unplugged, let's + * try to bring it back + */ + per_cpu(cpu_state, nr) = CPU_UP_PREPARE; + smp_wmb(); + smp_send_reschedule(nr); +#endif /* CONFIG_HOTPLUG_CPU */ return 0; } -#endif +#endif /* CONFIG_PPC64 */ static irqreturn_t call_function_action(int irq, void *data) { @@ -357,8 +374,6 @@ void __devinit smp_prepare_boot_cpu(void) } #ifdef CONFIG_HOTPLUG_CPU -/* State of each CPU during hotplug phases */ -static DEFINE_PER_CPU(int, cpu_state) = { 0 }; int generic_cpu_disable(void) { @@ -406,6 +421,11 @@ void generic_set_cpu_dead(unsigned int cpu) { per_cpu(cpu_state, cpu) = CPU_DEAD; } + +int generic_check_cpu_restart(unsigned int cpu) +{ + return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; +} #endif struct create_idle { -- cgit v1.2.3-70-g09d2 From 27f4488872d9ef2a4b9aa2be58fb0789d6c0ba84 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 18:27:58 +0000 Subject: powerpc/powernv: Add OPAL takeover from PowerVM On machines supporting the OPAL firmware version 1, the system is initially booted under pHyp. We then use a special hypercall to verify if OPAL is available and if it is, we then trigger a "takeover" which disables pHyp and loads the OPAL runtime firmware, giving control to the kernel in hypervisor mode. This patch add the necessary code to detect that the OPAL takeover capability is present when running under PowerVM (aka pHyp) and perform said takeover to get hypervisor control of the processor. To perform the takeover, we must first use RTAS (within Open Firmware runtime environment) to start all processors & threads, in order to give control to OPAL on all of them. We then call the takeover hypercall on everybody, OPAL will re-enter the kernel main entry point passing it a flat device-tree. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/opal.h | 50 ++++++ arch/powerpc/kernel/head_64.S | 4 + arch/powerpc/kernel/prom_init.c | 239 +++++++++++++++++++++++-- arch/powerpc/kernel/prom_init_check.sh | 3 +- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/opal-takeover.S | 140 +++++++++++++++ 6 files changed, 419 insertions(+), 19 deletions(-) create mode 100644 arch/powerpc/include/asm/opal.h create mode 100644 arch/powerpc/platforms/powernv/opal-takeover.S (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h new file mode 100644 index 00000000000..ecdb283f8b7 --- /dev/null +++ b/arch/powerpc/include/asm/opal.h @@ -0,0 +1,50 @@ +/* + * PowerNV OPAL definitions. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef __OPAL_H +#define __OPAL_H + +/****** Takeover interface ********/ + +/* PAPR H-Call used to querty the HAL existence and/or instanciate + * it from within pHyp (tech preview only). + * + * This is exclusively used in prom_init.c + */ + +#ifndef __ASSEMBLY__ + +struct opal_takeover_args { + u64 k_image; /* r4 */ + u64 k_size; /* r5 */ + u64 k_entry; /* r6 */ + u64 k_entry2; /* r7 */ + u64 hal_addr; /* r8 */ + u64 rd_image; /* r9 */ + u64 rd_size; /* r10 */ + u64 rd_loc; /* r11 */ +}; + +extern long opal_query_takeover(u64 *hal_size, u64 *hal_align); + +extern long opal_do_takeover(struct opal_takeover_args *args); + +extern int opal_enter_rtas(struct rtas_args *args, + unsigned long data, + unsigned long entry); + + +#endif /* __ASSEMBLY__ */ + +/****** OPAL APIs ******/ + + +#endif /* __OPAL_H */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index e708abe576d..dea8191253d 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -51,6 +51,10 @@ * For pSeries or server processors: * 1. The MMU is off & open firmware is running in real mode. * 2. The kernel is entered at __start + * -or- For OPAL entry: + * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 + * with device-tree in gpr3 + * 2. Secondary processors enter at 0x60 with PIR in gpr3 * * For iSeries: * 1. The MMU is on (as it always is for iSeries) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a909f4e9343..9369287aa8c 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -185,6 +186,7 @@ static unsigned long __initdata prom_tce_alloc_end; #define PLATFORM_LPAR 0x0001 #define PLATFORM_POWERMAC 0x0400 #define PLATFORM_GENERIC 0x0500 +#define PLATFORM_OPAL 0x0600 static int __initdata of_platform; @@ -644,7 +646,7 @@ static void __init early_cmdline_parse(void) } } -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * There are two methods for telling firmware what our capabilities are. * Newer machines have an "ibm,client-architecture-support" method on the @@ -1274,6 +1276,195 @@ static void __init prom_init_mem(void) prom_printf(" ram_top : %x\n", RELOC(ram_top)); } +static void __init prom_close_stdin(void) +{ + struct prom_t *_prom = &RELOC(prom); + ihandle val; + + if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0) + call_prom("close", 1, 0, val); +} + +#ifdef CONFIG_PPC_POWERNV + +static u64 __initdata prom_opal_size; +static u64 __initdata prom_opal_align; +static int __initdata prom_rtas_start_cpu; +static u64 __initdata prom_rtas_data; +static u64 __initdata prom_rtas_entry; + +/* XXX Don't change this structure without updating opal-takeover.S */ +static struct opal_secondary_data { + s64 ack; /* 0 */ + u64 go; /* 8 */ + struct opal_takeover_args args; /* 16 */ +} opal_secondary_data; + +extern char opal_secondary_entry; + +static void prom_query_opal(void) +{ + long rc; + + prom_printf("Querying for OPAL presence... "); + rc = opal_query_takeover(&RELOC(prom_opal_size), + &RELOC(prom_opal_align)); + prom_debug("(rc = %ld) ", rc); + if (rc != 0) { + prom_printf("not there.\n"); + return; + } + RELOC(of_platform) = PLATFORM_OPAL; + prom_printf(" there !\n"); + prom_debug(" opal_size = 0x%lx\n", RELOC(prom_opal_size)); + prom_debug(" opal_align = 0x%lx\n", RELOC(prom_opal_align)); + if (RELOC(prom_opal_align) < 0x10000) + RELOC(prom_opal_align) = 0x10000; +} + +static int prom_rtas_call(int token, int nargs, int nret, int *outputs, ...) +{ + struct rtas_args rtas_args; + va_list list; + int i; + + rtas_args.token = token; + rtas_args.nargs = nargs; + rtas_args.nret = nret; + rtas_args.rets = (rtas_arg_t *)&(rtas_args.args[nargs]); + va_start(list, outputs); + for (i = 0; i < nargs; ++i) + rtas_args.args[i] = va_arg(list, rtas_arg_t); + va_end(list); + + for (i = 0; i < nret; ++i) + rtas_args.rets[i] = 0; + + opal_enter_rtas(&rtas_args, RELOC(prom_rtas_data), + RELOC(prom_rtas_entry)); + + if (nret > 1 && outputs != NULL) + for (i = 0; i < nret-1; ++i) + outputs[i] = rtas_args.rets[i+1]; + return (nret > 0)? rtas_args.rets[0]: 0; +} + +static void __init prom_opal_hold_cpus(void) +{ + int i, cnt, cpu, rc; + long j; + phandle node; + char type[64]; + u32 servers[8]; + struct prom_t *_prom = &RELOC(prom); + void *entry = (unsigned long *)&RELOC(opal_secondary_entry); + struct opal_secondary_data *data = &RELOC(opal_secondary_data); + + prom_debug("prom_opal_hold_cpus: start...\n"); + prom_debug(" - entry = 0x%x\n", entry); + prom_debug(" - data = 0x%x\n", data); + + data->ack = -1; + data->go = 0; + + /* look for cpus */ + for (node = 0; prom_next_node(&node); ) { + type[0] = 0; + prom_getprop(node, "device_type", type, sizeof(type)); + if (strcmp(type, RELOC("cpu")) != 0) + continue; + + /* Skip non-configured cpus. */ + if (prom_getprop(node, "status", type, sizeof(type)) > 0) + if (strcmp(type, RELOC("okay")) != 0) + continue; + + cnt = prom_getprop(node, "ibm,ppc-interrupt-server#s", servers, + sizeof(servers)); + if (cnt == PROM_ERROR) + break; + cnt >>= 2; + for (i = 0; i < cnt; i++) { + cpu = servers[i]; + prom_debug("CPU %d ... ", cpu); + if (cpu == _prom->cpu) { + prom_debug("booted !\n"); + continue; + } + prom_debug("starting ... "); + + /* Init the acknowledge var which will be reset by + * the secondary cpu when it awakens from its OF + * spinloop. + */ + data->ack = -1; + rc = prom_rtas_call(RELOC(prom_rtas_start_cpu), 3, 1, + NULL, cpu, entry, data); + prom_debug("rtas rc=%d ...", rc); + + for (j = 0; j < 100000000 && data->ack == -1; j++) { + HMT_low(); + mb(); + } + HMT_medium(); + if (data->ack != -1) + prom_debug("done, PIR=0x%x\n", data->ack); + else + prom_debug("timeout !\n"); + } + } + prom_debug("prom_opal_hold_cpus: end...\n"); +} + +static void prom_opal_takeover(void) +{ + struct opal_secondary_data *data = &RELOC(opal_secondary_data); + struct opal_takeover_args *args = &data->args; + u64 align = RELOC(prom_opal_align); + u64 top_addr, opal_addr; + + args->k_image = (u64)RELOC(_stext); + args->k_size = _end - _stext; + args->k_entry = 0; + args->k_entry2 = 0x60; + + top_addr = _ALIGN_UP(args->k_size, align); + + if (RELOC(prom_initrd_start) != 0) { + args->rd_image = RELOC(prom_initrd_start); + args->rd_size = RELOC(prom_initrd_end) - args->rd_image; + args->rd_loc = top_addr; + top_addr = _ALIGN_UP(args->rd_loc + args->rd_size, align); + } + + /* Pickup an address for the HAL. We want to go really high + * up to avoid problem with future kexecs. On the other hand + * we don't want to be all over the TCEs on P5IOC2 machines + * which are going to be up there too. We assume the machine + * has plenty of memory, and we ask for the HAL for now to + * be just below the 1G point, or above the initrd + */ + opal_addr = _ALIGN_DOWN(0x40000000 - RELOC(prom_opal_size), align); + if (opal_addr < top_addr) + opal_addr = top_addr; + args->hal_addr = opal_addr; + + prom_debug(" k_image = 0x%lx\n", args->k_image); + prom_debug(" k_size = 0x%lx\n", args->k_size); + prom_debug(" k_entry = 0x%lx\n", args->k_entry); + prom_debug(" k_entry2 = 0x%lx\n", args->k_entry2); + prom_debug(" hal_addr = 0x%lx\n", args->hal_addr); + prom_debug(" rd_image = 0x%lx\n", args->rd_image); + prom_debug(" rd_size = 0x%lx\n", args->rd_size); + prom_debug(" rd_loc = 0x%lx\n", args->rd_loc); + prom_printf("Performing OPAL takeover,this can take a few minutes..\n"); + prom_close_stdin(); + mb(); + data->go = 1; + for (;;) + opal_do_takeover(args); +} +#endif /* CONFIG_PPC_POWERNV */ /* * Allocate room for and instantiate RTAS @@ -1326,6 +1517,12 @@ static void __init prom_instantiate_rtas(void) prom_setprop(rtas_node, "/rtas", "linux,rtas-entry", &entry, sizeof(entry)); +#ifdef CONFIG_PPC_POWERNV + /* PowerVN takeover hack */ + RELOC(prom_rtas_data) = base; + RELOC(prom_rtas_entry) = entry; + prom_getprop(rtas_node, "start-cpu", &RELOC(prom_rtas_start_cpu), 4); +#endif prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); prom_debug("rtas size = 0x%x\n", (long)size); @@ -1543,7 +1740,7 @@ static void __init prom_hold_cpus(void) *acknowledge = (unsigned long)-1; if (reg != _prom->cpu) { - /* Primary Thread of non-boot cpu */ + /* Primary Thread of non-boot cpu or any thread */ prom_printf("starting cpu hw idx %lu... ", reg); call_prom("start-cpu", 3, 0, node, secondary_hold, reg); @@ -1652,15 +1849,6 @@ static void __init prom_init_stdout(void) prom_setprop(val, path, "linux,boot-display", NULL, 0); } -static void __init prom_close_stdin(void) -{ - struct prom_t *_prom = &RELOC(prom); - ihandle val; - - if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0) - call_prom("close", 1, 0, val); -} - static int __init prom_find_machine_type(void) { struct prom_t *_prom = &RELOC(prom); @@ -2504,6 +2692,7 @@ static void __init prom_check_initrd(unsigned long r3, unsigned long r4) #endif /* CONFIG_BLK_DEV_INITRD */ } + /* * We enter here early on, when the Open Firmware prom is still * handling exceptions and the MMU hash table for us. @@ -2565,7 +2754,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ prom_check_initrd(r3, r4); -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * On pSeries, inform the firmware about our capabilities */ @@ -2611,14 +2800,30 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, #endif /* - * On non-powermacs, try to instantiate RTAS and puts all CPUs - * in spin-loops. PowerMacs don't have a working RTAS and use - * a different way to spin CPUs + * On non-powermacs, try to instantiate RTAS. PowerMacs don't + * have a usable RTAS implementation. */ - if (RELOC(of_platform) != PLATFORM_POWERMAC) { + if (RELOC(of_platform) != PLATFORM_POWERMAC) prom_instantiate_rtas(); - prom_hold_cpus(); + +#ifdef CONFIG_PPC_POWERNV + /* Detect HAL and try instanciating it & doing takeover */ + if (RELOC(of_platform) == PLATFORM_PSERIES_LPAR) { + prom_query_opal(); + if (RELOC(of_platform) == PLATFORM_OPAL) { + prom_opal_hold_cpus(); + prom_opal_takeover(); + } } +#endif + + /* + * On non-powermacs, put all CPUs in spin-loops. + * + * PowerMacs use a different mechanism to spin CPUs + */ + if (RELOC(of_platform) != PLATFORM_POWERMAC) + prom_hold_cpus(); /* * Fill in some infos for use by the kernel later on diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 9f82f493789..20af6aada51 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -20,7 +20,8 @@ WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush _end enter_prom memcpy memset reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr logo_linux_clut224 -reloc_got2 kernstart_addr memstart_addr linux_banner" +reloc_got2 kernstart_addr memstart_addr linux_banner _stext +opal_query_takeover opal_do_takeover opal_enter_rtas opal_secondary_entry" NM="$1" OBJ="$2" diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 1c4325056b8..497133027ae 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,2 +1,2 @@ -obj-y += setup.o +obj-y += setup.o opal-takeover.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S b/arch/powerpc/platforms/powernv/opal-takeover.S new file mode 100644 index 00000000000..77b48b2b930 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-takeover.S @@ -0,0 +1,140 @@ +/* + * PowerNV OPAL takeover assembly code, for use by prom_init.c + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#define STK_PARAM(i) (48 + ((i)-3)*8) + +#define H_HAL_TAKEOVER 0x5124 +#define H_HAL_TAKEOVER_QUERY_MAGIC -1 + + .text +_GLOBAL(opal_query_takeover) + mfcr r0 + stw r0,8(r1) + std r3,STK_PARAM(r3)(r1) + std r4,STK_PARAM(r4)(r1) + li r3,H_HAL_TAKEOVER + li r4,H_HAL_TAKEOVER_QUERY_MAGIC + HVSC + ld r10,STK_PARAM(r3)(r1) + std r4,0(r10) + ld r10,STK_PARAM(r4)(r1) + std r5,0(r10) + lwz r0,8(r1) + mtcrf 0xff,r0 + blr + +_GLOBAL(opal_do_takeover) + mfcr r0 + stw r0,8(r1) + mflr r0 + std r0,16(r1) + bl __opal_do_takeover + ld r0,16(r1) + mtlr r0 + lwz r0,8(r1) + mtcrf 0xff,r0 + blr + +__opal_do_takeover: + ld r4,0(r3) + ld r5,0x8(r3) + ld r6,0x10(r3) + ld r7,0x18(r3) + ld r8,0x20(r3) + ld r9,0x28(r3) + ld r10,0x30(r3) + ld r11,0x38(r3) + li r3,H_HAL_TAKEOVER + HVSC + blr + + .globl opal_secondary_entry +opal_secondary_entry: + mr r31,r3 + mfmsr r11 + li r12,(MSR_SF | MSR_ISF)@highest + sldi r12,r12,48 + or r11,r11,r12 + mtmsrd r11 + isync + mfspr r4,SPRN_PIR + std r4,0(r3) +1: HMT_LOW + ld r4,8(r3) + cmpli cr0,r4,0 + beq 1b + HMT_MEDIUM +1: addi r3,r31,16 + bl __opal_do_takeover + b 1b + +_GLOBAL(opal_enter_rtas) + mflr r0 + std r0,16(r1) + stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ + + /* Because PROM is running in 32b mode, it clobbers the high order half + * of all registers that it saves. We therefore save those registers + * PROM might touch to the stack. (r0, r3-r13 are caller saved) + */ + SAVE_GPR(2, r1) + SAVE_GPR(13, r1) + SAVE_8GPRS(14, r1) + SAVE_10GPRS(22, r1) + mfcr r10 + mfmsr r11 + std r10,_CCR(r1) + std r11,_MSR(r1) + + /* Get the PROM entrypoint */ + mtlr r5 + + /* Switch MSR to 32 bits mode + */ + li r12,1 + rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) + andc r11,r11,r12 + li r12,1 + rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) + andc r11,r11,r12 + mtmsrd r11 + isync + + /* Enter RTAS here... */ + blrl + + /* Just make sure that r1 top 32 bits didn't get + * corrupt by OF + */ + rldicl r1,r1,0,32 + + /* Restore the MSR (back to 64 bits) */ + ld r0,_MSR(r1) + MTMSRD(r0) + isync + + /* Restore other registers */ + REST_GPR(2, r1) + REST_GPR(13, r1) + REST_8GPRS(14, r1) + REST_10GPRS(22, r1) + ld r4,_CCR(r1) + mtcr r4 + + addi r1,r1,PROM_FRAME_SIZE + ld r0,16(r1) + mtlr r0 + blr -- cgit v1.2.3-70-g09d2 From 14a43e69ed257a1fadadf9fea2c05adb1686419f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:57 +0000 Subject: powerpc/powernv: Basic support for OPAL Add definition of OPAL interfaces along with the wrappers to call into OPAL runtime and the early device-tree parsing hook to locate the OPAL runtime firmware. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/firmware.h | 10 + arch/powerpc/include/asm/opal.h | 382 ++++++++++++++++++++++++- arch/powerpc/kernel/prom.c | 7 + arch/powerpc/platforms/powernv/Kconfig | 9 +- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/opal-wrappers.S | 101 +++++++ arch/powerpc/platforms/powernv/opal.c | 154 ++++++++++ arch/powerpc/platforms/powernv/setup.c | 6 + arch/powerpc/platforms/powernv/smp.c | 25 +- 9 files changed, 691 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-wrappers.S create mode 100644 arch/powerpc/platforms/powernv/opal.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 3a6c586c4e4..14db29b18d0 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -48,6 +48,8 @@ #define FW_FEATURE_CMO ASM_CONST(0x0000000002000000) #define FW_FEATURE_VPHN ASM_CONST(0x0000000004000000) #define FW_FEATURE_XCMO ASM_CONST(0x0000000008000000) +#define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000) +#define FW_FEATURE_OPALv2 ASM_CONST(0x0000000020000000) #ifndef __ASSEMBLY__ @@ -65,6 +67,8 @@ enum { FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, + FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_OPALv2, + FW_FEATURE_POWERNV_ALWAYS = 0, FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_CELLEB_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_BEAT, @@ -78,6 +82,9 @@ enum { #ifdef CONFIG_PPC_ISERIES FW_FEATURE_ISERIES_POSSIBLE | #endif +#ifdef CONFIG_PPC_POWERNV + FW_FEATURE_POWERNV_POSSIBLE | +#endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_POSSIBLE | #endif @@ -95,6 +102,9 @@ enum { #ifdef CONFIG_PPC_ISERIES FW_FEATURE_ISERIES_ALWAYS & #endif +#ifdef CONFIG_PPC_POWERNV + FW_FEATURE_POWERNV_ALWAYS & +#endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_ALWAYS & #endif diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index ecdb283f8b7..c7a3202d10a 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -37,14 +37,394 @@ extern long opal_query_takeover(u64 *hal_size, u64 *hal_align); extern long opal_do_takeover(struct opal_takeover_args *args); +struct rtas_args; extern int opal_enter_rtas(struct rtas_args *args, unsigned long data, unsigned long entry); - #endif /* __ASSEMBLY__ */ /****** OPAL APIs ******/ +/* Return codes */ +#define OPAL_SUCCESS 0 +#define OPAL_PARAMETER -1 +#define OPAL_BUSY -2 +#define OPAL_PARTIAL -3 +#define OPAL_CONSTRAINED -4 +#define OPAL_CLOSED -5 +#define OPAL_HARDWARE -6 +#define OPAL_UNSUPPORTED -7 +#define OPAL_PERMISSION -8 +#define OPAL_NO_MEM -9 +#define OPAL_RESOURCE -10 +#define OPAL_INTERNAL_ERROR -11 +#define OPAL_BUSY_EVENT -12 +#define OPAL_HARDWARE_FROZEN -13 + +/* API Tokens (in r0) */ +#define OPAL_CONSOLE_WRITE 1 +#define OPAL_CONSOLE_READ 2 +#define OPAL_RTC_READ 3 +#define OPAL_RTC_WRITE 4 +#define OPAL_CEC_POWER_DOWN 5 +#define OPAL_CEC_REBOOT 6 +#define OPAL_READ_NVRAM 7 +#define OPAL_WRITE_NVRAM 8 +#define OPAL_HANDLE_INTERRUPT 9 +#define OPAL_POLL_EVENTS 10 +#define OPAL_PCI_SET_HUB_TCE_MEMORY 11 +#define OPAL_PCI_SET_PHB_TCE_MEMORY 12 +#define OPAL_PCI_CONFIG_READ_BYTE 13 +#define OPAL_PCI_CONFIG_READ_HALF_WORD 14 +#define OPAL_PCI_CONFIG_READ_WORD 15 +#define OPAL_PCI_CONFIG_WRITE_BYTE 16 +#define OPAL_PCI_CONFIG_WRITE_HALF_WORD 17 +#define OPAL_PCI_CONFIG_WRITE_WORD 18 +#define OPAL_SET_XIVE 19 +#define OPAL_GET_XIVE 20 +#define OPAL_GET_COMPLETION_TOKEN_STATUS 21 /* obsolete */ +#define OPAL_REGISTER_OPAL_EXCEPTION_HANDLER 22 +#define OPAL_PCI_EEH_FREEZE_STATUS 23 +#define OPAL_PCI_SHPC 24 +#define OPAL_CONSOLE_WRITE_BUFFER_SPACE 25 +#define OPAL_PCI_EEH_FREEZE_CLEAR 26 +#define OPAL_PCI_PHB_MMIO_ENABLE 27 +#define OPAL_PCI_SET_PHB_MEM_WINDOW 28 +#define OPAL_PCI_MAP_PE_MMIO_WINDOW 29 +#define OPAL_PCI_SET_PHB_TABLE_MEMORY 30 +#define OPAL_PCI_SET_PE 31 +#define OPAL_PCI_SET_PELTV 32 +#define OPAL_PCI_SET_MVE 33 +#define OPAL_PCI_SET_MVE_ENABLE 34 +#define OPAL_PCI_GET_XIVE_REISSUE 35 +#define OPAL_PCI_SET_XIVE_REISSUE 36 +#define OPAL_PCI_SET_XIVE_PE 37 +#define OPAL_GET_XIVE_SOURCE 38 +#define OPAL_GET_MSI_32 39 +#define OPAL_GET_MSI_64 40 +#define OPAL_START_CPU 41 +#define OPAL_QUERY_CPU_STATUS 42 +#define OPAL_WRITE_OPPANEL 43 +#define OPAL_PCI_MAP_PE_DMA_WINDOW 44 +#define OPAL_PCI_MAP_PE_DMA_WINDOW_REAL 45 +#define OPAL_PCI_RESET 49 + +#ifndef __ASSEMBLY__ + +/* Other enums */ +enum OpalVendorApiTokens { + OPAL_START_VENDOR_API_RANGE = 1000, OPAL_END_VENDOR_API_RANGE = 1999 +}; +enum OpalFreezeState { + OPAL_EEH_STOPPED_NOT_FROZEN = 0, + OPAL_EEH_STOPPED_MMIO_FREEZE = 1, + OPAL_EEH_STOPPED_DMA_FREEZE = 2, + OPAL_EEH_STOPPED_MMIO_DMA_FREEZE = 3, + OPAL_EEH_STOPPED_RESET = 4, + OPAL_EEH_STOPPED_TEMP_UNAVAIL = 5, + OPAL_EEH_STOPPED_PERM_UNAVAIL = 6 +}; +enum OpalEehFreezeActionToken { + OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO = 1, + OPAL_EEH_ACTION_CLEAR_FREEZE_DMA = 2, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL = 3 +}; +enum OpalPciStatusToken { + OPAL_EEH_PHB_NO_ERROR = 0, + OPAL_EEH_PHB_FATAL = 1, + OPAL_EEH_PHB_RECOVERABLE = 2, + OPAL_EEH_PHB_BUS_ERROR = 3, + OPAL_EEH_PCI_NO_DEVSEL = 4, + OPAL_EEH_PCI_TA = 5, + OPAL_EEH_PCIEX_UR = 6, + OPAL_EEH_PCIEX_CA = 7, + OPAL_EEH_PCI_MMIO_ERROR = 8, + OPAL_EEH_PCI_DMA_ERROR = 9 +}; +enum OpalShpcAction { + OPAL_SHPC_GET_LINK_STATE = 0, + OPAL_SHPC_GET_SLOT_STATE = 1 +}; +enum OpalShpcLinkState { + OPAL_SHPC_LINK_DOWN = 0, + OPAL_SHPC_LINK_UP = 1 +}; +enum OpalMmioWindowType { + OPAL_M32_WINDOW_TYPE = 1, + OPAL_M64_WINDOW_TYPE = 2, + OPAL_IO_WINDOW_TYPE = 3 +}; +enum OpalShpcSlotState { + OPAL_SHPC_DEV_NOT_PRESENT = 0, + OPAL_SHPC_DEV_PRESENT = 1 +}; +enum OpalExceptionHandler { + OPAL_MACHINE_CHECK_HANDLER = 1, + OPAL_HYPERVISOR_MAINTENANCE_HANDLER = 2, + OPAL_SOFTPATCH_HANDLER = 3 +}; +enum OpalPendingState { + OPAL_EVENT_OPAL_INTERNAL = 0x1, + OPAL_EVENT_NVRAM = 0x2, + OPAL_EVENT_RTC = 0x4, + OPAL_EVENT_CONSOLE_OUTPUT = 0x8, + OPAL_EVENT_CONSOLE_INPUT = 0x10 +}; + +/* Machine check related definitions */ +enum OpalMCE_Version { + OpalMCE_V1 = 1, +}; + +enum OpalMCE_Severity { + OpalMCE_SEV_NO_ERROR = 0, + OpalMCE_SEV_WARNING = 1, + OpalMCE_SEV_ERROR_SYNC = 2, + OpalMCE_SEV_FATAL = 3, +}; + +enum OpalMCE_Disposition { + OpalMCE_DISPOSITION_RECOVERED = 0, + OpalMCE_DISPOSITION_NOT_RECOVERED = 1, +}; + +enum OpalMCE_Initiator { + OpalMCE_INITIATOR_UNKNOWN = 0, + OpalMCE_INITIATOR_CPU = 1, +}; + +enum OpalMCE_ErrorType { + OpalMCE_ERROR_TYPE_UNKNOWN = 0, + OpalMCE_ERROR_TYPE_UE = 1, + OpalMCE_ERROR_TYPE_SLB = 2, + OpalMCE_ERROR_TYPE_ERAT = 3, + OpalMCE_ERROR_TYPE_TLB = 4, +}; + +enum OpalMCE_UeErrorType { + OpalMCE_UE_ERROR_INDETERMINATE = 0, + OpalMCE_UE_ERROR_IFETCH = 1, + OpalMCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH = 2, + OpalMCE_UE_ERROR_LOAD_STORE = 3, + OpalMCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 4, +}; + +enum OpalMCE_SlbErrorType { + OpalMCE_SLB_ERROR_INDETERMINATE = 0, + OpalMCE_SLB_ERROR_PARITY = 1, + OpalMCE_SLB_ERROR_MULTIHIT = 2, +}; + +enum OpalMCE_EratErrorType { + OpalMCE_ERAT_ERROR_INDETERMINATE = 0, + OpalMCE_ERAT_ERROR_PARITY = 1, + OpalMCE_ERAT_ERROR_MULTIHIT = 2, +}; + +enum OpalMCE_TlbErrorType { + OpalMCE_TLB_ERROR_INDETERMINATE = 0, + OpalMCE_TLB_ERROR_PARITY = 1, + OpalMCE_TLB_ERROR_MULTIHIT = 2, +}; + +enum OpalThreadStatus { + OPAL_THREAD_INACTIVE = 0x0, + OPAL_THREAD_STARTED = 0x1 +}; + +enum OpalPciBusCompare { + OpalPciBusAny = 0, /* Any bus number match */ + OpalPciBus3Bits = 2, /* Match top 3 bits of bus number */ + OpalPciBus4Bits = 3, /* Match top 4 bits of bus number */ + OpalPciBus5Bits = 4, /* Match top 5 bits of bus number */ + OpalPciBus6Bits = 5, /* Match top 6 bits of bus number */ + OpalPciBus7Bits = 6, /* Match top 7 bits of bus number */ + OpalPciBusAll = 7, /* Match bus number exactly */ +}; + +enum OpalDeviceCompare { + OPAL_IGNORE_RID_DEVICE_NUMBER = 0, + OPAL_COMPARE_RID_DEVICE_NUMBER = 1 +}; + +enum OpalFuncCompare { + OPAL_IGNORE_RID_FUNCTION_NUMBER = 0, + OPAL_COMPARE_RID_FUNCTION_NUMBER = 1 +}; + +enum OpalPeAction { + OPAL_UNMAP_PE = 0, + OPAL_MAP_PE = 1 +}; + +enum OpalPciResetAndReinitScope { + OPAL_PHB_COMPLETE = 1, OPAL_PCI_LINK = 2, OPAL_PHB_ERROR = 3, + OPAL_PCI_HOT_RESET = 4, OPAL_PCI_FUNDAMENTAL_RESET = 5, + OPAL_PCI_IODA_RESET = 6, +}; + +enum OpalPciResetState { OPAL_DEASSERT_RESET = 0, OPAL_ASSERT_RESET = 1 }; + +struct opal_machine_check_event { + enum OpalMCE_Version version:8; /* 0x00 */ + uint8_t in_use; /* 0x01 */ + enum OpalMCE_Severity severity:8; /* 0x02 */ + enum OpalMCE_Initiator initiator:8; /* 0x03 */ + enum OpalMCE_ErrorType error_type:8; /* 0x04 */ + enum OpalMCE_Disposition disposition:8; /* 0x05 */ + uint8_t reserved_1[2]; /* 0x06 */ + uint64_t gpr3; /* 0x08 */ + uint64_t srr0; /* 0x10 */ + uint64_t srr1; /* 0x18 */ + union { /* 0x20 */ + struct { + enum OpalMCE_UeErrorType ue_error_type:8; + uint8_t effective_address_provided; + uint8_t physical_address_provided; + uint8_t reserved_1[5]; + uint64_t effective_address; + uint64_t physical_address; + uint8_t reserved_2[8]; + } ue_error; + + struct { + enum OpalMCE_SlbErrorType slb_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } slb_error; + + struct { + enum OpalMCE_EratErrorType erat_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } erat_error; + + struct { + enum OpalMCE_TlbErrorType tlb_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } tlb_error; + } u; +}; + +typedef struct oppanel_line { + /* XXX */ +} oppanel_line_t; + +/* API functions */ +int64_t opal_console_write(int64_t term_number, int64_t *length, + const uint8_t *buffer); +int64_t opal_console_read(int64_t term_number, int64_t *length, + uint8_t *buffer); +int64_t opal_console_write_buffer_space(int64_t term_number, + int64_t *length); +int64_t opal_rtc_read(uint32_t *year_month_day, + uint64_t *hour_minute_second_millisecond); +int64_t opal_rtc_write(uint32_t year_month_day, + uint64_t hour_minute_second_millisecond); +int64_t opal_cec_power_down(uint64_t request); +int64_t opal_cec_reboot(void); +int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset); +int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset); +int64_t opal_handle_interrupt(uint64_t isn, uint64_t *outstanding_event_mask); +int64_t opal_poll_events(uint64_t *outstanding_event_mask); +int64_t opal_pci_set_hub_tce_memory(uint64_t hub_id, uint64_t tce_mem_addr, + uint64_t tce_mem_size); +int64_t opal_pci_set_phb_tce_memory(uint64_t phb_id, uint64_t tce_mem_addr, + uint64_t tce_mem_size); +int64_t opal_pci_config_read_byte(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint8_t *data); +int64_t opal_pci_config_read_half_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint16_t *data); +int64_t opal_pci_config_read_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint32_t *data); +int64_t opal_pci_config_write_byte(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint8_t data); +int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint16_t data); +int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint32_t data); +int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority); +int64_t opal_get_xive(uint32_t isn, uint16_t *server, uint8_t *priority); +int64_t opal_register_exception_handler(uint64_t opal_exception, + uint64_t handler_address, + uint64_t glue_cache_line); +int64_t opal_pci_eeh_freeze_status(uint64_t phb_id, uint64_t pe_number, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint64_t *phb_status); +int64_t opal_pci_eeh_freeze_clear(uint64_t phb_id, uint64_t pe_number, + uint64_t eeh_action_token); +int64_t opal_pci_shpc(uint64_t phb_id, uint64_t shpc_action, uint8_t *state); + + + +int64_t opal_pci_phb_mmio_enable(uint64_t phb_id, uint16_t window_type, + uint16_t window_num, uint16_t enable); +int64_t opal_pci_set_phb_mem_window(uint64_t phb_id, uint16_t window_type, + uint16_t window_num, + uint64_t starting_real_address, + uint64_t starting_pci_address, + uint16_t segment_size); +int64_t opal_pci_map_pe_mmio_window(uint64_t phb_id, uint16_t pe_number, + uint16_t window_type, uint16_t window_num, + uint16_t segment_num); +int64_t opal_pci_set_phb_table_memory(uint64_t phb_id, uint64_t rtt_addr, + uint64_t ivt_addr, uint64_t ivt_len, + uint64_t reject_array_addr, + uint64_t peltv_addr); +int64_t opal_pci_set_pe(uint64_t phb_id, uint64_t pe_number, uint64_t bus_dev_func, + uint8_t bus_compare, uint8_t dev_compare, uint8_t func_compare, + uint8_t pe_action); +int64_t opal_pci_set_peltv(uint64_t phb_id, uint32_t parent_pe, uint32_t child_pe, + uint8_t state); +int64_t opal_pci_set_mve(uint64_t phb_id, uint32_t mve_number, uint32_t pe_number); +int64_t opal_pci_set_mve_enable(uint64_t phb_id, uint32_t mve_number, + uint32_t state); +int64_t opal_pci_get_xive_reissue(uint64_t phb_id, uint32_t xive_number, + uint8_t *p_bit, uint8_t *q_bit); +int64_t opal_pci_set_xive_reissue(uint64_t phb_id, uint32_t xive_number, + uint8_t p_bit, uint8_t q_bit); +int64_t opal_pci_set_xive_pe(uint64_t phb_id, uint32_t pe_number, + uint32_t xive_num); +int64_t opal_get_xive_source(uint64_t phb_id, uint32_t xive_num, + int32_t *interrupt_source_number); +int64_t opal_get_msi_32(uint64_t phb_id, uint32_t mve_number, uint32_t xive_num, + uint8_t msi_range, uint32_t *msi_address, + uint32_t *message_data); +int64_t opal_get_msi_64(uint64_t phb_id, uint32_t mve_number, + uint32_t xive_num, uint8_t msi_range, + uint64_t *msi_address, uint32_t *message_data); +int64_t opal_start_cpu(uint64_t thread_number, uint64_t start_address); +int64_t opal_query_cpu_status(uint64_t thread_number, uint8_t *thread_status); +int64_t opal_write_oppanel(oppanel_line_t *lines, uint64_t num_lines); +int64_t opal_pci_map_pe_dma_window(uint64_t phb_id, uint16_t pe_number, uint16_t window_id, + uint16_t tce_levels, uint64_t tce_table_addr, + uint64_t tce_table_size, uint64_t tce_page_size); +int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number, + uint16_t dma_window_number, uint64_t pci_start_addr, + uint64_t pci_mem_size); +int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope, uint8_t assert_state); + +/* Internal functions */ +extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); + +extern int opal_get_chars(uint32_t vtermno, char *buf, int count); +extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); + +extern void hvc_opal_init_early(void); + +/* Internal functions */ +extern int early_init_dt_scan_opal(unsigned long node, const char *uname, + int depth, void *data); + +#endif /* __ASSEMBLY__ */ #endif /* __OPAL_H */ diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7b90c564e93..831a201e03d 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -54,6 +54,8 @@ #include #include #include +#include + #include #ifdef DEBUG @@ -707,6 +709,11 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_rtas, NULL); #endif +#ifdef CONFIG_PPC_POWERNV + /* Some machines might need OPAL info for debugging, grab it now. */ + of_scan_flat_dt(early_init_dt_scan_opal, NULL); +#endif + #ifdef CONFIG_PHYP_DUMP /* scan tree to see if dump occurred during last boot */ of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL); diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 24700e8df19..74fea5c2183 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -1,11 +1,16 @@ config PPC_POWERNV depends on PPC64 && PPC_BOOK3S bool "IBM PowerNV (Non-Virtualized) platform support" - select PPC_RTAS select PPC_NATIVE select PPC_XICS select PPC_ICP_NATIVE - select PPC_ICS_RTAS select PPC_P7_NAP select PPC_PCI_CHOICE if EMBEDDED default y + +config PPC_POWERNV_RTAS + depends on PPC_POWERNV + bool "Support for RTAS based PowerNV platforms such as BML" + default y + select PPC_ICS_RTAS + select PPC_RTAS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 497133027ae..8f69c0db612 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,2 +1,2 @@ -obj-y += setup.o opal-takeover.o +obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S new file mode 100644 index 00000000000..4a3f46d8533 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -0,0 +1,101 @@ +/* + * PowerNV OPAL API wrappers + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +/* TODO: + * + * - Trace irqs in/off (needs saving/restoring all args, argh...) + * - Get r11 feed up by Dave so I can have better register usage + */ +#define OPAL_CALL(name, token) \ + _GLOBAL(name); \ + mflr r0; \ + mfcr r12; \ + std r0,16(r1); \ + std r12,8(r1); \ + std r1,PACAR1(r13); \ + li r0,0; \ + mfmsr r12; \ + ori r0,r0,MSR_EE; \ + std r12,PACASAVEDMSR(r13); \ + andc r12,r12,r0; \ + mtmsrd r12,1; \ + LOAD_REG_ADDR(r0,.opal_return); \ + mtlr r0; \ + li r0,MSR_DR|MSR_IR; \ + andc r12,r12,r0; \ + li r0,token; \ + mtspr SPRN_HSRR1,r12; \ + LOAD_REG_ADDR(r11,opal); \ + ld r12,8(r11); \ + ld r2,0(r11); \ + mtspr SPRN_HSRR0,r12; \ + hrfid + +_STATIC(opal_return) + ld r2,PACATOC(r13); + ld r4,8(r1); + ld r5,16(r1); + ld r6,PACASAVEDMSR(r13); + mtspr SPRN_SRR0,r5; + mtspr SPRN_SRR1,r6; + mtcr r4; + rfid + +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c new file mode 100644 index 00000000000..8d5510784cc --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal.c @@ -0,0 +1,154 @@ +/* + * PowerNV OPAL high level interfaces + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include + +#include "powernv.h" + +struct opal { + u64 base; + u64 entry; +} opal; + +static struct device_node *opal_node; +static DEFINE_SPINLOCK(opal_write_lock); + +int __init early_init_dt_scan_opal(unsigned long node, + const char *uname, int depth, void *data) +{ + const void *basep, *entryp; + unsigned long basesz, entrysz; + + if (depth != 1 || strcmp(uname, "ibm,opal") != 0) + return 0; + + basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz); + entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz); + + if (!basep || !entryp) + return 1; + + opal.base = of_read_number(basep, basesz/4); + opal.entry = of_read_number(entryp, entrysz/4); + + pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%ld)\n", + opal.base, basep, basesz); + pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%ld)\n", + opal.entry, entryp, entrysz); + + powerpc_firmware_features |= FW_FEATURE_OPAL; + if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { + powerpc_firmware_features |= FW_FEATURE_OPALv2; + printk("OPAL V2 detected !\n"); + } else { + printk("OPAL V1 detected !\n"); + } + + return 1; +} + +int opal_get_chars(uint32_t vtermno, char *buf, int count) +{ + s64 len, rc; + u64 evt; + + if (!opal.entry) + return 0; + opal_poll_events(&evt); + if ((evt & OPAL_EVENT_CONSOLE_INPUT) == 0) + return 0; + len = count; + rc = opal_console_read(vtermno, &len, buf); + if (rc == OPAL_SUCCESS) + return len; + return 0; +} + +int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +{ + int written = 0; + s64 len, rc = OPAL_BUSY; + unsigned long flags; + u64 evt; + + if (!opal.entry) + return 0; + + /* We want put_chars to be atomic to avoid mangling of hvsi + * packets. To do that, we first test for room and return + * -EAGAIN if there isn't enough + */ + spin_lock_irqsave(&opal_write_lock, flags); + rc = opal_console_write_buffer_space(vtermno, &len); + if (rc || len < total_len) { + spin_unlock_irqrestore(&opal_write_lock, flags); + /* Closed -> drop characters */ + if (rc) + return total_len; + opal_poll_events(&evt); + return -EAGAIN; + } + + /* We still try to handle partial completions, though they + * should no longer happen. + */ + while(total_len > 0 && (rc == OPAL_BUSY || + rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { + len = total_len; + rc = opal_console_write(vtermno, &len, data); + if (rc == OPAL_SUCCESS) { + total_len -= len; + data += len; + written += len; + } + /* This is a bit nasty but we need that for the console to + * flush when there aren't any interrupts. We will clean + * things a bit later to limit that to synchronous path + * such as the kernel console and xmon/udbg + */ + do + opal_poll_events(&evt); + while(rc == OPAL_SUCCESS && (evt & OPAL_EVENT_CONSOLE_OUTPUT)); + } + spin_unlock_irqrestore(&opal_write_lock, flags); + return written; +} + +static int __init opal_init(void) +{ + struct device_node *np, *consoles; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_warn("opal: Node not found\n"); + return -ENODEV; + } + if (firmware_has_feature(FW_FEATURE_OPALv2)) + consoles = of_find_node_by_path("/ibm,opal/consoles"); + else + consoles = of_node_get(opal_node); + + /* Register serial ports */ + for_each_child_of_node(consoles, np) { + if (strcmp(np->name, "serial")) + continue; + of_platform_device_create(np, NULL, NULL); + } + of_node_put(consoles); + return 0; +} +subsys_initcall(opal_init); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 569f9cc4eb0..b6e5ff85cc6 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -74,6 +74,12 @@ static void pnv_show_cpuinfo(struct seq_file *m) if (root) model = of_get_property(root, "model", NULL); seq_printf(m, "machine\t\t: PowerNV %s\n", model); + if (firmware_has_feature(FW_FEATURE_OPALv2)) + seq_printf(m, "firmware\t: OPAL v2\n"); + else if (firmware_has_feature(FW_FEATURE_OPAL)) + seq_printf(m, "firmware\t: OPAL v1\n"); + else + seq_printf(m, "firmware\t: BML\n"); of_node_put(root); } diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 4f4ec3797eb..e8773668524 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "powernv.h" @@ -62,6 +63,28 @@ static int pnv_smp_cpu_bootable(unsigned int nr) return 1; } +int __devinit pnv_smp_kick_cpu(int nr) +{ + unsigned int pcpu = get_hard_smp_processor_id(nr); + unsigned long start_here = __pa(*((unsigned long *) + generic_secondary_smp_init)); + long rc; + + BUG_ON(nr < 0 || nr >= NR_CPUS); + + /* On OPAL v2 the CPU are still spinning inside OPAL itself, + * get them back now + */ + if (firmware_has_feature(FW_FEATURE_OPALv2)) { + pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu); + rc = opal_start_cpu(pcpu, start_here); + if (rc != OPAL_SUCCESS) + pr_warn("OPAL Error %ld starting CPU %d\n", + rc, nr); + } + return smp_generic_kick_cpu(nr); +} + #ifdef CONFIG_HOTPLUG_CPU static int pnv_smp_cpu_disable(void) @@ -127,7 +150,7 @@ static struct smp_ops_t pnv_smp_ops = { .message_pass = smp_muxed_ipi_message_pass, .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ .probe = xics_smp_probe, - .kick_cpu = smp_generic_kick_cpu, + .kick_cpu = pnv_smp_kick_cpu, .setup_cpu = pnv_smp_setup_cpu, .cpu_bootable = pnv_smp_cpu_bootable, #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3-70-g09d2 From daea1175a9f0f70eab5b33e2827d57ba8c686816 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:59 +0000 Subject: powerpc/powernv: Support for OPAL console This adds a udbg and an hvc console backend for supporting a console using the OPAL console interfaces. On OPAL v1 we have hvc0 mapped to whatever console the system was configured for (network or hvsi serial port) via the service processor. On OPAL v2 we have hvcN mapped to the Nth console provided by OPAL which generally corresponds to: hvc0 : network console (raw protocol) hvc1 : serial port S1 (hvsi) hvc2 : serial port S2 (hvsi) Note: At this point, early debug console only works with OPAL v1 and shouldn't be enabled in a normal kernel. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig.debug | 31 +++ arch/powerpc/include/asm/opal.h | 5 + arch/powerpc/include/asm/udbg.h | 2 + arch/powerpc/kernel/head_64.S | 14 +- arch/powerpc/kernel/udbg.c | 4 + arch/powerpc/platforms/powernv/opal.c | 31 ++- arch/powerpc/platforms/powernv/setup.c | 14 +- drivers/tty/hvc/Kconfig | 9 + drivers/tty/hvc/Makefile | 1 + drivers/tty/hvc/hvc_opal.c | 424 +++++++++++++++++++++++++++++++++ drivers/tty/hvc/hvsi_lib.c | 4 +- 11 files changed, 517 insertions(+), 22 deletions(-) create mode 100644 drivers/tty/hvc/hvc_opal.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 06eb62b0fd9..1b8a9c905cf 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -265,8 +265,27 @@ config PPC_EARLY_DEBUG_PS3GELIC Select this to enable early debugging for the PlayStation3 via UDP broadcasts sent out through the Ethernet port. +config PPC_EARLY_DEBUG_OPAL_RAW + bool "OPAL raw console" + depends on HVC_OPAL + help + Select this to enable early debugging for the PowerNV platform + using a "raw" console + +config PPC_EARLY_DEBUG_OPAL_HVSI + bool "OPAL hvsi console" + depends on HVC_OPAL + help + Select this to enable early debugging for the PowerNV platform + using an "hvsi" console + endchoice +config PPC_EARLY_DEBUG_OPAL + def_bool y + depends on PPC_EARLY_DEBUG_OPAL_RAW || PPC_EARLY_DEBUG_OPAL_HVSI + + config PPC_EARLY_DEBUG_HVSI_VTERMNO hex "vterm number to use with early debug HVSI" depends on PPC_EARLY_DEBUG_LPAR_HVSI @@ -275,6 +294,18 @@ config PPC_EARLY_DEBUG_HVSI_VTERMNO You probably want 0x30000000 for your first serial port and 0x30000001 for your second one +config PPC_EARLY_DEBUG_OPAL_VTERMNO + hex "vterm number to use with OPAL early debug" + depends on PPC_EARLY_DEBUG_OPAL + default "0" + help + This correspond to which /dev/hvcN you want to use for early + debug. + + On OPAL v1 (takeover) this should always be 0 + On OPAL v2, this will be 0 for network console and 1 or 2 for + the machine built-in serial ports. + config PPC_EARLY_DEBUG_44x_PHYSLOW hex "Low 32 bits of early debug UART physical address" depends on PPC_EARLY_DEBUG_44x diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index c7a3202d10a..749de00a02d 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -425,6 +425,11 @@ extern void hvc_opal_init_early(void); extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); +extern int opal_get_chars(uint32_t vtermno, char *buf, int count); +extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); + +extern void hvc_opal_init_early(void); + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_H */ diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 7cf796fa03f..6587ec7bc6e 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -55,6 +55,8 @@ extern void __init udbg_init_cpm(void); extern void __init udbg_init_usbgecko(void); extern void __init udbg_init_wsp(void); extern void __init udbg_init_ps3gelic(void); +extern void __init udbg_init_debug_opal_raw(void); +extern void __init udbg_init_debug_opal_hvsi(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_UDBG_H */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index dea8191253d..06c7251c1bf 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -53,7 +53,8 @@ * 2. The kernel is entered at __start * -or- For OPAL entry: * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 - * with device-tree in gpr3 + * with device-tree in gpr3. We also get OPAL base in r8 and + * entry in r9 for debugging purposes * 2. Secondary processors enter at 0x60 with PIR in gpr3 * * For iSeries: @@ -335,6 +336,11 @@ _GLOBAL(__start_initialization_multiplatform) /* Save parameters */ mr r31,r3 mr r30,r4 +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* Save OPAL entry */ + mr r28,r8 + mr r29,r9 +#endif #ifdef CONFIG_PPC_BOOK3E bl .start_initialization_book3e @@ -711,6 +717,12 @@ _INIT_STATIC(start_here_multiplatform) bdnz 3b 4: +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* Setup OPAL entry */ + std r28,0(r11); + std r29,8(r11); +#endif + #ifndef CONFIG_PPC_BOOK3E mfmsr r6 ori r6,r6,MSR_RI diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 5b3e98e0315..35f948203ec 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -69,6 +69,10 @@ void __init udbg_early_init(void) udbg_init_wsp(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC) udbg_init_ps3gelic(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_RAW) + udbg_init_debug_opal_raw(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI) + udbg_init_debug_opal_hvsi(); #endif #ifdef CONFIG_PPC_EARLY_DEBUG diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 8d5510784cc..7887733b9b3 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -67,7 +67,7 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count) u64 evt; if (!opal.entry) - return 0; + return -ENODEV; opal_poll_events(&evt); if ((evt & OPAL_EVENT_CONSOLE_INPUT) == 0) return 0; @@ -81,31 +81,38 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count) int opal_put_chars(uint32_t vtermno, const char *data, int total_len) { int written = 0; - s64 len, rc = OPAL_BUSY; + s64 len, rc; unsigned long flags; u64 evt; if (!opal.entry) - return 0; + return -ENODEV; /* We want put_chars to be atomic to avoid mangling of hvsi * packets. To do that, we first test for room and return - * -EAGAIN if there isn't enough + * -EAGAIN if there isn't enough. + * + * Unfortunately, opal_console_write_buffer_space() doesn't + * appear to work on opal v1, so we just assume there is + * enough room and be done with it */ spin_lock_irqsave(&opal_write_lock, flags); - rc = opal_console_write_buffer_space(vtermno, &len); - if (rc || len < total_len) { - spin_unlock_irqrestore(&opal_write_lock, flags); - /* Closed -> drop characters */ - if (rc) - return total_len; - opal_poll_events(&evt); - return -EAGAIN; + if (firmware_has_feature(FW_FEATURE_OPALv2)) { + rc = opal_console_write_buffer_space(vtermno, &len); + if (rc || len < total_len) { + spin_unlock_irqrestore(&opal_write_lock, flags); + /* Closed -> drop characters */ + if (rc) + return total_len; + opal_poll_events(&evt); + return -EAGAIN; + } } /* We still try to handle partial completions, though they * should no longer happen. */ + rc = OPAL_BUSY; while(total_len > 0 && (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { len = total_len; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index b6e5ff85cc6..07ba1ecd180 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -29,17 +29,12 @@ #include #include #include +#include #include "powernv.h" static void __init pnv_setup_arch(void) { - /* Force console to hvc for now until we have sorted out the - * real console situation for the platform. This will make - * hvc_udbg work at least. - */ - add_preferred_console("hvc", 0, NULL); - /* Initialize SMP */ pnv_smp_init(); @@ -55,7 +50,12 @@ static void __init pnv_setup_arch(void) static void __init pnv_init_early(void) { - /* XXX IOMMU */ +#ifdef CONFIG_HVC_OPAL + if (firmware_has_feature(FW_FEATURE_OPAL)) + hvc_opal_init_early(); + else +#endif + add_preferred_console("hvc", 0, NULL); } static void __init pnv_init_IRQ(void) diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig index e371753ba92..4222035acfb 100644 --- a/drivers/tty/hvc/Kconfig +++ b/drivers/tty/hvc/Kconfig @@ -34,6 +34,15 @@ config HVC_ISERIES help iSeries machines support a hypervisor virtual console. +config HVC_OPAL + bool "OPAL Console support" + depends on PPC_POWERNV + select HVC_DRIVER + select HVC_IRQ + default y + help + PowerNV machines running under OPAL need that driver to get a console + config HVC_RTAS bool "IBM RTAS Console support" depends on PPC_RTAS diff --git a/drivers/tty/hvc/Makefile b/drivers/tty/hvc/Makefile index e2920531637..89abf40bc73 100644 --- a/drivers/tty/hvc/Makefile +++ b/drivers/tty/hvc/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi_lib.o +obj-$(CONFIG_HVC_OPAL) += hvc_opal.o hvsi_lib.o obj-$(CONFIG_HVC_OLD_HVSI) += hvsi.o obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c new file mode 100644 index 00000000000..7b38512d6c4 --- /dev/null +++ b/drivers/tty/hvc/hvc_opal.c @@ -0,0 +1,424 @@ +/* + * opal driver interface to hvc_console.c + * + * Copyright 2011 Benjamin Herrenschmidt , IBM Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hvc_console.h" + +static const char hvc_opal_name[] = "hvc_opal"; + +static struct of_device_id hvc_opal_match[] __devinitdata = { + { .name = "serial", .compatible = "ibm,opal-console-raw" }, + { .name = "serial", .compatible = "ibm,opal-console-hvsi" }, + { }, +}; + +typedef enum hv_protocol { + HV_PROTOCOL_RAW, + HV_PROTOCOL_HVSI +} hv_protocol_t; + +struct hvc_opal_priv { + hv_protocol_t proto; /* Raw data or HVSI packets */ + struct hvsi_priv hvsi; /* HVSI specific data */ +}; +static struct hvc_opal_priv *hvc_opal_privs[MAX_NR_HVC_CONSOLES]; + +/* For early boot console */ +static struct hvc_opal_priv hvc_opal_boot_priv; +static u32 hvc_opal_boot_termno; + +static const struct hv_ops hvc_opal_raw_ops = { + .get_chars = opal_get_chars, + .put_chars = opal_put_chars, + .notifier_add = notifier_add_irq, + .notifier_del = notifier_del_irq, + .notifier_hangup = notifier_hangup_irq, +}; + +static int hvc_opal_hvsi_get_chars(uint32_t vtermno, char *buf, int count) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[vtermno]; + + if (WARN_ON(!pv)) + return -ENODEV; + + return hvsilib_get_chars(&pv->hvsi, buf, count); +} + +static int hvc_opal_hvsi_put_chars(uint32_t vtermno, const char *buf, int count) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[vtermno]; + + if (WARN_ON(!pv)) + return -ENODEV; + + return hvsilib_put_chars(&pv->hvsi, buf, count); +} + +static int hvc_opal_hvsi_open(struct hvc_struct *hp, int data) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + int rc; + + pr_devel("HVSI@%x: do open !\n", hp->vtermno); + + rc = notifier_add_irq(hp, data); + if (rc) + return rc; + + return hvsilib_open(&pv->hvsi, hp); +} + +static void hvc_opal_hvsi_close(struct hvc_struct *hp, int data) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + pr_devel("HVSI@%x: do close !\n", hp->vtermno); + + hvsilib_close(&pv->hvsi, hp); + + notifier_del_irq(hp, data); +} + +void hvc_opal_hvsi_hangup(struct hvc_struct *hp, int data) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + pr_devel("HVSI@%x: do hangup !\n", hp->vtermno); + + hvsilib_close(&pv->hvsi, hp); + + notifier_hangup_irq(hp, data); +} + +static int hvc_opal_hvsi_tiocmget(struct hvc_struct *hp) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + if (!pv) + return -EINVAL; + return pv->hvsi.mctrl; +} + +static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set, + unsigned int clear) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + pr_devel("HVSI@%x: Set modem control, set=%x,clr=%x\n", + hp->vtermno, set, clear); + + if (set & TIOCM_DTR) + hvsilib_write_mctrl(&pv->hvsi, 1); + else if (clear & TIOCM_DTR) + hvsilib_write_mctrl(&pv->hvsi, 0); + + return 0; +} + +static const struct hv_ops hvc_opal_hvsi_ops = { + .get_chars = hvc_opal_hvsi_get_chars, + .put_chars = hvc_opal_hvsi_put_chars, + .notifier_add = hvc_opal_hvsi_open, + .notifier_del = hvc_opal_hvsi_close, + .notifier_hangup = hvc_opal_hvsi_hangup, + .tiocmget = hvc_opal_hvsi_tiocmget, + .tiocmset = hvc_opal_hvsi_tiocmset, +}; + +static int __devinit hvc_opal_probe(struct platform_device *dev) +{ + const struct hv_ops *ops; + struct hvc_struct *hp; + struct hvc_opal_priv *pv; + hv_protocol_t proto; + unsigned int termno, boot = 0; + const __be32 *reg; + + if (of_device_is_compatible(dev->dev.of_node, "ibm,opal-console-raw")) { + proto = HV_PROTOCOL_RAW; + ops = &hvc_opal_raw_ops; + } else if (of_device_is_compatible(dev->dev.of_node, + "ibm,opal-console-hvsi")) { + proto = HV_PROTOCOL_HVSI; + ops = &hvc_opal_hvsi_ops; + } else { + pr_err("hvc_opal: Unkown protocol for %s\n", + dev->dev.of_node->full_name); + return -ENXIO; + } + + reg = of_get_property(dev->dev.of_node, "reg", NULL); + termno = reg ? be32_to_cpup(reg) : 0; + + /* Is it our boot one ? */ + if (hvc_opal_privs[termno] == &hvc_opal_boot_priv) { + pv = hvc_opal_privs[termno]; + boot = 1; + } else if (hvc_opal_privs[termno] == NULL) { + pv = kzalloc(sizeof(struct hvc_opal_priv), GFP_KERNEL); + if (!pv) + return -ENOMEM; + pv->proto = proto; + hvc_opal_privs[termno] = pv; + if (proto == HV_PROTOCOL_HVSI) + hvsilib_init(&pv->hvsi, opal_get_chars, opal_put_chars, + termno, 0); + + /* Instanciate now to establish a mapping index==vtermno */ + hvc_instantiate(termno, termno, ops); + } else { + pr_err("hvc_opal: Device %s has duplicate terminal number #%d\n", + dev->dev.of_node->full_name, termno); + return -ENXIO; + } + + pr_info("hvc%d: %s protocol on %s%s\n", termno, + proto == HV_PROTOCOL_RAW ? "raw" : "hvsi", + dev->dev.of_node->full_name, + boot ? " (boot console)" : ""); + + /* We don't do IRQ yet */ + hp = hvc_alloc(termno, 0, ops, MAX_VIO_PUT_CHARS); + if (IS_ERR(hp)) + return PTR_ERR(hp); + dev_set_drvdata(&dev->dev, hp); + + return 0; +} + +static int __devexit hvc_opal_remove(struct platform_device *dev) +{ + struct hvc_struct *hp = dev_get_drvdata(&dev->dev); + int rc, termno; + + termno = hp->vtermno; + rc = hvc_remove(hp); + if (rc == 0) { + if (hvc_opal_privs[termno] != &hvc_opal_boot_priv) + kfree(hvc_opal_privs[termno]); + hvc_opal_privs[termno] = NULL; + } + return rc; +} + +static struct platform_driver hvc_opal_driver = { + .probe = hvc_opal_probe, + .remove = __devexit_p(hvc_opal_remove), + .driver = { + .name = hvc_opal_name, + .owner = THIS_MODULE, + .of_match_table = hvc_opal_match, + } +}; + +static int __init hvc_opal_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + /* Register as a vio device to receive callbacks */ + return platform_driver_register(&hvc_opal_driver); +} +module_init(hvc_opal_init); + +static void __exit hvc_opal_exit(void) +{ + platform_driver_unregister(&hvc_opal_driver); +} +module_exit(hvc_opal_exit); + +static void udbg_opal_putc(char c) +{ + unsigned int termno = hvc_opal_boot_termno; + int count = -1; + + if (c == '\n') + udbg_opal_putc('\r'); + + do { + switch(hvc_opal_boot_priv.proto) { + case HV_PROTOCOL_RAW: + count = opal_put_chars(termno, &c, 1); + break; + case HV_PROTOCOL_HVSI: + count = hvc_opal_hvsi_put_chars(termno, &c, 1); + break; + } + } while(count == 0 || count == -EAGAIN); +} + +static int udbg_opal_getc_poll(void) +{ + unsigned int termno = hvc_opal_boot_termno; + int rc = 0; + char c; + + switch(hvc_opal_boot_priv.proto) { + case HV_PROTOCOL_RAW: + rc = opal_get_chars(termno, &c, 1); + break; + case HV_PROTOCOL_HVSI: + rc = hvc_opal_hvsi_get_chars(termno, &c, 1); + break; + } + if (!rc) + return -1; + return c; +} + +static int udbg_opal_getc(void) +{ + int ch; + for (;;) { + ch = udbg_opal_getc_poll(); + if (ch == -1) { + /* This shouldn't be needed...but... */ + volatile unsigned long delay; + for (delay=0; delay < 2000000; delay++) + ; + } else { + return ch; + } + } +} + +static void udbg_init_opal_common(void) +{ + udbg_putc = udbg_opal_putc; + udbg_getc = udbg_opal_getc; + udbg_getc_poll = udbg_opal_getc_poll; + tb_ticks_per_usec = 0x200; /* Make udelay not suck */ +} + +void __init hvc_opal_init_early(void) +{ + struct device_node *stdout_node = NULL; + const u32 *termno; + const char *name = NULL; + const struct hv_ops *ops; + u32 index; + + /* find the boot console from /chosen/stdout */ + if (of_chosen) + name = of_get_property(of_chosen, "linux,stdout-path", NULL); + if (name) { + stdout_node = of_find_node_by_path(name); + if (!stdout_node) { + pr_err("hvc_opal: Failed to locate default console!\n"); + return; + } + } else { + struct device_node *opal, *np; + + /* Current OPAL takeover doesn't provide the stdout + * path, so we hard wire it + */ + opal = of_find_node_by_path("/ibm,opal/consoles"); + if (opal) + pr_devel("hvc_opal: Found consoles in new location\n"); + if (!opal) { + opal = of_find_node_by_path("/ibm,opal"); + if (opal) + pr_devel("hvc_opal: " + "Found consoles in old location\n"); + } + if (!opal) + return; + for_each_child_of_node(opal, np) { + if (!strcmp(np->name, "serial")) { + stdout_node = np; + break; + } + } + of_node_put(opal); + } + if (!stdout_node) + return; + termno = of_get_property(stdout_node, "reg", NULL); + index = termno ? *termno : 0; + if (index >= MAX_NR_HVC_CONSOLES) + return; + hvc_opal_privs[index] = &hvc_opal_boot_priv; + + /* Check the protocol */ + if (of_device_is_compatible(stdout_node, "ibm,opal-console-raw")) { + hvc_opal_boot_priv.proto = HV_PROTOCOL_RAW; + ops = &hvc_opal_raw_ops; + pr_devel("hvc_opal: Found RAW console\n"); + } + else if (of_device_is_compatible(stdout_node,"ibm,opal-console-hvsi")) { + hvc_opal_boot_priv.proto = HV_PROTOCOL_HVSI; + ops = &hvc_opal_hvsi_ops; + hvsilib_init(&hvc_opal_boot_priv.hvsi, opal_get_chars, + opal_put_chars, index, 1); + /* HVSI, perform the handshake now */ + hvsilib_establish(&hvc_opal_boot_priv.hvsi); + pr_devel("hvc_opal: Found HVSI console\n"); + } else + goto out; + hvc_opal_boot_termno = index; + udbg_init_opal_common(); + add_preferred_console("hvc", index, NULL); + hvc_instantiate(index, index, ops); +out: + of_node_put(stdout_node); +} + +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL_RAW +void __init udbg_init_debug_opal(void) +{ + u32 index = CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO; + hvc_opal_privs[index] = &hvc_opal_boot_priv; + hvc_opal_boot_priv.proto = HV_PROTOCOL_RAW; + hvc_opal_boot_termno = index; + udbg_init_opal_common(); +} +#endif /* CONFIG_PPC_EARLY_DEBUG_OPAL_RAW */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI +void __init udbg_init_debug_opal_hvsi(void) +{ + u32 index = CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO; + hvc_opal_privs[index] = &hvc_opal_boot_priv; + hvc_opal_boot_termno = index; + udbg_init_opal_common(); + hvsilib_init(&hvc_opal_boot_priv.hvsi, opal_get_chars, opal_put_chars, + index, 1); + hvsilib_establish(&hvc_opal_boot_priv.hvsi); +} +#endif /* CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI */ diff --git a/drivers/tty/hvc/hvsi_lib.c b/drivers/tty/hvc/hvsi_lib.c index bd9b09827b2..6f4dd83d869 100644 --- a/drivers/tty/hvc/hvsi_lib.c +++ b/drivers/tty/hvc/hvsi_lib.c @@ -183,7 +183,7 @@ int hvsilib_get_chars(struct hvsi_priv *pv, char *buf, int count) unsigned int tries, read = 0; if (WARN_ON(!pv)) - return 0; + return -ENXIO; /* If we aren't open, don't do anything in order to avoid races * with connection establishment. The hvc core will call this @@ -234,7 +234,7 @@ int hvsilib_put_chars(struct hvsi_priv *pv, const char *buf, int count) int rc, adjcount = min(count, HVSI_MAX_OUTGOING_DATA); if (WARN_ON(!pv)) - return 0; + return -ENODEV; dp.hdr.type = VS_DATA_PACKET_HEADER; dp.hdr.len = adjcount + sizeof(struct hvsi_header); -- cgit v1.2.3-70-g09d2 From 628daa8d5abfd904a7329a660c5c374212230123 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:45:01 +0000 Subject: powerpc/powernv: Add RTC and NVRAM support plus RTAS fallbacks Implements OPAL RTC and NVRAM support and wire all that up to the powernv platform. We use RTAS for RTC as a fallback if available. Using RTAS for nvram is not supported yet, pending some rework/cleanup and generalization of the pSeries & CHRP code. We also use RTAS fallbacks for power off and reboot Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/opal.h | 6 ++ arch/powerpc/platforms/powernv/Makefile | 2 + arch/powerpc/platforms/powernv/opal-nvram.c | 88 ++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-rtc.c | 97 +++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/setup.c | 57 ++++++++++------- 5 files changed, 229 insertions(+), 21 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-nvram.c create mode 100644 arch/powerpc/platforms/powernv/opal-rtc.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 749de00a02d..77ebe50020a 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -430,6 +430,12 @@ extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); extern void hvc_opal_init_early(void); +struct rtc_time; +extern int opal_set_rtc_time(struct rtc_time *tm); +extern void opal_get_rtc_time(struct rtc_time *tm); +extern unsigned long opal_get_boot_time(void); +extern void opal_nvram_init(void); + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_H */ diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 8f69c0db612..618ad836f28 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,2 +1,4 @@ obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o +obj-y += opal-rtc.o opal-nvram.o + obj-$(CONFIG_SMP) += smp.o diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c new file mode 100644 index 00000000000..3f83e1ae26a --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -0,0 +1,88 @@ +/* + * PowerNV nvram code. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include +#include +#include + +#include +#include + +static unsigned int nvram_size; + +static ssize_t opal_nvram_size(void) +{ + return nvram_size; +} + +static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index) +{ + s64 rc; + int off; + + if (*index >= nvram_size) + return 0; + off = *index; + if ((off + count) > nvram_size) + count = nvram_size - off; + rc = opal_read_nvram(__pa(buf), count, off); + if (rc != OPAL_SUCCESS) + return -EIO; + *index += count; + return count; +} + +static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) +{ + s64 rc = OPAL_BUSY; + int off; + + if (*index >= nvram_size) + return 0; + off = *index; + if ((off + count) > nvram_size) + count = nvram_size - off; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_write_nvram(__pa(buf), count, off); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + } + *index += count; + return count; +} + +void __init opal_nvram_init(void) +{ + struct device_node *np; + const u32 *nbytes_p; + + np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram"); + if (np == NULL) + return; + + nbytes_p = of_get_property(np, "#bytes", NULL); + if (!nbytes_p) { + of_node_put(np); + return; + } + nvram_size = *nbytes_p; + + printk(KERN_INFO "OPAL nvram setup, %u bytes\n", nvram_size); + of_node_put(np); + + ppc_md.nvram_read = opal_nvram_read; + ppc_md.nvram_write = opal_nvram_write; + ppc_md.nvram_size = opal_nvram_size; +} + diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c new file mode 100644 index 00000000000..2aa7641aac9 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -0,0 +1,97 @@ +/* + * PowerNV Real Time Clock. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include +#include +#include +#include +#include + +#include +#include + +static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) +{ + tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) + + bcd2bin((y_m_d >> 16) & 0xff)) - 1900; + tm->tm_mon = bcd2bin((y_m_d >> 8) & 0xff) - 1; + tm->tm_mday = bcd2bin(y_m_d & 0xff); + tm->tm_hour = bcd2bin((h_m_s_ms >> 56) & 0xff); + tm->tm_min = bcd2bin((h_m_s_ms >> 48) & 0xff); + tm->tm_sec = bcd2bin((h_m_s_ms >> 40) & 0xff); + + GregorianDay(tm); +} + +unsigned long __init opal_get_boot_time(void) +{ + struct rtc_time tm; + u32 y_m_d; + u64 h_m_s_ms; + long rc = OPAL_BUSY; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_rtc_read(&y_m_d, &h_m_s_ms); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + if (rc != OPAL_SUCCESS) + return 0; + opal_to_tm(y_m_d, h_m_s_ms, &tm); + return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); +} + +void opal_get_rtc_time(struct rtc_time *tm) +{ + long rc = OPAL_BUSY; + u32 y_m_d; + u64 h_m_s_ms; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_rtc_read(&y_m_d, &h_m_s_ms); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + if (rc != OPAL_SUCCESS) + return; + opal_to_tm(y_m_d, h_m_s_ms, tm); +} + +int opal_set_rtc_time(struct rtc_time *tm) +{ + long rc = OPAL_BUSY; + u32 y_m_d = 0; + u64 h_m_s_ms = 0; + + y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) / 100)) << 24; + y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) % 100)) << 16; + y_m_d |= ((u32)bin2bcd((tm->tm_mon + 1))) << 8; + y_m_d |= ((u32)bin2bcd(tm->tm_mday)); + + h_m_s_ms |= ((u64)bin2bcd(tm->tm_hour)) << 56; + h_m_s_ms |= ((u64)bin2bcd(tm->tm_min)) << 48; + h_m_s_ms |= ((u64)bin2bcd(tm->tm_sec)) << 40; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_rtc_write(y_m_d, h_m_s_ms); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + return rc == OPAL_SUCCESS ? 0 : -EIO; +} diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 0fac0a6c951..4a2b2e27959 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include "powernv.h" @@ -40,7 +42,9 @@ static void __init pnv_setup_arch(void) /* XXX PCI */ - /* XXX NVRAM */ + /* Setup RTC and NVRAM callbacks */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + opal_nvram_init(); /* Enable NAP mode */ powersave_nap = 1; @@ -118,30 +122,40 @@ static void __noreturn pnv_halt(void) pnv_power_off(); } -static unsigned long __init pnv_get_boot_time(void) -{ - return 0; -} - -static void pnv_get_rtc_time(struct rtc_time *rtc_tm) +static void pnv_progress(char *s, unsigned short hex) { } -static int pnv_set_rtc_time(struct rtc_time *tm) +#ifdef CONFIG_KEXEC +static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { - return 0; + xics_kexec_teardown_cpu(secondary); } +#endif /* CONFIG_KEXEC */ -static void pnv_progress(char *s, unsigned short hex) +static void __init pnv_setup_machdep_opal(void) { + ppc_md.get_boot_time = opal_get_boot_time; + ppc_md.get_rtc_time = opal_get_rtc_time; + ppc_md.set_rtc_time = opal_set_rtc_time; + ppc_md.restart = pnv_restart; + ppc_md.power_off = pnv_power_off; + ppc_md.halt = pnv_halt; } -#ifdef CONFIG_KEXEC -static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) +#ifdef CONFIG_PPC_POWERNV_RTAS +static void __init pnv_setup_machdep_rtas(void) { - xics_kexec_teardown_cpu(secondary); + if (rtas_token("get-time-of-day") != RTAS_UNKNOWN_SERVICE) { + ppc_md.get_boot_time = rtas_get_boot_time; + ppc_md.get_rtc_time = rtas_get_rtc_time; + ppc_md.set_rtc_time = rtas_set_rtc_time; + } + ppc_md.restart = rtas_restart; + ppc_md.power_off = rtas_power_off; + ppc_md.halt = rtas_halt; } -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_PPC_POWERNV_RTAS */ static int __init pnv_probe(void) { @@ -152,6 +166,13 @@ static int __init pnv_probe(void) hpte_init_native(); + if (firmware_has_feature(FW_FEATURE_OPAL)) + pnv_setup_machdep_opal(); +#ifdef CONFIG_PPC_POWERNV_RTAS + else if (rtas.base) + pnv_setup_machdep_rtas(); +#endif /* CONFIG_PPC_POWERNV_RTAS */ + pr_debug("PowerNV detected !\n"); return 1; @@ -160,16 +181,10 @@ static int __init pnv_probe(void) define_machine(powernv) { .name = "PowerNV", .probe = pnv_probe, - .setup_arch = pnv_setup_arch, .init_early = pnv_init_early, + .setup_arch = pnv_setup_arch, .init_IRQ = pnv_init_IRQ, .show_cpuinfo = pnv_show_cpuinfo, - .restart = pnv_restart, - .power_off = pnv_power_off, - .halt = pnv_halt, - .get_boot_time = pnv_get_boot_time, - .get_rtc_time = pnv_get_rtc_time, - .set_rtc_time = pnv_set_rtc_time, .progress = pnv_progress, .power_save = power7_idle, .calibrate_decr = generic_calibrate_decr, -- cgit v1.2.3-70-g09d2 From 5c7c1e9444d8bfb721a27a35bba3eeb5236c75d8 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:45:02 +0000 Subject: powerpc/powernv: Add OPAL ICS backend OPAL handles HW access to the various ICS or equivalent chips for us (with the exception of p5ioc2 based HEA which uses a different backend) similarily to what RTAS does on pSeries. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/xics.h | 19 +++ arch/powerpc/sysdev/xics/Makefile | 1 + arch/powerpc/sysdev/xics/ics-opal.c | 244 +++++++++++++++++++++++++++++++++ arch/powerpc/sysdev/xics/xics-common.c | 8 +- 4 files changed, 266 insertions(+), 6 deletions(-) create mode 100644 arch/powerpc/sysdev/xics/ics-opal.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index b183a406201..bd6c401c0ee 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -27,10 +27,18 @@ #define MAX_NUM_PRIORITIES 3 /* Native ICP */ +#ifdef CONFIG_PPC_ICP_NATIVE extern int icp_native_init(void); +#else +static inline int icp_native_init(void) { return -ENODEV; } +#endif /* PAPR ICP */ +#ifdef CONFIG_PPC_ICP_HV extern int icp_hv_init(void); +#else +static inline int icp_hv_init(void) { return -ENODEV; } +#endif /* ICP ops */ struct icp_ops { @@ -51,7 +59,18 @@ extern const struct icp_ops *icp_ops; extern int ics_native_init(void); /* RTAS ICS */ +#ifdef CONFIG_PPC_ICS_RTAS extern int ics_rtas_init(void); +#else +static inline int ics_rtas_init(void) { return -ENODEV; } +#endif + +/* HAL ICS */ +#ifdef CONFIG_PPC_POWERNV +extern int ics_opal_init(void); +#else +static inline int ics_opal_init(void) { return -ENODEV; } +#endif /* ICS instance, hooked up to chip_data of an irq */ struct ics { diff --git a/arch/powerpc/sysdev/xics/Makefile b/arch/powerpc/sysdev/xics/Makefile index b75a6059337..c606aa8ba60 100644 --- a/arch/powerpc/sysdev/xics/Makefile +++ b/arch/powerpc/sysdev/xics/Makefile @@ -4,3 +4,4 @@ obj-y += xics-common.o obj-$(CONFIG_PPC_ICP_NATIVE) += icp-native.o obj-$(CONFIG_PPC_ICP_HV) += icp-hv.o obj-$(CONFIG_PPC_ICS_RTAS) += ics-rtas.o +obj-$(CONFIG_PPC_POWERNV) += ics-opal.o diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c new file mode 100644 index 00000000000..f7e8609df0d --- /dev/null +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -0,0 +1,244 @@ +/* + * ICS backend for OPAL managed interrupts. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static int ics_opal_mangle_server(int server) +{ + /* No link for now */ + return server << 2; +} + +static int ics_opal_unmangle_server(int server) +{ + /* No link for now */ + return server >> 2; +} + +static void ics_opal_unmask_irq(struct irq_data *d) +{ + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int64_t rc; + int server; + + pr_devel("ics-hal: unmask virq %d [hw 0x%x]\n", d->irq, hw_irq); + + if (hw_irq == XICS_IPI || hw_irq == XICS_IRQ_SPURIOUS) + return; + + server = xics_get_irq_server(d->irq, d->affinity, 0); + server = ics_opal_mangle_server(server); + + rc = opal_set_xive(hw_irq, server, DEFAULT_PRIORITY); + if (rc != OPAL_SUCCESS) + pr_err("%s: opal_set_xive(irq=%d [hw 0x%x] server=%x)" + " error %lld\n", + __func__, d->irq, hw_irq, server, rc); +} + +static unsigned int ics_opal_startup(struct irq_data *d) +{ +#ifdef CONFIG_PCI_MSI + /* + * The generic MSI code returns with the interrupt disabled on the + * card, using the MSI mask bits. Firmware doesn't appear to unmask + * at that level, so we do it here by hand. + */ + if (d->msi_desc) + unmask_msi_irq(d); +#endif + + /* unmask it */ + ics_opal_unmask_irq(d); + return 0; +} + +static void ics_opal_mask_real_irq(unsigned int hw_irq) +{ + int server = ics_opal_mangle_server(xics_default_server); + int64_t rc; + + if (hw_irq == XICS_IPI) + return; + + /* Have to set XIVE to 0xff to be able to remove a slot */ + rc = opal_set_xive(hw_irq, server, 0xff); + if (rc != OPAL_SUCCESS) + pr_err("%s: opal_set_xive(0xff) irq=%u returned %lld\n", + __func__, hw_irq, rc); +} + +static void ics_opal_mask_irq(struct irq_data *d) +{ + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + pr_devel("ics-hal: mask virq %d [hw 0x%x]\n", d->irq, hw_irq); + + if (hw_irq == XICS_IPI || hw_irq == XICS_IRQ_SPURIOUS) + return; + ics_opal_mask_real_irq(hw_irq); +} + +static int ics_opal_set_affinity(struct irq_data *d, + const struct cpumask *cpumask, + bool force) +{ + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int16_t server; + int8_t priority; + int64_t rc; + int wanted_server; + + if (hw_irq == XICS_IPI || hw_irq == XICS_IRQ_SPURIOUS) + return -1; + + rc = opal_get_xive(hw_irq, &server, &priority); + if (rc != OPAL_SUCCESS) { + pr_err("%s: opal_set_xive(irq=%d [hw 0x%x] server=%x)" + " error %lld\n", + __func__, d->irq, hw_irq, server, rc); + return -1; + } + + wanted_server = xics_get_irq_server(d->irq, cpumask, 1); + if (wanted_server < 0) { + char cpulist[128]; + cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask); + pr_warning("%s: No online cpus in the mask %s for irq %d\n", + __func__, cpulist, d->irq); + return -1; + } + server = ics_opal_mangle_server(wanted_server); + + pr_devel("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n", + d->irq, hw_irq, wanted_server, server); + + rc = opal_set_xive(hw_irq, server, priority); + if (rc != OPAL_SUCCESS) { + pr_err("%s: opal_set_xive(irq=%d [hw 0x%x] server=%x)" + " error %lld\n", + __func__, d->irq, hw_irq, server, rc); + return -1; + } + return 0; +} + +static struct irq_chip ics_opal_irq_chip = { + .name = "OPAL ICS", + .irq_startup = ics_opal_startup, + .irq_mask = ics_opal_mask_irq, + .irq_unmask = ics_opal_unmask_irq, + .irq_eoi = NULL, /* Patched at init time */ + .irq_set_affinity = ics_opal_set_affinity +}; + +static int ics_opal_map(struct ics *ics, unsigned int virq); +static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec); +static long ics_opal_get_server(struct ics *ics, unsigned long vec); + +static int ics_opal_host_match(struct ics *ics, struct device_node *node) +{ + return 1; +} + +/* Only one global & state struct ics */ +static struct ics ics_hal = { + .map = ics_opal_map, + .mask_unknown = ics_opal_mask_unknown, + .get_server = ics_opal_get_server, + .host_match = ics_opal_host_match, +}; + +static int ics_opal_map(struct ics *ics, unsigned int virq) +{ + unsigned int hw_irq = (unsigned int)virq_to_hw(virq); + int64_t rc; + int16_t server; + int8_t priority; + + if (WARN_ON(hw_irq == XICS_IPI || hw_irq == XICS_IRQ_SPURIOUS)) + return -EINVAL; + + /* Check if HAL knows about this interrupt */ + rc = opal_get_xive(hw_irq, &server, &priority); + if (rc != OPAL_SUCCESS) + return -ENXIO; + + irq_set_chip_and_handler(virq, &ics_opal_irq_chip, handle_fasteoi_irq); + irq_set_chip_data(virq, &ics_hal); + + return 0; +} + +static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec) +{ + int64_t rc; + int16_t server; + int8_t priority; + + /* Check if HAL knows about this interrupt */ + rc = opal_get_xive(vec, &server, &priority); + if (rc != OPAL_SUCCESS) + return; + + ics_opal_mask_real_irq(vec); +} + +static long ics_opal_get_server(struct ics *ics, unsigned long vec) +{ + int64_t rc; + int16_t server; + int8_t priority; + + /* Check if HAL knows about this interrupt */ + rc = opal_get_xive(vec, &server, &priority); + if (rc != OPAL_SUCCESS) + return -1; + return ics_opal_unmangle_server(server); +} + +int __init ics_opal_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + /* We need to patch our irq chip's EOI to point to the + * right ICP + */ + ics_opal_irq_chip.irq_eoi = icp_ops->eoi; + + /* Register ourselves */ + xics_register_ics(&ics_hal); + + pr_info("ICS OPAL backend registered\n"); + + return 0; +} diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 445c5a01b76..3d93a8ded0f 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -409,14 +409,10 @@ void __init xics_init(void) int rc = -1; /* Fist locate ICP */ -#ifdef CONFIG_PPC_ICP_HV if (firmware_has_feature(FW_FEATURE_LPAR)) rc = icp_hv_init(); -#endif -#ifdef CONFIG_PPC_ICP_NATIVE if (rc < 0) rc = icp_native_init(); -#endif if (rc < 0) { pr_warning("XICS: Cannot find a Presentation Controller !\n"); return; @@ -429,9 +425,9 @@ void __init xics_init(void) xics_ipi_chip.irq_eoi = icp_ops->eoi; /* Now locate ICS */ -#ifdef CONFIG_PPC_ICS_RTAS rc = ics_rtas_init(); -#endif + if (rc < 0) + rc = ics_opal_init(); if (rc < 0) pr_warning("XICS: Cannot find a Source Controller !\n"); -- cgit v1.2.3-70-g09d2 From ed79ba9e15f84cef05aba5cbfe6e93f9b43c31f4 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:45:04 +0000 Subject: powerpc/powernv: Machine check and other system interrupts OPAL can handle various interrupt for us such as Machine Checks (it performs all sorts of recovery tasks and passes back control to us with informations about the error), Hardware Management Interrupts and Softpatch interrupts. This wires up the mechanisms and prints out specific informations returned by HAL when a machine check occurs. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/opal.h | 2 + arch/powerpc/include/asm/paca.h | 8 ++ arch/powerpc/kernel/asm-offsets.c | 10 +++ arch/powerpc/kernel/exceptions-64s.S | 27 ++++++- arch/powerpc/platforms/powernv/opal.c | 130 +++++++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/setup.c | 1 + 6 files changed, 174 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 77ebe50020a..2893e8f5406 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -436,6 +436,8 @@ extern void opal_get_rtc_time(struct rtc_time *tm); extern unsigned long opal_get_boot_time(void); extern void opal_nvram_init(void); +extern int opal_machine_check(struct pt_regs *regs); + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 516bfb3f47d..17722c73ba2 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -43,6 +43,7 @@ extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */ #define get_slb_shadow() (get_paca()->slb_shadow_ptr) struct task_struct; +struct opal_machine_check_event; /* * Defines the layout of the paca. @@ -135,6 +136,13 @@ struct paca_struct { u8 io_sync; /* writel() needs spin_unlock sync */ u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ +#ifdef CONFIG_PPC_POWERNV + /* Pointer to OPAL machine check event structure set by the + * early exception handler for use by high level C handler + */ + struct opal_machine_check_event *opal_mc_evt; +#endif + /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ u64 system_time; /* accumulated system TB ticks */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 5f078bc2063..536ffa897c6 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -48,6 +48,9 @@ #ifdef CONFIG_PPC_ISERIES #include #endif +#ifdef CONFIG_PPC_POWERNV +#include +#endif #if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST) #include #endif @@ -609,5 +612,12 @@ int main(void) arch.timing_last_enter.tv32.tbl)); #endif +#ifdef CONFIG_PPC_POWERNV + DEFINE(OPAL_MC_GPR3, offsetof(struct opal_machine_check_event, gpr3)); + DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0)); + DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1)); + DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt)); +#endif + return 0; } diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 41b02c792aa..d51458fa8de 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1143,7 +1143,7 @@ _GLOBAL(do_stab_bolted) rfid b . /* prevent speculative execution */ -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. * This address (0x7000) is fixed by the RPA. @@ -1151,7 +1151,7 @@ _GLOBAL(do_stab_bolted) .= 0x7000 .globl fwnmi_data_area fwnmi_data_area: -#endif /* CONFIG_PPC_PSERIES */ +#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ /* iSeries does not use the FWNMI stuff, so it is safe to put * this here, even if we later allow kernels that will boot on @@ -1176,9 +1176,12 @@ xLparMap: #endif /* CONFIG_PPC_ISERIES */ -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) + /* pseries and powernv need to keep the whole page from + * 0x7000 to 0x8000 free for use by the firmware + */ . = 0x8000 -#endif /* CONFIG_PPC_PSERIES */ +#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ /* * Space for CPU0's segment table. @@ -1193,3 +1196,19 @@ xLparMap: .globl initial_stab initial_stab: .space 4096 +#ifdef CONFIG_PPC_POWERNV +_GLOBAL(opal_mc_secondary_handler) + HMT_MEDIUM + SET_SCRATCH0(r13) + GET_PACA(r13) + clrldi r3,r3,2 + tovirt(r3,r3) + std r3,PACA_OPAL_MC_EVT(r13) + ld r13,OPAL_MC_SRR0(r3) + mtspr SPRN_SRR0,r13 + ld r13,OPAL_MC_SRR1(r3) + mtspr SPRN_SRR1,r13 + ld r3,OPAL_MC_GPR3(r3) + GET_SCRATCH0(r13) + b machine_check_pSeries +#endif /* CONFIG_PPC_POWERNV */ diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 5a598caefcb..aaa0dba4947 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -27,12 +27,14 @@ struct opal { static struct device_node *opal_node; static DEFINE_SPINLOCK(opal_write_lock); +extern u64 opal_mc_secondary_handler[]; int __init early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data) { const void *basep, *entryp; unsigned long basesz, entrysz; + u64 glue; if (depth != 1 || strcmp(uname, "ibm,opal") != 0) return 0; @@ -59,6 +61,19 @@ int __init early_init_dt_scan_opal(unsigned long node, printk("OPAL V1 detected !\n"); } + /* Hookup some exception handlers. We use the fwnmi area at 0x7000 + * to provide the glue space to OPAL + */ + glue = 0x7000; + opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER, + __pa(opal_mc_secondary_handler[0]), + glue); + glue += 128; + opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER, + 0, glue); + glue += 128; + opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); + return 1; } @@ -136,6 +151,121 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) return written; } +int opal_machine_check(struct pt_regs *regs) +{ + struct opal_machine_check_event *opal_evt = get_paca()->opal_mc_evt; + struct opal_machine_check_event evt; + const char *level, *sevstr, *subtype; + static const char *opal_mc_ue_types[] = { + "Indeterminate", + "Instruction fetch", + "Page table walk ifetch", + "Load/Store", + "Page table walk Load/Store", + }; + static const char *opal_mc_slb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *opal_mc_erat_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *opal_mc_tlb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + + /* Copy the event structure and release the original */ + evt = *opal_evt; + opal_evt->in_use = 0; + + /* Print things out */ + if (evt.version != OpalMCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; + } + switch(evt.severity) { + case OpalMCE_SEV_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case OpalMCE_SEV_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case OpalMCE_SEV_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case OpalMCE_SEV_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, + evt.disposition == OpalMCE_DISPOSITION_RECOVERED ? + "Recovered" : "[Not recovered"); + printk("%s Initiator: %s\n", level, + evt.initiator == OpalMCE_INITIATOR_CPU ? "CPU" : "Unknown"); + switch(evt.error_type) { + case OpalMCE_ERROR_TYPE_UE: + subtype = evt.u.ue_error.ue_error_type < + ARRAY_SIZE(opal_mc_ue_types) ? + opal_mc_ue_types[evt.u.ue_error.ue_error_type] + : "Unknown"; + printk("%s Error type: UE [%s]\n", level, subtype); + if (evt.u.ue_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.ue_error.effective_address); + if (evt.u.ue_error.physical_address_provided) + printk("%s Physial address: %016llx\n", + level, evt.u.ue_error.physical_address); + break; + case OpalMCE_ERROR_TYPE_SLB: + subtype = evt.u.slb_error.slb_error_type < + ARRAY_SIZE(opal_mc_slb_types) ? + opal_mc_slb_types[evt.u.slb_error.slb_error_type] + : "Unknown"; + printk("%s Error type: SLB [%s]\n", level, subtype); + if (evt.u.slb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.slb_error.effective_address); + break; + case OpalMCE_ERROR_TYPE_ERAT: + subtype = evt.u.erat_error.erat_error_type < + ARRAY_SIZE(opal_mc_erat_types) ? + opal_mc_erat_types[evt.u.erat_error.erat_error_type] + : "Unknown"; + printk("%s Error type: ERAT [%s]\n", level, subtype); + if (evt.u.erat_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.erat_error.effective_address); + break; + case OpalMCE_ERROR_TYPE_TLB: + subtype = evt.u.tlb_error.tlb_error_type < + ARRAY_SIZE(opal_mc_tlb_types) ? + opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type] + : "Unknown"; + printk("%s Error type: TLB [%s]\n", level, subtype); + if (evt.u.tlb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.tlb_error.effective_address); + break; + default: + case OpalMCE_ERROR_TYPE_UNKNOWN: + printk("%s Error type: Unknown\n", level); + break; + } + return evt.severity == OpalMCE_SEV_FATAL ? 0 : 1; +} + static irqreturn_t opal_interrupt(int irq, void *data) { uint64_t events; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 4a2b2e27959..f0242f3fd3e 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -141,6 +141,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.restart = pnv_restart; ppc_md.power_off = pnv_power_off; ppc_md.halt = pnv_halt; + ppc_md.machine_check_exception = opal_machine_check; } #ifdef CONFIG_PPC_POWERNV_RTAS -- cgit v1.2.3-70-g09d2 From 37caf9f2a1b99d11ba71e17168d221da9ca13f24 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Sat, 27 Aug 2011 06:14:23 -0500 Subject: powerpc/fsl-booke: Handle L1 D-cache parity error correctly on e500mc If the L1 D-Cache is in write shadow mode the HW will auto-recover the error. However we might still log the error and cause a machine check (if L1CSR0[CPE] - Cache error checking enable). We should only treat the non-write shadow case as non-recoverable. Signed-off-by: Kumar Gala --- arch/powerpc/include/asm/reg_booke.h | 3 +++ arch/powerpc/kernel/traps.c | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 9ec0b39f9dd..28cdbd9f399 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -548,6 +548,9 @@ #define L1CSR1_ICFI 0x00000002 /* Instr Cache Flash Invalidate */ #define L1CSR1_ICE 0x00000001 /* Instr Cache Enable */ +/* Bit definitions for L1CSR2. */ +#define L1CSR2_DCWS 0x40000000 /* Data Cache write shadow */ + /* Bit definitions for L2CSR0. */ #define L2CSR0_L2E 0x80000000 /* L2 Cache Enable */ #define L2CSR0_L2PE 0x40000000 /* L2 Cache Parity/ECC Enable */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f19d9777d3c..4e5908264d1 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -457,7 +457,14 @@ int machine_check_e500mc(struct pt_regs *regs) if (reason & MCSR_DCPERR_MC) { printk("Data Cache Parity Error\n"); - recoverable = 0; + + /* + * In write shadow mode we auto-recover from the error, but it + * may still get logged and cause a machine check. We should + * only treat the non-write shadow case as non-recoverable. + */ + if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) + recoverable = 0; } if (reason & MCSR_L2MMU_MHIT) { -- cgit v1.2.3-70-g09d2