diff options
Diffstat (limited to 'arch/powerpc')
34 files changed, 4606 insertions, 177 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c523029674e..94df74bcc0e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -261,7 +261,7 @@ config PPC_ISERIES config EMBEDDED6xx bool "Embedded 6xx/7xx/7xxx-based board" - depends on PPC32 + depends on PPC32 && BROKEN config APUS bool "Amiga-APUS" @@ -305,7 +305,7 @@ config PPC_PMAC64 config PPC_PREP bool " PowerPC Reference Platform (PReP) based machines" - depends on PPC_MULTIPLATFORM && PPC32 + depends on PPC_MULTIPLATFORM && PPC32 && BROKEN select PPC_I8259 select PPC_INDIRECT_PCI default y @@ -932,6 +932,7 @@ source "arch/powerpc/oprofile/Kconfig" config KPROBES bool "Kprobes (EXPERIMENTAL)" + depends on PPC64 help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 5bc11bd36c1..d41ad2e675d 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -187,7 +187,7 @@ archprepare: checkbin # Temporary hack until we have migrated to asm-powerpc include/asm: arch/$(ARCH)/include/asm -arch/$(ARCH)/include/asm: +arch/$(ARCH)/include/asm: FORCE $(Q)if [ ! -d arch/$(ARCH)/include ]; then mkdir -p arch/$(ARCH)/include; fi $(Q)ln -fsn $(srctree)/include/asm-$(OLDARCH) arch/$(ARCH)/include/asm diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 9f09dff9e11..913962c1dae 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -1,18 +1,33 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-rc4 -# Thu Oct 20 08:32:17 2005 +# Linux kernel version: 2.6.15-rc1 +# Mon Nov 14 15:27:00 2005 # +CONFIG_PPC64=y CONFIG_64BIT=y +CONFIG_PPC_MERGE=y CONFIG_MMU=y +CONFIG_GENERIC_HARDIRQS=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_GENERIC_ISA_DMA=y +CONFIG_PPC=y CONFIG_EARLY_PRINTK=y CONFIG_COMPAT=y +CONFIG_SYSVIPC_COMPAT=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_FORCE_MAX_ZONEORDER=13 + +# +# Processor support +# +# CONFIG_POWER4_ONLY is not set +CONFIG_POWER3=y +CONFIG_POWER4=y +CONFIG_PPC_FPU=y +CONFIG_ALTIVEC=y +CONFIG_PPC_STD_MMU=y +CONFIG_SMP=y +CONFIG_NR_CPUS=128 # # Code maturity level options @@ -68,75 +83,103 @@ CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_KMOD=y CONFIG_STOP_MACHINE=y -CONFIG_SYSVIPC_COMPAT=y + +# +# Block layer +# + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Platform support # -# CONFIG_PPC_ISERIES is not set CONFIG_PPC_MULTIPLATFORM=y +# CONFIG_PPC_ISERIES is not set +# CONFIG_EMBEDDED6xx is not set +# CONFIG_APUS is not set CONFIG_PPC_PSERIES=y -# CONFIG_PPC_BPA is not set # CONFIG_PPC_PMAC is not set # CONFIG_PPC_MAPLE is not set -CONFIG_PPC=y -CONFIG_PPC64=y +# CONFIG_PPC_CELL is not set CONFIG_PPC_OF=y CONFIG_XICS=y +# CONFIG_U3_DART is not set CONFIG_MPIC=y -CONFIG_ALTIVEC=y -CONFIG_PPC_SPLPAR=y -CONFIG_KEXEC=y +CONFIG_PPC_RTAS=y +CONFIG_RTAS_ERROR_LOGGING=y +CONFIG_RTAS_PROC=y +CONFIG_RTAS_FLASH=m +# CONFIG_MMIO_NVRAM is not set CONFIG_IBMVIO=y -# CONFIG_U3_DART is not set -# CONFIG_BOOTX_TEXT is not set -# CONFIG_POWER4_ONLY is not set +# CONFIG_PPC_MPC106 is not set +# CONFIG_GENERIC_TBSYNC is not set +# CONFIG_CPU_FREQ is not set +# CONFIG_WANT_EARLY_SERIAL is not set + +# +# Kernel options +# +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +# CONFIG_PREEMPT_BKL is not set +CONFIG_BINFMT_ELF=y +# CONFIG_BINFMT_MISC is not set +CONFIG_FORCE_MAX_ZONEORDER=13 CONFIG_IOMMU_VMERGE=y -CONFIG_SMP=y -CONFIG_NR_CPUS=128 +CONFIG_HOTPLUG_CPU=y +CONFIG_KEXEC=y +# CONFIG_IRQ_ALL_CPUS is not set +CONFIG_PPC_SPLPAR=y +CONFIG_EEH=y +CONFIG_SCANLOG=m +CONFIG_LPARCFG=y +CONFIG_NUMA=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_FLATMEM_ENABLE=y -CONFIG_ARCH_DISCONTIGMEM_ENABLE=y -CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y CONFIG_SELECT_MEMORY_MODEL=y # CONFIG_FLATMEM_MANUAL is not set -CONFIG_DISCONTIGMEM_MANUAL=y -# CONFIG_SPARSEMEM_MANUAL is not set -CONFIG_DISCONTIGMEM=y -CONFIG_FLAT_NODE_MEM_MAP=y +# CONFIG_DISCONTIGMEM_MANUAL is not set +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPARSEMEM_EXTREME=y +# CONFIG_MEMORY_HOTPLUG is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_NODES_SPAN_OTHER_NODES=y -CONFIG_NUMA=y +# CONFIG_PPC_64K_PAGES is not set CONFIG_SCHED_SMT=y -CONFIG_PREEMPT_NONE=y -# CONFIG_PREEMPT_VOLUNTARY is not set -# CONFIG_PREEMPT is not set -# CONFIG_PREEMPT_BKL is not set -# CONFIG_HZ_100 is not set -CONFIG_HZ_250=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=250 -CONFIG_EEH=y -CONFIG_GENERIC_HARDIRQS=y -CONFIG_PPC_RTAS=y -CONFIG_RTAS_PROC=y -CONFIG_RTAS_FLASH=m -CONFIG_SCANLOG=m -CONFIG_LPARCFG=y -CONFIG_SECCOMP=y -CONFIG_BINFMT_ELF=y -# CONFIG_BINFMT_MISC is not set -CONFIG_HOTPLUG_CPU=y CONFIG_PROC_DEVICETREE=y # CONFIG_CMDLINE_BOOL is not set +# CONFIG_PM is not set +CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y # -# Bus Options +# Bus options # +CONFIG_GENERIC_ISA_DMA=y +CONFIG_PPC_I8259=y +# CONFIG_PPC_INDIRECT_PCI is not set CONFIG_PCI=y CONFIG_PCI_DOMAINS=y CONFIG_PCI_LEGACY_PROC=y @@ -156,6 +199,7 @@ CONFIG_HOTPLUG_PCI=m # CONFIG_HOTPLUG_PCI_SHPC is not set CONFIG_HOTPLUG_PCI_RPA=m CONFIG_HOTPLUG_PCI_RPA_DLPAR=m +CONFIG_KERNEL_START=0xc000000000000000 # # Networking @@ -197,6 +241,10 @@ CONFIG_TCP_CONG_BIC=y # CONFIG_IPV6 is not set CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set + +# +# Core Netfilter Configuration +# CONFIG_NETFILTER_NETLINK=y CONFIG_NETFILTER_NETLINK_QUEUE=m CONFIG_NETFILTER_NETLINK_LOG=m @@ -299,6 +347,10 @@ CONFIG_LLC=y # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set + +# +# QoS and/or fair queueing +# # CONFIG_NET_SCHED is not set CONFIG_NET_CLS_ROUTE=y @@ -368,14 +420,6 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=65536 CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y # CONFIG_ATA_OVER_ETH is not set # @@ -473,6 +517,7 @@ CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers # +# CONFIG_ISCSI_TCP is not set # CONFIG_BLK_DEV_3W_XXXX_RAID is not set # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set @@ -559,6 +604,7 @@ CONFIG_DM_MULTIPATH_EMC=m # # Macintosh device drivers # +# CONFIG_WINDFARM is not set # # Network device support @@ -645,7 +691,6 @@ CONFIG_IXGB=m # CONFIG_IXGB_NAPI is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set -# CONFIG_2BUFF_MODE is not set # # Token Ring devices @@ -674,6 +719,7 @@ CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m CONFIG_PPP_DEFLATE=m CONFIG_PPP_BSDCOMP=m +# CONFIG_PPP_MPPE is not set CONFIG_PPPOE=m # CONFIG_SLIP is not set # CONFIG_NET_FC is not set @@ -784,6 +830,8 @@ CONFIG_HVCS=m # # CONFIG_WATCHDOG is not set # CONFIG_RTC is not set +CONFIG_GEN_RTC=y +# CONFIG_GEN_RTC_X is not set # CONFIG_DTLK is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set @@ -801,6 +849,7 @@ CONFIG_MAX_RAW_DEVS=1024 # TPM devices # # CONFIG_TCG_TPM is not set +# CONFIG_TELCLOCK is not set # # I2C support @@ -852,6 +901,7 @@ CONFIG_I2C_ALGOBIT=y # CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_RTC8564 is not set # CONFIG_SENSORS_MAX6875 is not set +# CONFIG_RTC_X1205_I2C is not set # CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_ALGO is not set # CONFIG_I2C_DEBUG_BUS is not set @@ -893,7 +943,6 @@ CONFIG_FB=y CONFIG_FB_CFB_FILLRECT=y CONFIG_FB_CFB_COPYAREA=y CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SOFT_CURSOR=y CONFIG_FB_MACMODES=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y @@ -905,6 +954,7 @@ CONFIG_FB_OF=y # CONFIG_FB_ASILIANT is not set # CONFIG_FB_IMSTT is not set # CONFIG_FB_VGA16 is not set +# CONFIG_FB_S1D13XXX is not set # CONFIG_FB_NVIDIA is not set # CONFIG_FB_RIVA is not set CONFIG_FB_MATROX=y @@ -927,7 +977,6 @@ CONFIG_FB_RADEON_I2C=y # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_CYBLA is not set # CONFIG_FB_TRIDENT is not set -# CONFIG_FB_S1D13XXX is not set # CONFIG_FB_VIRTUAL is not set # @@ -936,6 +985,7 @@ CONFIG_FB_RADEON_I2C=y # CONFIG_VGA_CONSOLE is not set CONFIG_DUMMY_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE=y +# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set # CONFIG_FONTS is not set CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y @@ -990,12 +1040,15 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y # # USB Device Class drivers # -# CONFIG_USB_BLUETOOTH_TTY is not set # CONFIG_USB_ACM is not set # CONFIG_USB_PRINTER is not set # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# may also be needed; see USB_STORAGE Help for more information # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1106,6 +1159,7 @@ CONFIG_INFINIBAND_MTHCA=m # CONFIG_INFINIBAND_MTHCA_DEBUG is not set CONFIG_INFINIBAND_IPOIB=m # CONFIG_INFINIBAND_IPOIB_DEBUG is not set +# CONFIG_INFINIBAND_SRP is not set # # SN Devices @@ -1288,10 +1342,25 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_NLS_UTF8 is not set # -# Profiling support +# Library routines +# +CONFIG_CRC_CCITT=m +# CONFIG_CRC16 is not set +CONFIG_CRC32=y +CONFIG_LIBCRC32C=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m + +# +# Instrumentation Support # CONFIG_PROFILING=y CONFIG_OPROFILE=y +# CONFIG_KPROBES is not set # # Kernel hacking @@ -1308,14 +1377,15 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set CONFIG_DEBUG_FS=y +# CONFIG_DEBUG_VM is not set +# CONFIG_RCU_TORTURE_TEST is not set CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_KPROBES is not set CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUGGER=y CONFIG_XMON=y CONFIG_XMON_DEFAULT=y -# CONFIG_PPCDBG is not set CONFIG_IRQSTACKS=y +# CONFIG_BOOTX_TEXT is not set # # Security options @@ -1355,17 +1425,3 @@ CONFIG_CRYPTO_TEST=m # # Hardware crypto devices # - -# -# Library routines -# -CONFIG_CRC_CCITT=m -# CONFIG_CRC16 is not set -CONFIG_CRC32=y -CONFIG_LIBCRC32C=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=m -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 9a74b7ab03a..4970e3721a8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_PPC_OF) += of_device.o procfs-$(CONFIG_PPC64) := proc_ppc64.o obj-$(CONFIG_PROC_FS) += $(procfs-y) rtaspci-$(CONFIG_PPC64) := rtas_pci.o -obj-$(CONFIG_PPC_RTAS) += rtas.o $(rtaspci-y) +obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y) obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o obj-$(CONFIG_RTAS_PROC) += rtas-proc.o obj-$(CONFIG_LPARCFG) += lparcfg.o @@ -49,12 +49,23 @@ extra-y += vmlinux.lds obj-y += process.o init_task.o time.o \ prom.o traps.o setup-common.o obj-$(CONFIG_PPC32) += entry_32.o setup_32.o misc_32.o systbl.o -obj-$(CONFIG_PPC64) += misc_64.o +obj-$(CONFIG_PPC64) += misc_64.o dma_64.o iommu.o obj-$(CONFIG_PPC_OF) += prom_init.o obj-$(CONFIG_MODULES) += ppc_ksyms.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_6xx) += idle_6xx.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_KPROBES) += kprobes.o + +module-$(CONFIG_PPC64) += module_64.o +obj-$(CONFIG_MODULES) += $(module-y) + +pci64-$(CONFIG_PPC64) += pci_64.o pci_dn.o pci_iommu.o \ + pci_direct_iommu.o iomap.o +obj-$(CONFIG_PCI) += $(pci64-y) + +kexec64-$(CONFIG_PPC64) += machine_kexec_64.o +obj-$(CONFIG_KEXEC) += $(kexec64-y) ifeq ($(CONFIG_PPC_ISERIES),y) $(obj)/head_64.o: $(obj)/lparmap.s @@ -62,11 +73,8 @@ AFLAGS_head_64.o += -I$(obj) endif else -# stuff used from here for ARCH=ppc or ARCH=ppc64 +# stuff used from here for ARCH=ppc smpobj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_PPC64) += traps.o process.o init_task.o time.o \ - setup-common.o $(smpobj-y) - endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4550eb4f4fb..91538d2445b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -270,13 +270,15 @@ int main(void) DEFINE(TVAL64_TV_USEC, offsetof(struct timeval, tv_usec)); DEFINE(TVAL32_TV_SEC, offsetof(struct compat_timeval, tv_sec)); DEFINE(TVAL32_TV_USEC, offsetof(struct compat_timeval, tv_usec)); + DEFINE(TSPC64_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC64_TV_NSEC, offsetof(struct timespec, tv_nsec)); DEFINE(TSPC32_TV_SEC, offsetof(struct compat_timespec, tv_sec)); DEFINE(TSPC32_TV_NSEC, offsetof(struct compat_timespec, tv_nsec)); #else DEFINE(TVAL32_TV_SEC, offsetof(struct timeval, tv_sec)); DEFINE(TVAL32_TV_USEC, offsetof(struct timeval, tv_usec)); - DEFINE(TSPEC32_TV_SEC, offsetof(struct timespec, tv_sec)); - DEFINE(TSPEC32_TV_NSEC, offsetof(struct timespec, tv_nsec)); + DEFINE(TSPC32_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC32_TV_NSEC, offsetof(struct timespec, tv_nsec)); #endif /* timeval/timezone offsets for use by vdso */ DEFINE(TZONE_TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c new file mode 100644 index 00000000000..7c3419656cc --- /dev/null +++ b/arch/powerpc/kernel/dma_64.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Implements the generic device dma API for ppc64. Handles + * the pci and vio busses + */ + +#include <linux/device.h> +#include <linux/dma-mapping.h> +/* Include the busses we support */ +#include <linux/pci.h> +#include <asm/vio.h> +#include <asm/scatterlist.h> +#include <asm/bug.h> + +static struct dma_mapping_ops *get_dma_ops(struct device *dev) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return &pci_dma_ops; +#endif +#ifdef CONFIG_IBMVIO + if (dev->bus == &vio_bus_type) + return &vio_dma_ops; +#endif + return NULL; +} + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->dma_supported(dev, mask); + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_supported); + +int dma_set_mask(struct device *dev, u64 dma_mask) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return pci_set_dma_mask(to_pci_dev(dev), dma_mask); +#endif +#ifdef CONFIG_IBMVIO + if (dev->bus == &vio_bus_type) + return -EIO; +#endif /* CONFIG_IBMVIO */ + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->alloc_coherent(dev, size, dma_handle, flag); + BUG(); + return NULL; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); + else + BUG(); +} +EXPORT_SYMBOL(dma_free_coherent); + +dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_single(dev, cpu_addr, size, direction); + BUG(); + return (dma_addr_t)0; +} +EXPORT_SYMBOL(dma_map_single); + +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_single(dev, dma_addr, size, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_single); + +dma_addr_t dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_single(dev, + (page_address(page) + offset), size, direction); + BUG(); + return (dma_addr_t)0; +} +EXPORT_SYMBOL(dma_map_page); + +void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_single(dev, dma_address, size, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_page); + +int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_sg(dev, sg, nents, direction); + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_map_sg); + +void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_sg(dev, sg, nhwentries, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_sg); diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c new file mode 100644 index 00000000000..6160c8dbb7c --- /dev/null +++ b/arch/powerpc/kernel/iomap.c @@ -0,0 +1,146 @@ +/* + * arch/ppc64/kernel/iomap.c + * + * ppc64 "iomap" interface implementation. + * + * (C) Copyright 2004 Linus Torvalds + */ +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/mm.h> +#include <asm/io.h> + +/* + * Here comes the ppc64 implementation of the IOMAP + * interfaces. + */ +unsigned int fastcall ioread8(void __iomem *addr) +{ + return readb(addr); +} +unsigned int fastcall ioread16(void __iomem *addr) +{ + return readw(addr); +} +unsigned int fastcall ioread16be(void __iomem *addr) +{ + return in_be16(addr); +} +unsigned int fastcall ioread32(void __iomem *addr) +{ + return readl(addr); +} +unsigned int fastcall ioread32be(void __iomem *addr) +{ + return in_be32(addr); +} +EXPORT_SYMBOL(ioread8); +EXPORT_SYMBOL(ioread16); +EXPORT_SYMBOL(ioread16be); +EXPORT_SYMBOL(ioread32); +EXPORT_SYMBOL(ioread32be); + +void fastcall iowrite8(u8 val, void __iomem *addr) +{ + writeb(val, addr); +} +void fastcall iowrite16(u16 val, void __iomem *addr) +{ + writew(val, addr); +} +void fastcall iowrite16be(u16 val, void __iomem *addr) +{ + out_be16(addr, val); +} +void fastcall iowrite32(u32 val, void __iomem *addr) +{ + writel(val, addr); +} +void fastcall iowrite32be(u32 val, void __iomem *addr) +{ + out_be32(addr, val); +} +EXPORT_SYMBOL(iowrite8); +EXPORT_SYMBOL(iowrite16); +EXPORT_SYMBOL(iowrite16be); +EXPORT_SYMBOL(iowrite32); +EXPORT_SYMBOL(iowrite32be); + +/* + * These are the "repeat read/write" functions. Note the + * non-CPU byte order. We do things in "IO byteorder" + * here. + * + * FIXME! We could make these do EEH handling if we really + * wanted. Not clear if we do. + */ +void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insb((u8 __iomem *) addr, dst, count); +} +void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insw_ns((u16 __iomem *) addr, dst, count); +} +void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insl_ns((u32 __iomem *) addr, dst, count); +} +EXPORT_SYMBOL(ioread8_rep); +EXPORT_SYMBOL(ioread16_rep); +EXPORT_SYMBOL(ioread32_rep); + +void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsb((u8 __iomem *) addr, src, count); +} +void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsw_ns((u16 __iomem *) addr, src, count); +} +void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsl_ns((u32 __iomem *) addr, src, count); +} +EXPORT_SYMBOL(iowrite8_rep); +EXPORT_SYMBOL(iowrite16_rep); +EXPORT_SYMBOL(iowrite32_rep); + +void __iomem *ioport_map(unsigned long port, unsigned int len) +{ + if (!_IO_IS_VALID(port)) + return NULL; + return (void __iomem *) (port+pci_io_base); +} + +void ioport_unmap(void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(ioport_map); +EXPORT_SYMBOL(ioport_unmap); + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max) +{ + unsigned long start = pci_resource_start(dev, bar); + unsigned long len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (!len) + return NULL; + if (max && len > max) + len = max; + if (flags & IORESOURCE_IO) + return ioport_map(start, len); + if (flags & IORESOURCE_MEM) + return ioremap(start, len); + /* What? */ + return NULL; +} + +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(pci_iomap); +EXPORT_SYMBOL(pci_iounmap); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c new file mode 100644 index 00000000000..4d9b4388918 --- /dev/null +++ b/arch/powerpc/kernel/iommu.c @@ -0,0 +1,572 @@ +/* + * arch/ppc64/kernel/iommu.c + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup, new allocation schemes, virtual merging: + * Copyright (C) 2004 Olof Johansson, IBM Corporation + * and Ben. Herrenschmidt, IBM Corporation + * + * Dynamic DMA mapping support, bus-independent parts. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> + +#define DBG(...) + +#ifdef CONFIG_IOMMU_VMERGE +static int novmerge = 0; +#else +static int novmerge = 1; +#endif + +static int __init setup_iommu(char *str) +{ + if (!strcmp(str, "novmerge")) + novmerge = 1; + else if (!strcmp(str, "vmerge")) + novmerge = 0; + return 1; +} + +__setup("iommu=", setup_iommu); + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, + unsigned long npages, + unsigned long *handle, + unsigned int align_order) +{ + unsigned long n, end, i, start; + unsigned long limit; + int largealloc = npages > 15; + int pass = 0; + unsigned long align_mask; + + align_mask = 0xffffffffffffffffl >> (64 - align_order); + + /* This allocator was derived from x86_64's bit string search */ + + /* Sanity check */ + if (unlikely(npages) == 0) { + if (printk_ratelimit()) + WARN_ON(1); + return DMA_ERROR_CODE; + } + + if (handle && *handle) + start = *handle; + else + start = largealloc ? tbl->it_largehint : tbl->it_hint; + + /* Use only half of the table for small allocs (15 pages or less) */ + limit = largealloc ? tbl->it_size : tbl->it_halfpoint; + + if (largealloc && start < tbl->it_halfpoint) + start = tbl->it_halfpoint; + + /* The case below can happen if we have a small segment appended + * to a large, or when the previous alloc was at the very end of + * the available space. If so, go back to the initial start. + */ + if (start >= limit) + start = largealloc ? tbl->it_largehint : tbl->it_hint; + + again: + + n = find_next_zero_bit(tbl->it_map, limit, start); + + /* Align allocation */ + n = (n + align_mask) & ~align_mask; + + end = n + npages; + + if (unlikely(end >= limit)) { + if (likely(pass < 2)) { + /* First failure, just rescan the half of the table. + * Second failure, rescan the other half of the table. + */ + start = (largealloc ^ pass) ? tbl->it_halfpoint : 0; + limit = pass ? tbl->it_size : limit; + pass++; + goto again; + } else { + /* Third failure, give up */ + return DMA_ERROR_CODE; + } + } + + for (i = n; i < end; i++) + if (test_bit(i, tbl->it_map)) { + start = i+1; + goto again; + } + + for (i = n; i < end; i++) + __set_bit(i, tbl->it_map); + + /* Bump the hint to a new block for small allocs. */ + if (largealloc) { + /* Don't bump to new block to avoid fragmentation */ + tbl->it_largehint = end; + } else { + /* Overflow will be taken care of at the next allocation */ + tbl->it_hint = (end + tbl->it_blocksize - 1) & + ~(tbl->it_blocksize - 1); + } + + /* Update handle for SG allocations */ + if (handle) + *handle = end; + + return n; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page, + unsigned int npages, enum dma_data_direction direction, + unsigned int align_order) +{ + unsigned long entry, flags; + dma_addr_t ret = DMA_ERROR_CODE; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + entry = iommu_range_alloc(tbl, npages, NULL, align_order); + + if (unlikely(entry == DMA_ERROR_CODE)) { + spin_unlock_irqrestore(&(tbl->it_lock), flags); + return DMA_ERROR_CODE; + } + + entry += tbl->it_offset; /* Offset into real TCE table */ + ret = entry << PAGE_SHIFT; /* Set the return dma address */ + + /* Put the TCEs in the HW table */ + ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK, + direction); + + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); + + /* Make sure updates are seen by hardware */ + mb(); + + return ret; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry, free_entry; + unsigned long i; + + entry = dma_addr >> PAGE_SHIFT; + free_entry = entry - tbl->it_offset; + + if (((free_entry + npages) > tbl->it_size) || + (entry < tbl->it_offset)) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_free: invalid entry\n"); + printk(KERN_INFO "\tentry = 0x%lx\n", entry); + printk(KERN_INFO "\tdma_addr = 0x%lx\n", (u64)dma_addr); + printk(KERN_INFO "\tTable = 0x%lx\n", (u64)tbl); + printk(KERN_INFO "\tbus# = 0x%lx\n", (u64)tbl->it_busno); + printk(KERN_INFO "\tsize = 0x%lx\n", (u64)tbl->it_size); + printk(KERN_INFO "\tstartOff = 0x%lx\n", (u64)tbl->it_offset); + printk(KERN_INFO "\tindex = 0x%lx\n", (u64)tbl->it_index); + WARN_ON(1); + } + return; + } + + ppc_md.tce_free(tbl, entry, npages); + + for (i = 0; i < npages; i++) + __clear_bit(free_entry+i, tbl->it_map); +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long flags; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + __iommu_free(tbl, dma_addr, npages); + + /* Make sure TLB cache is flushed if the HW needs it. We do + * not do an mb() here on purpose, it is not needed on any of + * the current platforms. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); +} + +int iommu_map_sg(struct device *dev, struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) +{ + dma_addr_t dma_next = 0, dma_addr; + unsigned long flags; + struct scatterlist *s, *outs, *segstart; + int outcount, incount; + unsigned long handle; + + BUG_ON(direction == DMA_NONE); + + if ((nelems == 0) || !tbl) + return 0; + + outs = s = segstart = &sglist[0]; + outcount = 1; + incount = nelems; + handle = 0; + + /* Init first segment length for backout at failure */ + outs->dma_length = 0; + + DBG("mapping %d elements:\n", nelems); + + spin_lock_irqsave(&(tbl->it_lock), flags); + + for (s = outs; nelems; nelems--, s++) { + unsigned long vaddr, npages, entry, slen; + + slen = s->length; + /* Sanity check */ + if (slen == 0) { + dma_next = 0; + continue; + } + /* Allocate iommu entries for that segment */ + vaddr = (unsigned long)page_address(s->page) + s->offset; + npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK); + npages >>= PAGE_SHIFT; + entry = iommu_range_alloc(tbl, npages, &handle, 0); + + DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); + + /* Handle failure */ + if (unlikely(entry == DMA_ERROR_CODE)) { + if (printk_ratelimit()) + printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx" + " npages %lx\n", tbl, vaddr, npages); + goto failure; + } + + /* Convert entry to a dma_addr_t */ + entry += tbl->it_offset; + dma_addr = entry << PAGE_SHIFT; + dma_addr |= s->offset; + + DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n", + npages, entry, dma_addr); + + /* Insert into HW table */ + ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction); + + /* If we are in an open segment, try merging */ + if (segstart != s) { + DBG(" - trying merge...\n"); + /* We cannot merge if: + * - allocated dma_addr isn't contiguous to previous allocation + */ + if (novmerge || (dma_addr != dma_next)) { + /* Can't merge: create a new segment */ + segstart = s; + outcount++; outs++; + DBG(" can't merge, new segment.\n"); + } else { + outs->dma_length += s->length; + DBG(" merged, new len: %lx\n", outs->dma_length); + } + } + + if (segstart == s) { + /* This is a new segment, fill entries */ + DBG(" - filling new segment.\n"); + outs->dma_address = dma_addr; + outs->dma_length = slen; + } + + /* Calculate next page pointer for contiguous check */ + dma_next = dma_addr + slen; + + DBG(" - dma next is: %lx\n", dma_next); + } + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); + + /* Make sure updates are seen by hardware */ + mb(); + + DBG("mapped %d elements:\n", outcount); + + /* For the sake of iommu_unmap_sg, we clear out the length in the + * next entry of the sglist if we didn't fill the list completely + */ + if (outcount < incount) { + outs++; + outs->dma_address = DMA_ERROR_CODE; + outs->dma_length = 0; + } + return outcount; + + failure: + for (s = &sglist[0]; s <= outs; s++) { + if (s->dma_length != 0) { + unsigned long vaddr, npages; + + vaddr = s->dma_address & PAGE_MASK; + npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr) + >> PAGE_SHIFT; + __iommu_free(tbl, vaddr, npages); + } + } + spin_unlock_irqrestore(&(tbl->it_lock), flags); + return 0; +} + + +void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + unsigned long flags; + + BUG_ON(direction == DMA_NONE); + + if (!tbl) + return; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + while (nelems--) { + unsigned int npages; + dma_addr_t dma_handle = sglist->dma_address; + + if (sglist->dma_length == 0) + break; + npages = (PAGE_ALIGN(dma_handle + sglist->dma_length) + - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT; + __iommu_free(tbl, dma_handle, npages); + sglist++; + } + + /* Flush/invalidate TLBs if necessary. As for iommu_free(), we + * do not do an mb() here, the affected platforms do not need it + * when freeing. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); +} + +/* + * Build a iommu_table structure. This contains a bit map which + * is used to manage allocation of the tce space. + */ +struct iommu_table *iommu_init_table(struct iommu_table *tbl) +{ + unsigned long sz; + static int welcomed = 0; + + /* Set aside 1/4 of the table for large allocations. */ + tbl->it_halfpoint = tbl->it_size * 3 / 4; + + /* number of bytes needed for the bitmap */ + sz = (tbl->it_size + 7) >> 3; + + tbl->it_map = (unsigned long *)__get_free_pages(GFP_ATOMIC, get_order(sz)); + if (!tbl->it_map) + panic("iommu_init_table: Can't allocate %ld bytes\n", sz); + + memset(tbl->it_map, 0, sz); + + tbl->it_hint = 0; + tbl->it_largehint = tbl->it_halfpoint; + spin_lock_init(&tbl->it_lock); + + /* Clear the hardware table in case firmware left allocations in it */ + ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); + + if (!welcomed) { + printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n", + novmerge ? "disabled" : "enabled"); + welcomed = 1; + } + + return tbl; +} + +void iommu_free_table(struct device_node *dn) +{ + struct pci_dn *pdn = dn->data; + struct iommu_table *tbl = pdn->iommu_table; + unsigned long bitmap_sz, i; + unsigned int order; + + if (!tbl || !tbl->it_map) { + printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__, + dn->full_name); + return; + } + + /* verify that table contains no entries */ + /* it_size is in entries, and we're examining 64 at a time */ + for (i = 0; i < (tbl->it_size/64); i++) { + if (tbl->it_map[i] != 0) { + printk(KERN_WARNING "%s: Unexpected TCEs for %s\n", + __FUNCTION__, dn->full_name); + break; + } + } + + /* calculate bitmap size in bytes */ + bitmap_sz = (tbl->it_size + 7) / 8; + + /* free bitmap */ + order = get_order(bitmap_sz); + free_pages((unsigned long) tbl->it_map, order); + + /* free table */ + kfree(tbl); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address of the buffer + * passed here is the kernel (virtual) address of the buffer. The buffer + * need not be page aligned, the dma_addr_t returned will point to the same + * byte within the page as vaddr. + */ +dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, + size_t size, enum dma_data_direction direction) +{ + dma_addr_t dma_handle = DMA_ERROR_CODE; + unsigned long uaddr; + unsigned int npages; + + BUG_ON(direction == DMA_NONE); + + uaddr = (unsigned long)vaddr; + npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); + npages >>= PAGE_SHIFT; + + if (tbl) { + dma_handle = iommu_alloc(tbl, vaddr, npages, direction, 0); + if (dma_handle == DMA_ERROR_CODE) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_alloc failed, " + "tbl %p vaddr %p npages %d\n", + tbl, vaddr, npages); + } + } else + dma_handle |= (uaddr & ~PAGE_MASK); + } + + return dma_handle; +} + +void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + + if (tbl) + iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) - + (dma_handle & PAGE_MASK)) >> PAGE_SHIFT); +} + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. + */ +void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* + * Client asked for way too much space. This is checked later + * anyway. It is easier to debug here for the drivers than in + * the tce tables. + */ + if (order >= IOMAP_MAX_ORDER) { + printk("iommu_alloc_consistent size too large: 0x%lx\n", size); + return NULL; + } + + if (!tbl) + return NULL; + + /* Alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + return NULL; + memset(ret, 0, size); + + /* Set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL, order); + if (mapping == DMA_ERROR_CODE) { + free_pages((unsigned long)ret, order); + ret = NULL; + } else + *dma_handle = mapping; + return ret; +} + +void iommu_free_coherent(struct iommu_table *tbl, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + unsigned int npages; + + if (tbl) { + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); + } +} diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4b7940693f3..5a71ed9612f 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -71,6 +71,11 @@ #include <asm/paca.h> #endif +int __irq_offset_value; +#ifdef CONFIG_PPC32 +EXPORT_SYMBOL(__irq_offset_value); +#endif + static int ppc_spurious_interrupts; #if defined(CONFIG_PPC_ISERIES) && defined(CONFIG_SMP) @@ -98,7 +103,6 @@ extern atomic_t ipi_sent; EXPORT_SYMBOL(irq_desc); int distribute_irqs = 1; -int __irq_offset_value; u64 ppc64_interrupt_controller; #endif /* CONFIG_PPC64 */ @@ -311,7 +315,6 @@ void __init init_IRQ(void) } #ifdef CONFIG_PPC64 -#ifndef CONFIG_PPC_ISERIES /* * Virtual IRQ mapping code, used on systems with XICS interrupt controllers. */ @@ -420,8 +423,6 @@ unsigned int real_irq_to_virt_slowpath(unsigned int real_irq) } -#endif /* CONFIG_PPC_ISERIES */ - #ifdef CONFIG_IRQSTACKS struct thread_info *softirq_ctx[NR_CPUS]; struct thread_info *hardirq_ctx[NR_CPUS]; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c new file mode 100644 index 00000000000..511af54e623 --- /dev/null +++ b/arch/powerpc/kernel/kprobes.c @@ -0,0 +1,459 @@ +/* + * Kernel Probes (KProbes) + * arch/ppc64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Nov Ananth N Mavinakayanahalli <ananth@in.ibm.com> kprobes port + * for PPC64 + */ + +#include <linux/config.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <asm/cacheflush.h> +#include <asm/kdebug.h> +#include <asm/sstep.h> + +static DECLARE_MUTEX(kprobe_mutex); +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + int ret = 0; + kprobe_opcode_t insn = *p->addr; + + if ((unsigned long)p->addr & 0x03) { + printk("Attempt to register kprobe at an unaligned address\n"); + ret = -EINVAL; + } else if (IS_MTMSRD(insn) || IS_RFID(insn)) { + printk("Cannot register a kprobe on rfid or mtmsrd\n"); + ret = -EINVAL; + } + + /* insn must be on a special executable page on ppc64 */ + if (!ret) { + down(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + up(&kprobe_mutex); + if (!p->ainsn.insn) + ret = -ENOMEM; + } + return ret; +} + +void __kprobes arch_copy_kprobe(struct kprobe *p) +{ + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + down(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + up(&kprobe_mutex); +} + +static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + kprobe_opcode_t insn = *p->ainsn.insn; + + regs->msr |= MSR_SE; + + /* single step inline if it is a trap variant */ + if (is_trap(insn)) + regs->nip = (unsigned long)p->addr; + else + regs->nip = (unsigned long)p->ainsn.insn; +} + +static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr; +} + +static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_msr = regs->msr; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = (unsigned long)kretprobe_trampoline; + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + +static inline int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + unsigned int *addr = (unsigned int *)regs->nip; + struct kprobe_ctlblk *kcb; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + kprobe_opcode_t insn = *p->ainsn.insn; + if (kcb->kprobe_status == KPROBE_HIT_SS && + is_trap(insn)) { + regs->msr &= ~MSR_SE; + regs->msr |= kcb->kprobe_saved_msr; + goto no_kprobe; + } + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_saved_msr = regs->msr; + p->nmissed++; + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } else { + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + goto no_kprobe; + } + + p = get_kprobe(addr); + if (!p) { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * PowerPC has multiple variants of the "trap" + * instruction. If the current instruction is a + * trap variant, it could belong to someone else + */ + kprobe_opcode_t cur_insn = *addr; + if (is_trap(cur_insn)) + goto no_kprobe; + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + set_current_kprobe(p, regs, kcb); + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe + * causes the handlers to fire + */ +void kretprobe_trampoline_holder(void) +{ + asm volatile(".global kretprobe_trampoline\n" + "kretprobe_trampoline:\n" + "nop\n"); +} + +/* + * Called when the probe at kretprobe trampoline is hit + */ +int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->nip = orig_ret_address; + + reset_current_kprobe(); + spin_unlock_irqrestore(&kretprobe_lock, flags); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "breakpoint" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + */ +static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + int ret; + unsigned int insn = *p->ainsn.insn; + + regs->nip = (unsigned long)p->addr; + ret = emulate_step(regs, insn); + if (ret == 0) + regs->nip = (unsigned long)p->addr + 4; +} + +static inline int post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs); + regs->msr |= kcb->kprobe_saved_msr; + + /*Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, msr + * will have SE set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->msr & MSR_SE) + return 0; + + return 1; +} + +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + if (kcb->kprobe_status & KPROBE_HIT_SS) { + resume_execution(cur, regs); + regs->msr &= ~MSR_SE; + regs->msr |= kcb->kprobe_saved_msr; + + reset_current_kprobe(); + preempt_enable_no_resched(); + } + return 0; +} + +/* + * Wrapper routine to for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + switch (val) { + case DIE_BPT: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_SSTEP: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + /* kprobe_running() needs smp_processor_id() */ + preempt_disable(); + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + preempt_enable(); + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs)); + + /* setup return addr to the jprobe handler routine */ + regs->nip = (unsigned long)(((func_descr_t *)jp->entry)->entry); + regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); + + return 1; +} + +void __kprobes jprobe_return(void) +{ + asm volatile("trap" ::: "memory"); +} + +void __kprobes jprobe_return_end(void) +{ +}; + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + /* + * FIXME - we should ideally be validating that we got here 'cos + * of the "trap" in jprobe_return() above, before restoring the + * saved regs... + */ + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); + preempt_enable_no_resched(); + return 1; +} + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index 1b3ba8a440a..9dda16ccde7 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -42,32 +42,6 @@ /* #define LPARCFG_DEBUG */ -/* find a better place for this function... */ -static void log_plpar_hcall_return(unsigned long rc, char *tag) -{ - if (rc == 0) /* success, return */ - return; -/* check for null tag ? */ - if (rc == H_Hardware) - printk(KERN_INFO - "plpar-hcall (%s) failed with hardware fault\n", tag); - else if (rc == H_Function) - printk(KERN_INFO - "plpar-hcall (%s) failed; function not allowed\n", tag); - else if (rc == H_Authority) - printk(KERN_INFO - "plpar-hcall (%s) failed; not authorized to this function\n", - tag); - else if (rc == H_Parameter) - printk(KERN_INFO "plpar-hcall (%s) failed; Bad parameter(s)\n", - tag); - else - printk(KERN_INFO - "plpar-hcall (%s) failed with unexpected rc(0x%lx)\n", - tag, rc); - -} - static struct proc_dir_entry *proc_ppc64_lparcfg; #define LPARCFG_BUFF_SIZE 4096 @@ -172,6 +146,31 @@ static int lparcfg_data(struct seq_file *m, void *v) /* * Methods used to fetch LPAR data when running on a pSeries platform. */ +/* find a better place for this function... */ +static void log_plpar_hcall_return(unsigned long rc, char *tag) +{ + if (rc == 0) /* success, return */ + return; +/* check for null tag ? */ + if (rc == H_Hardware) + printk(KERN_INFO + "plpar-hcall (%s) failed with hardware fault\n", tag); + else if (rc == H_Function) + printk(KERN_INFO + "plpar-hcall (%s) failed; function not allowed\n", tag); + else if (rc == H_Authority) + printk(KERN_INFO + "plpar-hcall (%s) failed; not authorized to this function\n", + tag); + else if (rc == H_Parameter) + printk(KERN_INFO "plpar-hcall (%s) failed; Bad parameter(s)\n", + tag); + else + printk(KERN_INFO + "plpar-hcall (%s) failed with unexpected rc(0x%lx)\n", + tag, rc); + +} /* * H_GET_PPP hcall returns info in 4 parms. diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c new file mode 100644 index 00000000000..97c51e452be --- /dev/null +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -0,0 +1,358 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * + * Copyright (C) 2004-2005, IBM Corp. + * + * Created by: Milton D Miller II + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + +#include <linux/cpumask.h> +#include <linux/kexec.h> +#include <linux/smp.h> +#include <linux/thread_info.h> +#include <linux/errno.h> + +#include <asm/page.h> +#include <asm/current.h> +#include <asm/machdep.h> +#include <asm/cacheflush.h> +#include <asm/paca.h> +#include <asm/mmu.h> +#include <asm/sections.h> /* _end */ +#include <asm/prom.h> +#include <asm/smp.h> + +#define HASH_GROUP_SIZE 0x80 /* size of each hash group, asm/mmu.h */ + +/* Have this around till we move it into crash specific file */ +note_buf_t crash_notes[NR_CPUS]; + +/* Dummy for now. Not sure if we need to have a crash shutdown in here + * and if what it will achieve. Letting it be now to compile the code + * in generic kexec environment + */ +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* do nothing right now */ + /* smp_relase_cpus() if we want smp on panic kernel */ + /* cpu_irq_down to isolate us until we are ready */ +} + +int machine_kexec_prepare(struct kimage *image) +{ + int i; + unsigned long begin, end; /* limits of segment */ + unsigned long low, high; /* limits of blocked memory range */ + struct device_node *node; + unsigned long *basep; + unsigned int *sizep; + + if (!ppc_md.hpte_clear_all) + return -ENOENT; + + /* + * Since we use the kernel fault handlers and paging code to + * handle the virtual mode, we must make sure no destination + * overlaps kernel static data or bss. + */ + for (i = 0; i < image->nr_segments; i++) + if (image->segment[i].mem < __pa(_end)) + return -ETXTBSY; + + /* + * For non-LPAR, we absolutely can not overwrite the mmu hash + * table, since we are still using the bolted entries in it to + * do the copy. Check that here. + * + * It is safe if the end is below the start of the blocked + * region (end <= low), or if the beginning is after the + * end of the blocked region (begin >= high). Use the + * boolean identity !(a || b) === (!a && !b). + */ + if (htab_address) { + low = __pa(htab_address); + high = low + (htab_hash_mask + 1) * HASH_GROUP_SIZE; + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + /* We also should not overwrite the tce tables */ + for (node = of_find_node_by_type(NULL, "pci"); node != NULL; + node = of_find_node_by_type(node, "pci")) { + basep = (unsigned long *)get_property(node, "linux,tce-base", + NULL); + sizep = (unsigned int *)get_property(node, "linux,tce-size", + NULL); + if (basep == NULL || sizep == NULL) + continue; + + low = *basep; + high = low + (*sizep); + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + /* we do nothing in prepare that needs to be undone */ +} + +#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) + +static void copy_segments(unsigned long ind) +{ + unsigned long entry; + unsigned long *ptr; + void *dest; + void *addr; + + /* + * We rely on kexec_load to create a lists that properly + * initializes these pointers before they are used. + * We will still crash if the list is wrong, but at least + * the compiler will be quiet. + */ + ptr = NULL; + dest = NULL; + + for (entry = ind; !(entry & IND_DONE); entry = *ptr++) { + addr = __va(entry & PAGE_MASK); + + switch (entry & IND_FLAGS) { + case IND_DESTINATION: + dest = addr; + break; + case IND_INDIRECTION: + ptr = addr; + break; + case IND_SOURCE: + copy_page(dest, addr); + dest += PAGE_SIZE; + } + } +} + +void kexec_copy_flush(struct kimage *image) +{ + long i, nr_segments = image->nr_segments; + struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; + + /* save the ranges on the stack to efficiently flush the icache */ + memcpy(ranges, image->segment, sizeof(ranges)); + + /* + * After this call we may not use anything allocated in dynamic + * memory, including *image. + * + * Only globals and the stack are allowed. + */ + copy_segments(image->head); + + /* + * we need to clear the icache for all dest pages sometime, + * including ones that were in place on the original copy + */ + for (i = 0; i < nr_segments; i++) + flush_icache_range(ranges[i].mem + KERNELBASE, + ranges[i].mem + KERNELBASE + + ranges[i].memsz); +} + +#ifdef CONFIG_SMP + +/* FIXME: we should schedule this function to be called on all cpus based + * on calling the interrupts, but we would like to call it off irq level + * so that the interrupt controller is clean. + */ +void kexec_smp_down(void *arg) +{ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 1); + + local_irq_disable(); + kexec_smp_wait(); + /* NOTREACHED */ +} + +static void kexec_prepare_cpus(void) +{ + int my_cpu, i, notified=-1; + + smp_call_function(kexec_smp_down, NULL, 0, /* wait */0); + my_cpu = get_cpu(); + + /* check the others cpus are now down (via paca hw cpu id == -1) */ + for (i=0; i < NR_CPUS; i++) { + if (i == my_cpu) + continue; + + while (paca[i].hw_cpu_id != -1) { + barrier(); + if (!cpu_possible(i)) { + printk("kexec: cpu %d hw_cpu_id %d is not" + " possible, ignoring\n", + i, paca[i].hw_cpu_id); + break; + } + if (!cpu_online(i)) { + /* Fixme: this can be spinning in + * pSeries_secondary_wait with a paca + * waiting for it to go online. + */ + printk("kexec: cpu %d hw_cpu_id %d is not" + " online, ignoring\n", + i, paca[i].hw_cpu_id); + break; + } + if (i != notified) { + printk( "kexec: waiting for cpu %d (physical" + " %d) to go down\n", + i, paca[i].hw_cpu_id); + notified = i; + } + } + } + + /* after we tell the others to go down */ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + + put_cpu(); + + local_irq_disable(); +} + +#else /* ! SMP */ + +static void kexec_prepare_cpus(void) +{ + /* + * move the secondarys to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? + * + * We need to release the cpus if we are ever going from an + * UP to an SMP kernel. + */ + smp_release_cpus(); + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + local_irq_disable(); +} + +#endif /* SMP */ + +/* + * kexec thread structure and stack. + * + * We need to make sure that this is 16384-byte aligned due to the + * way process stacks are handled. It also must be statically allocated + * or allocated as part of the kimage, because everything else may be + * overwritten when we copy the kexec image. We piggyback on the + * "init_task" linker section here to statically allocate a stack. + * + * We could use a smaller stack if we don't care about anything using + * current, but that audit has not been performed. + */ +union thread_union kexec_stack + __attribute__((__section__(".data.init_task"))) = { }; + +/* Our assembly helper, in kexec_stub.S */ +extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) ATTRIB_NORET; + +/* too late to fail here */ +void machine_kexec(struct kimage *image) +{ + + /* prepare control code if any */ + + /* shutdown other cpus into our wait loop and quiesce interrupts */ + kexec_prepare_cpus(); + + /* switch to a staticly allocated stack. Based on irq stack code. + * XXX: the task struct will likely be invalid once we do the copy! + */ + kexec_stack.thread_info.task = current_thread_info()->task; + kexec_stack.thread_info.flags = 0; + + /* Some things are best done in assembly. Finding globals with + * a toc is easier in C, so pass in what we can. + */ + kexec_sequence(&kexec_stack, image->start, image, + page_address(image->control_code_page), + ppc_md.hpte_clear_all); + /* NOTREACHED */ +} + +/* Values we need to export to the second kernel via the device tree. */ +static unsigned long htab_base, htab_size, kernel_end; + +static struct property htab_base_prop = { + .name = "linux,htab-base", + .length = sizeof(unsigned long), + .value = (unsigned char *)&htab_base, +}; + +static struct property htab_size_prop = { + .name = "linux,htab-size", + .length = sizeof(unsigned long), + .value = (unsigned char *)&htab_size, +}; + +static struct property kernel_end_prop = { + .name = "linux,kernel-end", + .length = sizeof(unsigned long), + .value = (unsigned char *)&kernel_end, +}; + +static void __init export_htab_values(void) +{ + struct device_node *node; + + node = of_find_node_by_path("/chosen"); + if (!node) + return; + + kernel_end = __pa(_end); + prom_add_property(node, &kernel_end_prop); + + /* On machines with no htab htab_address is NULL */ + if (NULL == htab_address) + goto out; + + htab_base = __pa(htab_address); + prom_add_property(node, &htab_base_prop); + + htab_size = 1UL << ppc64_pft_size; + prom_add_property(node, &htab_size_prop); + + out: + of_node_put(node); +} + +void __init kexec_setup(void) +{ + export_htab_values(); +} diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c new file mode 100644 index 00000000000..928b8581fcb --- /dev/null +++ b/arch/powerpc/kernel/module_64.c @@ -0,0 +1,455 @@ +/* Kernel module help for PPC64. + Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/module.h> +#include <linux/elf.h> +#include <linux/moduleloader.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <asm/module.h> +#include <asm/uaccess.h> + +/* FIXME: We don't do .init separately. To do this, we'd need to have + a separate r2 value in the init and core section, and stub between + them, too. + + Using a magic allocator which places modules within 32MB solves + this, and makes other things simpler. Anton? + --RR. */ +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) +#endif + +/* There's actually a third entry here, but it's unused */ +struct ppc64_opd_entry +{ + unsigned long funcaddr; + unsigned long r2; +}; + +/* Like PPC32, we need little trampolines to do > 24-bit jumps (into + the kernel itself). But on PPC64, these need to be used for every + jump, actually, to reset r2 (TOC+0x8000). */ +struct ppc64_stub_entry +{ + /* 28 byte jump instruction sequence (7 instructions) */ + unsigned char jump[28]; + unsigned char unused[4]; + /* Data for the above code */ + struct ppc64_opd_entry opd; +}; + +/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external) + function which may be more than 24-bits away. We could simply + patch the new r2 value and function pointer into the stub, but it's + significantly shorter to put these values at the end of the stub + code, and patch the stub address (32-bits relative to the TOC ptr, + r2) into the stub. */ +static struct ppc64_stub_entry ppc64_stub = +{ .jump = { + 0x3d, 0x82, 0x00, 0x00, /* addis r12,r2, <high> */ + 0x39, 0x8c, 0x00, 0x00, /* addi r12,r12, <low> */ + /* Save current r2 value in magic place on the stack. */ + 0xf8, 0x41, 0x00, 0x28, /* std r2,40(r1) */ + 0xe9, 0x6c, 0x00, 0x20, /* ld r11,32(r12) */ + 0xe8, 0x4c, 0x00, 0x28, /* ld r2,40(r12) */ + 0x7d, 0x69, 0x03, 0xa6, /* mtctr r11 */ + 0x4e, 0x80, 0x04, 0x20 /* bctr */ +} }; + +/* Count how many different 24-bit relocations (different symbol, + different addend) */ +static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i, j, ret = 0; + + /* FIXME: Only count external ones --RR */ + /* Sure, this is order(n^2), but it's usually short, and not + time critical */ + for (i = 0; i < num; i++) { + /* Only count 24-bit relocs, others don't need stubs */ + if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24) + continue; + for (j = 0; j < i; j++) { + /* If this addend appeared before, it's + already been counted */ + if (rela[i].r_info == rela[j].r_info + && rela[i].r_addend == rela[j].r_addend) + break; + } + if (j == i) ret++; + } + return ret; +} + +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + + return vmalloc_exec(size); +} + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* Get size of potential trampolines required. */ +static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, + const Elf64_Shdr *sechdrs) +{ + /* One extra reloc so it's always 0-funcaddr terminated */ + unsigned long relocs = 1; + unsigned i; + + /* Every relocated section... */ + for (i = 1; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_RELA) { + DEBUGP("Found relocations in section %u\n", i); + DEBUGP("Ptr: %p. Number: %lu\n", + (void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size / sizeof(Elf64_Rela)); + relocs += count_relocs((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size + / sizeof(Elf64_Rela)); + } + } + + DEBUGP("Looks like a total of %lu stubs, max\n", relocs); + return relocs * sizeof(struct ppc64_stub_entry); +} + +static void dedotify_versions(struct modversion_info *vers, + unsigned long size) +{ + struct modversion_info *end; + + for (end = (void *)vers + size; vers < end; vers++) + if (vers->name[0] == '.') + memmove(vers->name, vers->name+1, strlen(vers->name)); +} + +/* Undefined symbols which refer to .funcname, hack to funcname */ +static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) +{ + unsigned int i; + + for (i = 1; i < numsyms; i++) { + if (syms[i].st_shndx == SHN_UNDEF) { + char *name = strtab + syms[i].st_name; + if (name[0] == '.') + memmove(name, name+1, strlen(name)); + } + } +} + +int module_frob_arch_sections(Elf64_Ehdr *hdr, + Elf64_Shdr *sechdrs, + char *secstrings, + struct module *me) +{ + unsigned int i; + + /* Find .toc and .stubs sections, symtab and strtab */ + for (i = 1; i < hdr->e_shnum; i++) { + char *p; + if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0) + me->arch.stubs_section = i; + else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0) + me->arch.toc_section = i; + else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0) + dedotify_versions((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size); + + /* We don't handle .init for the moment: rename to _init */ + while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) + p[0] = '_'; + + if (sechdrs[i].sh_type == SHT_SYMTAB) + dedotify((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf64_Sym), + (void *)hdr + + sechdrs[sechdrs[i].sh_link].sh_offset); + } + if (!me->arch.stubs_section || !me->arch.toc_section) { + printk("%s: doesn't contain .toc or .stubs.\n", me->name); + return -ENOEXEC; + } + + /* Override the stubs size */ + sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs); + return 0; +} + +int apply_relocate(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "%s: Non-ADD RELOCATION unsupported\n", me->name); + return -ENOEXEC; +} + +/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this + gives the value maximum span in an instruction which uses a signed + offset) */ +static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me) +{ + return sechdrs[me->arch.toc_section].sh_addr + 0x8000; +} + +/* Both low and high 16 bits are added as SIGNED additions, so if low + 16 bits has high bit set, high 16 bits must be adjusted. These + macros do that (stolen from binutils). */ +#define PPC_LO(v) ((v) & 0xffff) +#define PPC_HI(v) (((v) >> 16) & 0xffff) +#define PPC_HA(v) PPC_HI ((v) + 0x8000) + +/* Patch stub to reference function and correct r2 value. */ +static inline int create_stub(Elf64_Shdr *sechdrs, + struct ppc64_stub_entry *entry, + struct ppc64_opd_entry *opd, + struct module *me) +{ + Elf64_Half *loc1, *loc2; + long reladdr; + + *entry = ppc64_stub; + + loc1 = (Elf64_Half *)&entry->jump[2]; + loc2 = (Elf64_Half *)&entry->jump[6]; + + /* Stub uses address relative to r2. */ + reladdr = (unsigned long)entry - my_r2(sechdrs, me); + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + printk("%s: Address %p of stub out of range of %p.\n", + me->name, (void *)reladdr, (void *)my_r2); + return 0; + } + DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr); + + *loc1 = PPC_HA(reladdr); + *loc2 = PPC_LO(reladdr); + entry->opd.funcaddr = opd->funcaddr; + entry->opd.r2 = opd->r2; + return 1; +} + +/* Create stub to jump to function described in this OPD: we need the + stub to set up the TOC ptr (r2) for the function. */ +static unsigned long stub_for_addr(Elf64_Shdr *sechdrs, + unsigned long opdaddr, + struct module *me) +{ + struct ppc64_stub_entry *stubs; + struct ppc64_opd_entry *opd = (void *)opdaddr; + unsigned int i, num_stubs; + + num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs); + + /* Find this stub, or if that fails, the next avail. entry */ + stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; + for (i = 0; stubs[i].opd.funcaddr; i++) { + BUG_ON(i >= num_stubs); + + if (stubs[i].opd.funcaddr == opd->funcaddr) + return (unsigned long)&stubs[i]; + } + + if (!create_stub(sechdrs, &stubs[i], opd, me)) + return 0; + + return (unsigned long)&stubs[i]; +} + +/* We expect a noop next: if it is, replace it with instruction to + restore r2. */ +static int restore_r2(u32 *instruction, struct module *me) +{ + if (*instruction != 0x60000000) { + printk("%s: Expect noop after relocate, got %08x\n", + me->name, *instruction); + return 0; + } + *instruction = 0xe8410028; /* ld r2,40(r1) */ + return 1; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + unsigned long *location; + unsigned long value; + + DEBUGP("Applying ADD relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rela[i].r_offset; + /* This is the symbol it is referring to */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rela[i].r_info); + + DEBUGP("RELOC at %p: %li-type as %s (%lu) + %li\n", + location, (long)ELF64_R_TYPE(rela[i].r_info), + strtab + sym->st_name, (unsigned long)sym->st_value, + (long)rela[i].r_addend); + + /* `Everything is relative'. */ + value = sym->st_value + rela[i].r_addend; + + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_PPC64_ADDR32: + /* Simply set it */ + *(u32 *)location = value; + break; + + case R_PPC64_ADDR64: + /* Simply set it */ + *(unsigned long *)location = value; + break; + + case R_PPC64_TOC: + *(unsigned long *)location = my_r2(sechdrs, me); + break; + + case R_PPC64_TOC16: + /* Subtact TOC pointer */ + value -= my_r2(sechdrs, me); + if (value + 0x8000 > 0xffff) { + printk("%s: bad TOC16 relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC64_TOC16_DS: + /* Subtact TOC pointer */ + value -= my_r2(sechdrs, me); + if ((value & 3) != 0 || value + 0x8000 > 0xffff) { + printk("%s: bad TOC16_DS relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xfffc) + | (value & 0xfffc); + break; + + case R_PPC_REL24: + /* FIXME: Handle weak symbols here --RR */ + if (sym->st_shndx == SHN_UNDEF) { + /* External: go via stub */ + value = stub_for_addr(sechdrs, value, me); + if (!value) + return -ENOENT; + if (!restore_r2((u32 *)location + 1, me)) + return -ENOEXEC; + } + + /* Convert value to relative */ + value -= (unsigned long)location; + if (value + 0x2000000 > 0x3ffffff || (value & 3) != 0){ + printk("%s: REL24 %li out of range!\n", + me->name, (long int)value); + return -ENOEXEC; + } + + /* Only replace bits 2 through 26 */ + *(uint32_t *)location + = (*(uint32_t *)location & ~0x03fffffc) + | (value & 0x03fffffc); + break; + + default: + printk("%s: Unknown ADD relocation: %lu\n", + me->name, + (unsigned long)ELF64_R_TYPE(rela[i].r_info)); + return -ENOEXEC; + } + } + + return 0; +} + +LIST_HEAD(module_bug_list); + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, struct module *me) +{ + char *secstrings; + unsigned int i; + + me->arch.bug_table = NULL; + me->arch.num_bugs = 0; + + /* Find the __bug_table section, if present */ + secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (i = 1; i < hdr->e_shnum; i++) { + if (strcmp(secstrings+sechdrs[i].sh_name, "__bug_table")) + continue; + me->arch.bug_table = (void *) sechdrs[i].sh_addr; + me->arch.num_bugs = sechdrs[i].sh_size / sizeof(struct bug_entry); + break; + } + + /* + * Strictly speaking this should have a spinlock to protect against + * traversals, but since we only traverse on BUG()s, a spinlock + * could potentially lead to deadlock and thus be counter-productive. + */ + list_add(&me->arch.bug_list, &module_bug_list); + + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ + list_del(&mod->arch.bug_list); +} + +struct bug_entry *module_find_bug(unsigned long bugaddr) +{ + struct mod_arch_specific *mod; + unsigned int i; + struct bug_entry *bug; + + list_for_each_entry(mod, &module_bug_list, bug_list) { + bug = mod->bug_table; + for (i = 0; i < mod->num_bugs; ++i, ++bug) + if (bugaddr == bug->bug_addr) + return bug; + } + return NULL; +} diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c new file mode 100644 index 00000000000..3cef1b8f57f --- /dev/null +++ b/arch/powerpc/kernel/pci_64.c @@ -0,0 +1,1319 @@ +/* + * Port for PPC64 David Engebretsen, IBM Corp. + * Contains common pci routines for ppc64 platform, pSeries and iSeries brands. + * + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * Rework, based on alpha PCI code. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/syscalls.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/byteorder.h> +#include <asm/irq.h> +#include <asm/machdep.h> +#include <asm/udbg.h> +#include <asm/ppc-pci.h> + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +unsigned long pci_probe_only = 1; +unsigned long pci_assign_all_buses = 0; + +/* + * legal IO pages under MAX_ISA_PORT. This is to ensure we don't touch + * devices we don't have access to. + */ +unsigned long io_page_mask; + +EXPORT_SYMBOL(io_page_mask); + +#ifdef CONFIG_PPC_MULTIPLATFORM +static void fixup_resource(struct resource *res, struct pci_dev *dev); +static void do_bus_setup(struct pci_bus *bus); +#endif + +unsigned int pcibios_assign_all_busses(void) +{ + return pci_assign_all_buses; +} + +/* pci_io_base -- the base address from which io bars are offsets. + * This is the lowest I/O base address (so bar values are always positive), + * and it *must* be the start of ISA space if an ISA bus exists because + * ISA drivers use hard coded offsets. If no ISA bus exists a dummy + * page is mapped and isa_io_limit prevents access to it. + */ +unsigned long isa_io_base; /* NULL if no ISA bus */ +EXPORT_SYMBOL(isa_io_base); +unsigned long pci_io_base; +EXPORT_SYMBOL(pci_io_base); + +void iSeries_pcibios_init(void); + +LIST_HEAD(hose_list); + +struct dma_mapping_ops pci_dma_ops; +EXPORT_SYMBOL(pci_dma_ops); + +int global_phb_number; /* Global phb counter */ + +/* Cached ISA bridge dev. */ +struct pci_dev *ppc64_isabridge_dev = NULL; + +static void fixup_broken_pcnet32(struct pci_dev* dev) +{ + if ((dev->class>>8 == PCI_CLASS_NETWORK_ETHERNET)) { + dev->vendor = PCI_VENDOR_ID_AMD; + pci_write_config_word(dev, PCI_VENDOR_ID, PCI_VENDOR_ID_AMD); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TRIDENT, PCI_ANY_ID, fixup_broken_pcnet32); + +void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, + struct resource *res) +{ + unsigned long offset = 0; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + if (!hose) + return; + + if (res->flags & IORESOURCE_IO) + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + if (res->flags & IORESOURCE_MEM) + offset = hose->pci_mem_offset; + + region->start = res->start - offset; + region->end = res->end - offset; +} + +void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region) +{ + unsigned long offset = 0; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + if (!hose) + return; + + if (res->flags & IORESOURCE_IO) + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + if (res->flags & IORESOURCE_MEM) + offset = hose->pci_mem_offset; + + res->start = region->start + offset; + res->end = region->end + offset; +} + +#ifdef CONFIG_HOTPLUG +EXPORT_SYMBOL(pcibios_resource_to_bus); +EXPORT_SYMBOL(pcibios_bus_to_resource); +#endif + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + * + * Why? Because some silly external IO cards only decode + * the low 10 bits of the IO address. The 0x00-0xff region + * is reserved for motherboard devices that decode all 16 + * bits, so it's ok to allocate at, say, 0x2800-0x28ff, + * but we want to try to avoid allocating at 0x2900-0x2bff + * which might have be mirrored at 0x0100-0x03ff.. + */ +void pcibios_align_resource(void *data, struct resource *res, + unsigned long size, unsigned long align) +{ + struct pci_dev *dev = data; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long start = res->start; + unsigned long alignto; + + if (res->flags & IORESOURCE_IO) { + unsigned long offset = (unsigned long)hose->io_base_virt - + pci_io_base; + /* Make sure we start at our min on all hoses */ + if (start - offset < PCIBIOS_MIN_IO) + start = PCIBIOS_MIN_IO + offset; + + /* + * Put everything into 0x00-0xff region modulo 0x400 + */ + if (start & 0x300) + start = (start + 0x3ff) & ~0x3ff; + + } else if (res->flags & IORESOURCE_MEM) { + /* Make sure we start at our min on all hoses */ + if (start - hose->pci_mem_offset < PCIBIOS_MIN_MEM) + start = PCIBIOS_MIN_MEM + hose->pci_mem_offset; + + /* Align to multiple of size of minimum base. */ + alignto = max(0x1000UL, align); + start = ALIGN(start, alignto); + } + + res->start = start; +} + +static DEFINE_SPINLOCK(hose_spinlock); + +/* + * pci_controller(phb) initialized common variables. + */ +void __devinit pci_setup_pci_controller(struct pci_controller *hose) +{ + memset(hose, 0, sizeof(struct pci_controller)); + + spin_lock(&hose_spinlock); + hose->global_number = global_phb_number++; + list_add_tail(&hose->list_node, &hose_list); + spin_unlock(&hose_spinlock); +} + +static void __init pcibios_claim_one_bus(struct pci_bus *b) +{ + struct pci_dev *dev; + struct pci_bus *child_bus; + + list_for_each_entry(dev, &b->devices, bus_list) { + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (r->parent || !r->start || !r->flags) + continue; + pci_claim_resource(dev, i); + } + } + + list_for_each_entry(child_bus, &b->children, node) + pcibios_claim_one_bus(child_bus); +} + +#ifndef CONFIG_PPC_ISERIES +static void __init pcibios_claim_of_setup(void) +{ + struct pci_bus *b; + + list_for_each_entry(b, &pci_root_buses, node) + pcibios_claim_one_bus(b); +} +#endif + +#ifdef CONFIG_PPC_MULTIPLATFORM +static u32 get_int_prop(struct device_node *np, const char *name, u32 def) +{ + u32 *prop; + int len; + + prop = (u32 *) get_property(np, name, &len); + if (prop && len >= 4) + return *prop; + return def; +} + +static unsigned int pci_parse_of_flags(u32 addr0) +{ + unsigned int flags = 0; + + if (addr0 & 0x02000000) { + flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; + flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; + flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x40000000) + flags |= IORESOURCE_PREFETCH + | PCI_BASE_ADDRESS_MEM_PREFETCH; + } else if (addr0 & 0x01000000) + flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + return flags; +} + +#define GET_64BIT(prop, i) ((((u64) (prop)[(i)]) << 32) | (prop)[(i)+1]) + +static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev) +{ + u64 base, size; + unsigned int flags; + struct resource *res; + u32 *addrs, i; + int proplen; + + addrs = (u32 *) get_property(node, "assigned-addresses", &proplen); + if (!addrs) + return; + for (; proplen >= 20; proplen -= 20, addrs += 5) { + flags = pci_parse_of_flags(addrs[0]); + if (!flags) + continue; + base = GET_64BIT(addrs, 1); + size = GET_64BIT(addrs, 3); + if (!size) + continue; + i = addrs[0] & 0xff; + if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { + res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; + } else if (i == dev->rom_base_reg) { + res = &dev->resource[PCI_ROM_RESOURCE]; + flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + } else { + printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); + continue; + } + res->start = base; + res->end = base + size - 1; + res->flags = flags; + res->name = pci_name(dev); + fixup_resource(res, dev); + } +} + +struct pci_dev *of_create_pci_dev(struct device_node *node, + struct pci_bus *bus, int devfn) +{ + struct pci_dev *dev; + const char *type; + + dev = kmalloc(sizeof(struct pci_dev), GFP_KERNEL); + if (!dev) + return NULL; + type = get_property(node, "device_type", NULL); + if (type == NULL) + type = ""; + + memset(dev, 0, sizeof(struct pci_dev)); + dev->bus = bus; + dev->sysdata = node; + dev->dev.parent = bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->devfn = devfn; + dev->multifunction = 0; /* maybe a lie? */ + + dev->vendor = get_int_prop(node, "vendor-id", 0xffff); + dev->device = get_int_prop(node, "device-id", 0xffff); + dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); + dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); + + dev->cfg_size = 256; /*pci_cfg_space_size(dev);*/ + + sprintf(pci_name(dev), "%04x:%02x:%02x.%d", pci_domain_nr(bus), + dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); + dev->class = get_int_prop(node, "class-code", 0); + + dev->current_state = 4; /* unknown power state */ + + if (!strcmp(type, "pci")) { + /* a PCI-PCI bridge */ + dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; + dev->rom_base_reg = PCI_ROM_ADDRESS1; + } else if (!strcmp(type, "cardbus")) { + dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; + } else { + dev->hdr_type = PCI_HEADER_TYPE_NORMAL; + dev->rom_base_reg = PCI_ROM_ADDRESS; + dev->irq = NO_IRQ; + if (node->n_intrs > 0) { + dev->irq = node->intrs[0].line; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, + dev->irq); + } + } + + pci_parse_of_addrs(node, dev); + + pci_device_add(dev, bus); + + /* XXX pci_scan_msi_device(dev); */ + + return dev; +} +EXPORT_SYMBOL(of_create_pci_dev); + +void __devinit of_scan_bus(struct device_node *node, + struct pci_bus *bus) +{ + struct device_node *child = NULL; + u32 *reg; + int reglen, devfn; + struct pci_dev *dev; + + while ((child = of_get_next_child(node, child)) != NULL) { + reg = (u32 *) get_property(child, "reg", ®len); + if (reg == NULL || reglen < 20) + continue; + devfn = (reg[0] >> 8) & 0xff; + /* create a new pci_dev for this device */ + dev = of_create_pci_dev(child, bus, devfn); + if (!dev) + continue; + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + of_scan_pci_bridge(child, dev); + } + + do_bus_setup(bus); +} +EXPORT_SYMBOL(of_scan_bus); + +void __devinit of_scan_pci_bridge(struct device_node *node, + struct pci_dev *dev) +{ + struct pci_bus *bus; + u32 *busrange, *ranges; + int len, i, mode; + struct resource *res; + unsigned int flags; + u64 size; + + /* parse bus-range property */ + busrange = (u32 *) get_property(node, "bus-range", &len); + if (busrange == NULL || len != 8) { + printk(KERN_ERR "Can't get bus-range for PCI-PCI bridge %s\n", + node->full_name); + return; + } + ranges = (u32 *) get_property(node, "ranges", &len); + if (ranges == NULL) { + printk(KERN_ERR "Can't get ranges for PCI-PCI bridge %s\n", + node->full_name); + return; + } + + bus = pci_add_new_bus(dev->bus, dev, busrange[0]); + if (!bus) { + printk(KERN_ERR "Failed to create pci bus for %s\n", + node->full_name); + return; + } + + bus->primary = dev->bus->number; + bus->subordinate = busrange[1]; + bus->bridge_ctl = 0; + bus->sysdata = node; + + /* parse ranges property */ + /* PCI #address-cells == 3 and #size-cells == 2 always */ + res = &dev->resource[PCI_BRIDGE_RESOURCES]; + for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { + res->flags = 0; + bus->resource[i] = res; + ++res; + } + i = 1; + for (; len >= 32; len -= 32, ranges += 8) { + flags = pci_parse_of_flags(ranges[0]); + size = GET_64BIT(ranges, 6); + if (flags == 0 || size == 0) + continue; + if (flags & IORESOURCE_IO) { + res = bus->resource[0]; + if (res->flags) { + printk(KERN_ERR "PCI: ignoring extra I/O range" + " for bridge %s\n", node->full_name); + continue; + } + } else { + if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { + printk(KERN_ERR "PCI: too many memory ranges" + " for bridge %s\n", node->full_name); + continue; + } + res = bus->resource[i]; + ++i; + } + res->start = GET_64BIT(ranges, 1); + res->end = res->start + size - 1; + res->flags = flags; + fixup_resource(res, dev); + } + sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), + bus->number); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + if (mode == PCI_PROBE_DEVTREE) + of_scan_bus(node, bus); + else if (mode == PCI_PROBE_NORMAL) + pci_scan_child_bus(bus); +} +EXPORT_SYMBOL(of_scan_pci_bridge); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + +void __devinit scan_phb(struct pci_controller *hose) +{ + struct pci_bus *bus; + struct device_node *node = hose->arch_data; + int i, mode; + struct resource *res; + + bus = pci_create_bus(NULL, hose->first_busno, hose->ops, node); + if (bus == NULL) { + printk(KERN_ERR "Failed to create bus for PCI domain %04x\n", + hose->global_number); + return; + } + bus->secondary = hose->first_busno; + hose->bus = bus; + + bus->resource[0] = res = &hose->io_resource; + if (res->flags && request_resource(&ioport_resource, res)) + printk(KERN_ERR "Failed to request PCI IO region " + "on PCI domain %04x\n", hose->global_number); + + for (i = 0; i < 3; ++i) { + res = &hose->mem_resources[i]; + bus->resource[i+1] = res; + if (res->flags && request_resource(&iomem_resource, res)) + printk(KERN_ERR "Failed to request PCI memory region " + "on PCI domain %04x\n", hose->global_number); + } + + mode = PCI_PROBE_NORMAL; +#ifdef CONFIG_PPC_MULTIPLATFORM + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + if (mode == PCI_PROBE_DEVTREE) { + bus->subordinate = hose->last_busno; + of_scan_bus(node, bus); + } +#endif /* CONFIG_PPC_MULTIPLATFORM */ + if (mode == PCI_PROBE_NORMAL) + hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); + pci_bus_add_devices(bus); +} + +static int __init pcibios_init(void) +{ + struct pci_controller *hose, *tmp; + + /* For now, override phys_mem_access_prot. If we need it, + * later, we may move that initialization to each ppc_md + */ + ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot; + +#ifdef CONFIG_PPC_ISERIES + iSeries_pcibios_init(); +#endif + + printk("PCI: Probing PCI hardware\n"); + + /* Scan all of the recorded PCI controllers. */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + scan_phb(hose); + +#ifndef CONFIG_PPC_ISERIES + if (pci_probe_only) + pcibios_claim_of_setup(); + else + /* FIXME: `else' will be removed when + pci_assign_unassigned_resources() is able to work + correctly with [partially] allocated PCI tree. */ + pci_assign_unassigned_resources(); +#endif /* !CONFIG_PPC_ISERIES */ + + /* Call machine dependent final fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + + /* Cache the location of the ISA bridge (if we have one) */ + ppc64_isabridge_dev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); + if (ppc64_isabridge_dev != NULL) + printk("ISA bridge at %s\n", pci_name(ppc64_isabridge_dev)); + +#ifdef CONFIG_PPC_MULTIPLATFORM + /* map in PCI I/O space */ + phbs_remap_io(); +#endif + + printk("PCI: Probing PCI hardware done\n"); + + return 0; +} + +subsys_initcall(pcibios_init); + +char __init *pcibios_setup(char *str) +{ + return str; +} + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + u16 cmd, oldcmd; + int i; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + oldcmd = cmd; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &dev->resource[i]; + + /* Only set up the requested stuff */ + if (!(mask & (1<<i))) + continue; + + if (res->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (res->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + + if (cmd != oldcmd) { + printk(KERN_DEBUG "PCI: Enabling device: (%s), cmd %x\n", + pci_name(dev), cmd); + /* Enable the appropriate bits in the PCI command register. */ + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + return 0; +} + +/* + * Return the domain number for this bus. + */ +int pci_domain_nr(struct pci_bus *bus) +{ +#ifdef CONFIG_PPC_ISERIES + return 0; +#else + struct pci_controller *hose = pci_bus_to_host(bus); + + return hose->global_number; +#endif +} + +EXPORT_SYMBOL(pci_domain_nr); + +/* Decide whether to display the domain number in /proc */ +int pci_proc_domain(struct pci_bus *bus) +{ +#ifdef CONFIG_PPC_ISERIES + return 0; +#else + struct pci_controller *hose = pci_bus_to_host(bus); + return hose->buid; +#endif +} + +/* + * Platform support for /proc/bus/pci/X/Y mmap()s, + * modelled on the sparc64 implementation by Dave Miller. + * -- paulus. + */ + +/* + * Adjust vm_pgoff of VMA such that it is the physical page offset + * corresponding to the 32-bit pci bus offset for DEV requested by the user. + * + * Basically, the user finds the base address for his device which he wishes + * to mmap. They read the 32-bit value from the config space base register, + * add whatever PAGE_SIZE multiple offset they wish, and feed this into the + * offset parameter of mmap on /proc/bus/pci/XXX for that device. + * + * Returns negative error code on failure, zero on success. + */ +static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, + unsigned long *offset, + enum pci_mmap_state mmap_state) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long io_offset = 0; + int i, res_bit; + + if (hose == 0) + return NULL; /* should never happen */ + + /* If memory, add on the PCI bridge address offset */ + if (mmap_state == pci_mmap_mem) { + *offset += hose->pci_mem_offset; + res_bit = IORESOURCE_MEM; + } else { + io_offset = (unsigned long)hose->io_base_virt - pci_io_base; + *offset += io_offset; + res_bit = IORESOURCE_IO; + } + + /* + * Check that the offset requested corresponds to one of the + * resources of the device. + */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &dev->resource[i]; + int flags = rp->flags; + + /* treat ROM as memory (should be already) */ + if (i == PCI_ROM_RESOURCE) + flags |= IORESOURCE_MEM; + + /* Active and same type? */ + if ((flags & res_bit) == 0) + continue; + + /* In the range of this resource? */ + if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end) + continue; + + /* found it! construct the final physical address */ + if (mmap_state == pci_mmap_io) + *offset += hose->io_base_phys - io_offset; + return rp; + } + + return NULL; +} + +/* + * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci + * device mapping. + */ +static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, + pgprot_t protection, + enum pci_mmap_state mmap_state, + int write_combine) +{ + unsigned long prot = pgprot_val(protection); + + /* Write combine is always 0 on non-memory space mappings. On + * memory space, if the user didn't pass 1, we check for a + * "prefetchable" resource. This is a bit hackish, but we use + * this to workaround the inability of /sysfs to provide a write + * combine bit + */ + if (mmap_state != pci_mmap_mem) + write_combine = 0; + else if (write_combine == 0) { + if (rp->flags & IORESOURCE_PREFETCH) + write_combine = 1; + } + + /* XXX would be nice to have a way to ask for write-through */ + prot |= _PAGE_NO_CACHE; + if (write_combine) + prot &= ~_PAGE_GUARDED; + else + prot |= _PAGE_GUARDED; + + printk("PCI map for %s:%lx, prot: %lx\n", pci_name(dev), rp->start, + prot); + + return __pgprot(prot); +} + +/* + * This one is used by /dev/mem and fbdev who have no clue about the + * PCI device, it tries to find the PCI device first and calls the + * above routine + */ +pgprot_t pci_phys_mem_access_prot(struct file *file, + unsigned long pfn, + unsigned long size, + pgprot_t protection) +{ + struct pci_dev *pdev = NULL; + struct resource *found = NULL; + unsigned long prot = pgprot_val(protection); + unsigned long offset = pfn << PAGE_SHIFT; + int i; + + if (page_is_ram(pfn)) + return __pgprot(prot); + + prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; + + for_each_pci_dev(pdev) { + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &pdev->resource[i]; + int flags = rp->flags; + + /* Active and same type? */ + if ((flags & IORESOURCE_MEM) == 0) + continue; + /* In the range of this resource? */ + if (offset < (rp->start & PAGE_MASK) || + offset > rp->end) + continue; + found = rp; + break; + } + if (found) + break; + } + if (found) { + if (found->flags & IORESOURCE_PREFETCH) + prot &= ~_PAGE_GUARDED; + pci_dev_put(pdev); + } + + DBG("non-PCI map for %lx, prot: %lx\n", offset, prot); + + return __pgprot(prot); +} + + +/* + * Perform the actual remap of the pages for a PCI device mapping, as + * appropriate for this architecture. The region in the process to map + * is described by vm_start and vm_end members of VMA, the base physical + * address is found in vm_pgoff. + * The pci device structure is provided so that architectures may make mapping + * decisions on a per-device or per-bus basis. + * + * Returns a negative error code on failure, zero on success. + */ +int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + struct resource *rp; + int ret; + + rp = __pci_mmap_make_offset(dev, &offset, mmap_state); + if (rp == NULL) + return -EINVAL; + + vma->vm_pgoff = offset >> PAGE_SHIFT; + vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO; + vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp, + vma->vm_page_prot, + mmap_state, write_combine); + + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + + return ret; +} + +#ifdef CONFIG_PPC_MULTIPLATFORM +static ssize_t pci_show_devspec(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev; + struct device_node *np; + + pdev = to_pci_dev (dev); + np = pci_device_to_OF_node(pdev); + if (np == NULL || np->full_name == NULL) + return 0; + return sprintf(buf, "%s", np->full_name); +} +static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + +void pcibios_add_platform_entries(struct pci_dev *pdev) +{ +#ifdef CONFIG_PPC_MULTIPLATFORM + device_create_file(&pdev->dev, &dev_attr_devspec); +#endif /* CONFIG_PPC_MULTIPLATFORM */ +} + +#ifdef CONFIG_PPC_MULTIPLATFORM + +#define ISA_SPACE_MASK 0x1 +#define ISA_SPACE_IO 0x1 + +static void __devinit pci_process_ISA_OF_ranges(struct device_node *isa_node, + unsigned long phb_io_base_phys, + void __iomem * phb_io_base_virt) +{ + struct isa_range *range; + unsigned long pci_addr; + unsigned int isa_addr; + unsigned int size; + int rlen = 0; + + range = (struct isa_range *) get_property(isa_node, "ranges", &rlen); + if (range == NULL || (rlen < sizeof(struct isa_range))) { + printk(KERN_ERR "no ISA ranges or unexpected isa range size," + "mapping 64k\n"); + __ioremap_explicit(phb_io_base_phys, + (unsigned long)phb_io_base_virt, + 0x10000, _PAGE_NO_CACHE | _PAGE_GUARDED); + return; + } + + /* From "ISA Binding to 1275" + * The ranges property is laid out as an array of elements, + * each of which comprises: + * cells 0 - 1: an ISA address + * cells 2 - 4: a PCI address + * (size depending on dev->n_addr_cells) + * cell 5: the size of the range + */ + if ((range->isa_addr.a_hi && ISA_SPACE_MASK) == ISA_SPACE_IO) { + isa_addr = range->isa_addr.a_lo; + pci_addr = (unsigned long) range->pci_addr.a_mid << 32 | + range->pci_addr.a_lo; + + /* Assume these are both zero */ + if ((pci_addr != 0) || (isa_addr != 0)) { + printk(KERN_ERR "unexpected isa to pci mapping: %s\n", + __FUNCTION__); + return; + } + + size = PAGE_ALIGN(range->size); + + __ioremap_explicit(phb_io_base_phys, + (unsigned long) phb_io_base_virt, + size, _PAGE_NO_CACHE | _PAGE_GUARDED); + } +} + +void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose, + struct device_node *dev, int prim) +{ + unsigned int *ranges, pci_space; + unsigned long size; + int rlen = 0; + int memno = 0; + struct resource *res; + int np, na = prom_n_addr_cells(dev); + unsigned long pci_addr, cpu_phys_addr; + + np = na + 5; + + /* From "PCI Binding to 1275" + * The ranges property is laid out as an array of elements, + * each of which comprises: + * cells 0 - 2: a PCI address + * cells 3 or 3+4: a CPU physical address + * (size depending on dev->n_addr_cells) + * cells 4+5 or 5+6: the size of the range + */ + rlen = 0; + hose->io_base_phys = 0; + ranges = (unsigned int *) get_property(dev, "ranges", &rlen); + while ((rlen -= np * sizeof(unsigned int)) >= 0) { + res = NULL; + pci_space = ranges[0]; + pci_addr = ((unsigned long)ranges[1] << 32) | ranges[2]; + + cpu_phys_addr = ranges[3]; + if (na >= 2) + cpu_phys_addr = (cpu_phys_addr << 32) | ranges[4]; + + size = ((unsigned long)ranges[na+3] << 32) | ranges[na+4]; + ranges += np; + if (size == 0) + continue; + + /* Now consume following elements while they are contiguous */ + while (rlen >= np * sizeof(unsigned int)) { + unsigned long addr, phys; + + if (ranges[0] != pci_space) + break; + addr = ((unsigned long)ranges[1] << 32) | ranges[2]; + phys = ranges[3]; + if (na >= 2) + phys = (phys << 32) | ranges[4]; + if (addr != pci_addr + size || + phys != cpu_phys_addr + size) + break; + + size += ((unsigned long)ranges[na+3] << 32) + | ranges[na+4]; + ranges += np; + rlen -= np * sizeof(unsigned int); + } + + switch ((pci_space >> 24) & 0x3) { + case 1: /* I/O space */ + hose->io_base_phys = cpu_phys_addr; + hose->pci_io_size = size; + + res = &hose->io_resource; + res->flags = IORESOURCE_IO; + res->start = pci_addr; + DBG("phb%d: IO 0x%lx -> 0x%lx\n", hose->global_number, + res->start, res->start + size - 1); + break; + case 2: /* memory space */ + memno = 0; + while (memno < 3 && hose->mem_resources[memno].flags) + ++memno; + + if (memno == 0) + hose->pci_mem_offset = cpu_phys_addr - pci_addr; + if (memno < 3) { + res = &hose->mem_resources[memno]; + res->flags = IORESOURCE_MEM; + res->start = cpu_phys_addr; + DBG("phb%d: MEM 0x%lx -> 0x%lx\n", hose->global_number, + res->start, res->start + size - 1); + } + break; + } + if (res != NULL) { + res->name = dev->full_name; + res->end = res->start + size - 1; + res->parent = NULL; + res->sibling = NULL; + res->child = NULL; + } + } +} + +void __init pci_setup_phb_io(struct pci_controller *hose, int primary) +{ + unsigned long size = hose->pci_io_size; + unsigned long io_virt_offset; + struct resource *res; + struct device_node *isa_dn; + + hose->io_base_virt = reserve_phb_iospace(size); + DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n", + hose->global_number, hose->io_base_phys, + (unsigned long) hose->io_base_virt); + + if (primary) { + pci_io_base = (unsigned long)hose->io_base_virt; + isa_dn = of_find_node_by_type(NULL, "isa"); + if (isa_dn) { + isa_io_base = pci_io_base; + pci_process_ISA_OF_ranges(isa_dn, hose->io_base_phys, + hose->io_base_virt); + of_node_put(isa_dn); + /* Allow all IO */ + io_page_mask = -1; + } + } + + io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base; + res = &hose->io_resource; + res->start += io_virt_offset; + res->end += io_virt_offset; +} + +void __devinit pci_setup_phb_io_dynamic(struct pci_controller *hose, + int primary) +{ + unsigned long size = hose->pci_io_size; + unsigned long io_virt_offset; + struct resource *res; + + hose->io_base_virt = __ioremap(hose->io_base_phys, size, + _PAGE_NO_CACHE | _PAGE_GUARDED); + DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n", + hose->global_number, hose->io_base_phys, + (unsigned long) hose->io_base_virt); + + if (primary) + pci_io_base = (unsigned long)hose->io_base_virt; + + io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base; + res = &hose->io_resource; + res->start += io_virt_offset; + res->end += io_virt_offset; +} + + +static int get_bus_io_range(struct pci_bus *bus, unsigned long *start_phys, + unsigned long *start_virt, unsigned long *size) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pci_bus_region region; + struct resource *res; + + if (bus->self) { + res = bus->resource[0]; + pcibios_resource_to_bus(bus->self, ®ion, res); + *start_phys = hose->io_base_phys + region.start; + *start_virt = (unsigned long) hose->io_base_virt + + region.start; + if (region.end > region.start) + *size = region.end - region.start + 1; + else { + printk("%s(): unexpected region 0x%lx->0x%lx\n", + __FUNCTION__, region.start, region.end); + return 1; + } + + } else { + /* Root Bus */ + res = &hose->io_resource; + *start_phys = hose->io_base_phys; + *start_virt = (unsigned long) hose->io_base_virt; + if (res->end > res->start) + *size = res->end - res->start + 1; + else { + printk("%s(): unexpected region 0x%lx->0x%lx\n", + __FUNCTION__, res->start, res->end); + return 1; + } + } + + return 0; +} + +int unmap_bus_range(struct pci_bus *bus) +{ + unsigned long start_phys; + unsigned long start_virt; + unsigned long size; + + if (!bus) { + printk(KERN_ERR "%s() expected bus\n", __FUNCTION__); + return 1; + } + + if (get_bus_io_range(bus, &start_phys, &start_virt, &size)) + return 1; + if (iounmap_explicit((void __iomem *) start_virt, size)) + return 1; + + return 0; +} +EXPORT_SYMBOL(unmap_bus_range); + +int remap_bus_range(struct pci_bus *bus) +{ + unsigned long start_phys; + unsigned long start_virt; + unsigned long size; + + if (!bus) { + printk(KERN_ERR "%s() expected bus\n", __FUNCTION__); + return 1; + } + + + if (get_bus_io_range(bus, &start_phys, &start_virt, &size)) + return 1; + printk("mapping IO %lx -> %lx, size: %lx\n", start_phys, start_virt, size); + if (__ioremap_explicit(start_phys, start_virt, size, + _PAGE_NO_CACHE | _PAGE_GUARDED)) + return 1; + + return 0; +} +EXPORT_SYMBOL(remap_bus_range); + +void phbs_remap_io(void) +{ + struct pci_controller *hose, *tmp; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + remap_bus_range(hose->bus); +} + +/* + * ppc64 can have multifunction devices that do not respond to function 0. + * In this case we must scan all functions. + * XXX this can go now, we use the OF device tree in all the + * cases that caused problems. -- paulus + */ +int pcibios_scan_all_fns(struct pci_bus *bus, int devfn) +{ + return 0; +} + +static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long start, end, mask, offset; + + if (res->flags & IORESOURCE_IO) { + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + start = res->start += offset; + end = res->end += offset; + + /* Need to allow IO access to pages that are in the + ISA range */ + if (start < MAX_ISA_PORT) { + if (end > MAX_ISA_PORT) + end = MAX_ISA_PORT; + + start >>= PAGE_SHIFT; + end >>= PAGE_SHIFT; + + /* get the range of pages for the map */ + mask = ((1 << (end+1)) - 1) ^ ((1 << start) - 1); + io_page_mask |= mask; + } + } else if (res->flags & IORESOURCE_MEM) { + res->start += hose->pci_mem_offset; + res->end += hose->pci_mem_offset; + } +} + +void __devinit pcibios_fixup_device_resources(struct pci_dev *dev, + struct pci_bus *bus) +{ + /* Update device resources. */ + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) + if (dev->resource[i].flags) + fixup_resource(&dev->resource[i], dev); +} +EXPORT_SYMBOL(pcibios_fixup_device_resources); + +static void __devinit do_bus_setup(struct pci_bus *bus) +{ + struct pci_dev *dev; + + ppc_md.iommu_bus_setup(bus); + + list_for_each_entry(dev, &bus->devices, bus_list) + ppc_md.iommu_dev_setup(dev); + + if (ppc_md.irq_bus_setup) + ppc_md.irq_bus_setup(bus); +} + +void __devinit pcibios_fixup_bus(struct pci_bus *bus) +{ + struct pci_dev *dev = bus->self; + + if (dev && pci_probe_only && + (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + /* This is a subordinate bridge */ + + pci_read_bridge_bases(bus); + pcibios_fixup_device_resources(dev, bus); + } + + do_bus_setup(bus); + + if (!pci_probe_only) + return; + + list_for_each_entry(dev, &bus->devices, bus_list) + if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI) + pcibios_fixup_device_resources(dev, bus); +} +EXPORT_SYMBOL(pcibios_fixup_bus); + +/* + * Reads the interrupt pin to determine if interrupt is use by card. + * If the interrupt is used, then gets the interrupt line from the + * openfirmware and sets it in the pci_dev and pci_config line. + */ +int pci_read_irq_line(struct pci_dev *pci_dev) +{ + u8 intpin; + struct device_node *node; + + pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &intpin); + if (intpin == 0) + return 0; + + node = pci_device_to_OF_node(pci_dev); + if (node == NULL) + return -1; + + if (node->n_intrs == 0) + return -1; + + pci_dev->irq = node->intrs[0].line; + + pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, pci_dev->irq); + + return 0; +} +EXPORT_SYMBOL(pci_read_irq_line); + +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + u64 *start, u64 *end) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long offset = 0; + + if (hose == NULL) + return; + + if (rsrc->flags & IORESOURCE_IO) + offset = pci_io_base - (unsigned long)hose->io_base_virt + + hose->io_base_phys; + + *start = rsrc->start + offset; + *end = rsrc->end + offset; +} + +#endif /* CONFIG_PPC_MULTIPLATFORM */ + + +#define IOBASE_BRIDGE_NUMBER 0 +#define IOBASE_MEMORY 1 +#define IOBASE_IO 2 +#define IOBASE_ISA_IO 3 +#define IOBASE_ISA_MEM 4 + +long sys_pciconfig_iobase(long which, unsigned long in_bus, + unsigned long in_devfn) +{ + struct pci_controller* hose; + struct list_head *ln; + struct pci_bus *bus = NULL; + struct device_node *hose_node; + + /* Argh ! Please forgive me for that hack, but that's the + * simplest way to get existing XFree to not lockup on some + * G5 machines... So when something asks for bus 0 io base + * (bus 0 is HT root), we return the AGP one instead. + */ + if (machine_is_compatible("MacRISC4")) + if (in_bus == 0) + in_bus = 0xf0; + + /* That syscall isn't quite compatible with PCI domains, but it's + * used on pre-domains setup. We return the first match + */ + + for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) { + bus = pci_bus_b(ln); + if (in_bus >= bus->number && in_bus < (bus->number + bus->subordinate)) + break; + bus = NULL; + } + if (bus == NULL || bus->sysdata == NULL) + return -ENODEV; + + hose_node = (struct device_node *)bus->sysdata; + hose = PCI_DN(hose_node)->phb; + + switch (which) { + case IOBASE_BRIDGE_NUMBER: + return (long)hose->first_busno; + case IOBASE_MEMORY: + return (long)hose->pci_mem_offset; + case IOBASE_IO: + return (long)hose->io_base_phys; + case IOBASE_ISA_IO: + return (long)isa_io_base; + case IOBASE_ISA_MEM: + return -EINVAL; + } + + return -EOPNOTSUPP; +} diff --git a/arch/powerpc/kernel/pci_direct_iommu.c b/arch/powerpc/kernel/pci_direct_iommu.c new file mode 100644 index 00000000000..e1a32f802c0 --- /dev/null +++ b/arch/powerpc/kernel/pci_direct_iommu.c @@ -0,0 +1,94 @@ +/* + * Support for DMA from PCI devices to main memory on + * machines without an iommu or with directly addressable + * RAM (typically a pmac with 2Gb of RAM or less) + * + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/pmac_feature.h> +#include <asm/abs_addr.h> +#include <asm/ppc-pci.h> + +static void *pci_direct_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret; + + ret = (void *)__get_free_pages(flag, get_order(size)); + if (ret != NULL) { + memset(ret, 0, size); + *dma_handle = virt_to_abs(ret); + } + return ret; +} + +static void pci_direct_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + +static dma_addr_t pci_direct_map_single(struct device *hwdev, void *ptr, + size_t size, enum dma_data_direction direction) +{ + return virt_to_abs(ptr); +} + +static void pci_direct_unmap_single(struct device *hwdev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction direction) +{ +} + +static int pci_direct_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + int i; + + for (i = 0; i < nents; i++, sg++) { + sg->dma_address = page_to_phys(sg->page) + sg->offset; + sg->dma_length = sg->length; + } + + return nents; +} + +static void pci_direct_unmap_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ +} + +static int pci_direct_dma_supported(struct device *dev, u64 mask) +{ + return mask < 0x100000000ull; +} + +void __init pci_direct_iommu_init(void) +{ + pci_dma_ops.alloc_coherent = pci_direct_alloc_coherent; + pci_dma_ops.free_coherent = pci_direct_free_coherent; + pci_dma_ops.map_single = pci_direct_map_single; + pci_dma_ops.unmap_single = pci_direct_unmap_single; + pci_dma_ops.map_sg = pci_direct_map_sg; + pci_dma_ops.unmap_sg = pci_direct_unmap_sg; + pci_dma_ops.dma_supported = pci_direct_dma_supported; +} diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c new file mode 100644 index 00000000000..12c4c9e9bbc --- /dev/null +++ b/arch/powerpc/kernel/pci_dn.c @@ -0,0 +1,230 @@ +/* + * pci_dn.c + * + * Copyright (C) 2001 Todd Inglett, IBM Corporation + * + * PCI manipulation via device_nodes. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/pSeries_reconfig.h> +#include <asm/ppc-pci.h> + +/* + * Traverse_func that inits the PCI fields of the device node. + * NOTE: this *must* be done before read/write config to the device. + */ +static void * __devinit update_dn_pci_info(struct device_node *dn, void *data) +{ + struct pci_controller *phb = data; + int *type = (int *)get_property(dn, "ibm,pci-config-space-type", NULL); + u32 *regs; + struct pci_dn *pdn; + + if (mem_init_done) + pdn = kmalloc(sizeof(*pdn), GFP_KERNEL); + else + pdn = alloc_bootmem(sizeof(*pdn)); + if (pdn == NULL) + return NULL; + memset(pdn, 0, sizeof(*pdn)); + dn->data = pdn; + pdn->node = dn; + pdn->phb = phb; + regs = (u32 *)get_property(dn, "reg", NULL); + if (regs) { + /* First register entry is addr (00BBSS00) */ + pdn->busno = (regs[0] >> 16) & 0xff; + pdn->devfn = (regs[0] >> 8) & 0xff; + } + + pdn->pci_ext_config_space = (type && *type == 1); + return NULL; +} + +/* + * Traverse a device tree stopping each PCI device in the tree. + * This is done depth first. As each node is processed, a "pre" + * function is called and the children are processed recursively. + * + * The "pre" func returns a value. If non-zero is returned from + * the "pre" func, the traversal stops and this value is returned. + * This return value is useful when using traverse as a method of + * finding a device. + * + * NOTE: we do not run the func for devices that do not appear to + * be PCI except for the start node which we assume (this is good + * because the start node is often a phb which may be missing PCI + * properties). + * We use the class-code as an indicator. If we run into + * one of these nodes we also assume its siblings are non-pci for + * performance. + */ +void *traverse_pci_devices(struct device_node *start, traverse_func pre, + void *data) +{ + struct device_node *dn, *nextdn; + void *ret; + + /* We started with a phb, iterate all childs */ + for (dn = start->child; dn; dn = nextdn) { + u32 *classp, class; + + nextdn = NULL; + classp = (u32 *)get_property(dn, "class-code", NULL); + class = classp ? *classp : 0; + + if (pre && ((ret = pre(dn, data)) != NULL)) + return ret; + + /* If we are a PCI bridge, go down */ + if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || + (class >> 8) == PCI_CLASS_BRIDGE_CARDBUS)) + /* Depth first...do children */ + nextdn = dn->child; + else if (dn->sibling) + /* ok, try next sibling instead. */ + nextdn = dn->sibling; + if (!nextdn) { + /* Walk up to next valid sibling. */ + do { + dn = dn->parent; + if (dn == start) + return NULL; + } while (dn->sibling == NULL); + nextdn = dn->sibling; + } + } + return NULL; +} + +/** + * pci_devs_phb_init_dynamic - setup pci devices under this PHB + * phb: pci-to-host bridge (top-level bridge connecting to cpu) + * + * This routine is called both during boot, (before the memory + * subsystem is set up, before kmalloc is valid) and during the + * dynamic lpar operation of adding a PHB to a running system. + */ +void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb) +{ + struct device_node * dn = (struct device_node *) phb->arch_data; + struct pci_dn *pdn; + + /* PHB nodes themselves must not match */ + update_dn_pci_info(dn, phb); + pdn = dn->data; + if (pdn) { + pdn->devfn = pdn->busno = -1; + pdn->phb = phb; + } + + /* Update dn->phb ptrs for new phb and children devices */ + traverse_pci_devices(dn, update_dn_pci_info, phb); +} + +/* + * Traversal func that looks for a <busno,devfcn> value. + * If found, the pci_dn is returned (thus terminating the traversal). + */ +static void *is_devfn_node(struct device_node *dn, void *data) +{ + int busno = ((unsigned long)data >> 8) & 0xff; + int devfn = ((unsigned long)data) & 0xff; + struct pci_dn *pci = dn->data; + + if (pci && (devfn == pci->devfn) && (busno == pci->busno)) + return dn; + return NULL; +} + +/* + * This is the "slow" path for looking up a device_node from a + * pci_dev. It will hunt for the device under its parent's + * phb and then update sysdata for a future fastpath. + * + * It may also do fixups on the actual device since this happens + * on the first read/write. + * + * Note that it also must deal with devices that don't exist. + * In this case it may probe for real hardware ("just in case") + * and add a device_node to the device tree if necessary. + * + */ +struct device_node *fetch_dev_dn(struct pci_dev *dev) +{ + struct device_node *orig_dn = dev->sysdata; + struct device_node *dn; + unsigned long searchval = (dev->bus->number << 8) | dev->devfn; + + dn = traverse_pci_devices(orig_dn, is_devfn_node, (void *)searchval); + if (dn) + dev->sysdata = dn; + return dn; +} +EXPORT_SYMBOL(fetch_dev_dn); + +static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) +{ + struct device_node *np = node; + struct pci_dn *pci = NULL; + int err = NOTIFY_OK; + + switch (action) { + case PSERIES_RECONFIG_ADD: + pci = np->parent->data; + if (pci) + update_dn_pci_info(np, pci->phb); + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block pci_dn_reconfig_nb = { + .notifier_call = pci_dn_reconfig_notifier, +}; + +/** + * pci_devs_phb_init - Initialize phbs and pci devs under them. + * + * This routine walks over all phb's (pci-host bridges) on the + * system, and sets up assorted pci-related structures + * (including pci info in the device node structs) for each + * pci device found underneath. This routine runs once, + * early in the boot sequence. + */ +void __init pci_devs_phb_init(void) +{ + struct pci_controller *phb, *tmp; + + /* This must be done first so the device nodes have valid pci info! */ + list_for_each_entry_safe(phb, tmp, &hose_list, list_node) + pci_devs_phb_init_dynamic(phb); + + pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb); +} diff --git a/arch/powerpc/kernel/pci_iommu.c b/arch/powerpc/kernel/pci_iommu.c new file mode 100644 index 00000000000..bdf15dbbf4f --- /dev/null +++ b/arch/powerpc/kernel/pci_iommu.c @@ -0,0 +1,128 @@ +/* + * arch/ppc64/kernel/pci_iommu.c + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup, new allocation schemes: + * Copyright (C) 2004 Olof Johansson, IBM Corporation + * + * Dynamic DMA mapping support, platform-independent parts. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> + +/* + * We can use ->sysdata directly and avoid the extra work in + * pci_device_to_OF_node since ->sysdata will have been initialised + * in the iommu init code for all devices. + */ +#define PCI_GET_DN(dev) ((struct device_node *)((dev)->sysdata)) + +static inline struct iommu_table *devnode_table(struct device *dev) +{ + struct pci_dev *pdev; + + if (!dev) { + pdev = ppc64_isabridge_dev; + if (!pdev) + return NULL; + } else + pdev = to_pci_dev(dev); + + return PCI_DN(PCI_GET_DN(pdev))->iommu_table; +} + + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. + */ +static void *pci_iommu_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return iommu_alloc_coherent(devnode_table(hwdev), size, dma_handle, + flag); +} + +static void pci_iommu_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + iommu_free_coherent(devnode_table(hwdev), size, vaddr, dma_handle); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address of the buffer + * passed here is the kernel (virtual) address of the buffer. The buffer + * need not be page aligned, the dma_addr_t returned will point to the same + * byte within the page as vaddr. + */ +static dma_addr_t pci_iommu_map_single(struct device *hwdev, void *vaddr, + size_t size, enum dma_data_direction direction) +{ + return iommu_map_single(devnode_table(hwdev), vaddr, size, direction); +} + + +static void pci_iommu_unmap_single(struct device *hwdev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + iommu_unmap_single(devnode_table(hwdev), dma_handle, size, direction); +} + + +static int pci_iommu_map_sg(struct device *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + return iommu_map_sg(pdev, devnode_table(pdev), sglist, + nelems, direction); +} + +static void pci_iommu_unmap_sg(struct device *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + iommu_unmap_sg(devnode_table(pdev), sglist, nelems, direction); +} + +/* We support DMA to/from any memory page via the iommu */ +static int pci_iommu_dma_supported(struct device *dev, u64 mask) +{ + return 1; +} + +void pci_iommu_init(void) +{ + pci_dma_ops.alloc_coherent = pci_iommu_alloc_coherent; + pci_dma_ops.free_coherent = pci_iommu_free_coherent; + pci_dma_ops.map_single = pci_iommu_map_single; + pci_dma_ops.unmap_single = pci_iommu_unmap_single; + pci_dma_ops.map_sg = pci_iommu_map_sg; + pci_dma_ops.unmap_sg = pci_iommu_unmap_sg; + pci_dma_ops.dma_supported = pci_iommu_dma_supported; +} diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 6a5b468edb4..3bf968e7409 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -1368,6 +1368,7 @@ prom_n_addr_cells(struct device_node* np) /* No #address-cells property for the root node, default to 1 */ return 1; } +EXPORT_SYMBOL(prom_n_addr_cells); int prom_n_size_cells(struct device_node* np) @@ -1383,6 +1384,7 @@ prom_n_size_cells(struct device_node* np) /* No #size-cells property for the root node, default to 1 */ return 1; } +EXPORT_SYMBOL(prom_n_size_cells); /** * Work out the sense (active-low level / active-high edge) diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c new file mode 100644 index 00000000000..7b948662704 --- /dev/null +++ b/arch/powerpc/kernel/rtas-rtc.c @@ -0,0 +1,105 @@ +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <linux/rtc.h> +#include <linux/delay.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/time.h> + + +#define MAX_RTC_WAIT 5000 /* 5 sec */ +#define RTAS_CLOCK_BUSY (-2) +unsigned long __init rtas_get_boot_time(void) +{ + int ret[8]; + int error, wait_time; + unsigned long max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + wait_time = rtas_extended_busy_delay_time(error); + /* This is boot time so we spin. */ + udelay(wait_time*1000); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) { + printk(KERN_WARNING "error: reading the clock failed (%d)\n", + error); + return 0; + } + + return mktime(ret[0], ret[1], ret[2], ret[3], ret[4], ret[5]); +} + +/* NOTE: get_rtc_time will get an error if executed in interrupt context + * and if a delay is needed to read the clock. In this case we just + * silently return without updating rtc_tm. + */ +void rtas_get_rtc_time(struct rtc_time *rtc_tm) +{ + int ret[8]; + int error, wait_time; + unsigned long max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + if (in_interrupt() && printk_ratelimit()) { + memset(&rtc_tm, 0, sizeof(struct rtc_time)); + printk(KERN_WARNING "error: reading clock" + " would delay interrupt\n"); + return; /* delay not allowed */ + } + wait_time = rtas_extended_busy_delay_time(error); + msleep(wait_time); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) { + printk(KERN_WARNING "error: reading the clock failed (%d)\n", + error); + return; + } + + rtc_tm->tm_sec = ret[5]; + rtc_tm->tm_min = ret[4]; + rtc_tm->tm_hour = ret[3]; + rtc_tm->tm_mday = ret[2]; + rtc_tm->tm_mon = ret[1] - 1; + rtc_tm->tm_year = ret[0] - 1900; +} + +int rtas_set_rtc_time(struct rtc_time *tm) +{ + int error, wait_time; + unsigned long max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, + tm->tm_year + 1900, tm->tm_mon + 1, + tm->tm_mday, tm->tm_hour, tm->tm_min, + tm->tm_sec, 0); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + if (in_interrupt()) + return 1; /* probably decrementer */ + wait_time = rtas_extended_busy_delay_time(error); + msleep(wait_time); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) + printk(KERN_WARNING "error: setting the clock failed (%d)\n", + error); + + return 0; +} diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index c98cfcc9cd9..e5694335bf1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -57,10 +57,6 @@ extern void power4_idle(void); boot_infos_t *boot_infos; struct ide_machdep_calls ppc_ide_md; -/* XXX should go elsewhere */ -int __irq_offset_value; -EXPORT_SYMBOL(__irq_offset_value); - int boot_cpuid; EXPORT_SYMBOL_GPL(boot_cpuid); int boot_cpuid_phys; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index fdbd9f9122f..608fee7c7e2 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -59,6 +59,7 @@ #include <asm/firmware.h> #include <asm/xmon.h> #include <asm/udbg.h> +#include <asm/kexec.h> #include "setup.h" @@ -415,6 +416,10 @@ void __init setup_system(void) */ unflatten_device_tree(); +#ifdef CONFIG_KEXEC + kexec_setup(); /* requires unflattened device tree. */ +#endif + /* * Fill the ppc64_caches & systemcfg structures with informations * retreived from the device-tree. Need to be called before diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 8bdf95b7e42..5a2eba60dd3 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -403,8 +403,6 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, ELF_NFPREG * sizeof(double))) return 1; - current->thread.fpscr.val = 0; /* turn off all fp exceptions */ - #ifdef CONFIG_ALTIVEC /* save altivec registers */ if (current->thread.used_vr) { @@ -818,6 +816,9 @@ static int handle_rt_signal(unsigned long sig, struct k_sigaction *ka, goto badframe; regs->link = (unsigned long) frame->tramp; } + + current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; regs->gpr[1] = newsp; @@ -1097,6 +1098,8 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, regs->link = (unsigned long) frame->mctx.tramp; } + current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; regs->gpr[1] = newsp; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 58194e15071..1decf278553 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -131,9 +131,6 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, flush_fp_to_thread(current); - /* Make sure signal doesn't get spurrious FP exceptions */ - current->thread.fpscr.val = 0; - #ifdef CONFIG_ALTIVEC err |= __put_user(v_regs, &sc->v_regs); @@ -423,6 +420,9 @@ static int setup_rt_frame(int signr, struct k_sigaction *ka, siginfo_t *info, if (err) goto badframe; + /* Make sure signal handler doesn't get spurious FP exceptions */ + current->thread.fpscr.val = 0; + /* Set up to return from userspace. */ if (vdso64_rt_sigtramp && current->thread.vdso_base) { regs->link = current->thread.vdso_base + vdso64_rt_sigtramp; diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index a08c26e8783..f6b38472318 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -77,8 +77,9 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) mflr r12 .cfi_register lr,r12 bl __get_datapage@local - lwz r3,CFG_TB_TICKS_PER_SEC(r3) lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) + lwz r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 + blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index aeb5fc9b87b..0a32a41d50b 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -83,7 +83,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) /* Check for supported clock IDs */ cmpli cr0,r3,CLOCK_REALTIME cmpli cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f mflr r12 /* r12 saves lr */ @@ -91,7 +91,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mr r10,r3 /* r10 saves id */ mr r11,r4 /* r11 saves tp */ bl __get_datapage@local /* get data page */ - mr r9, r3 /* datapage ptr in r9 */ + mr r9,r3 /* datapage ptr in r9 */ beq cr1,50f /* if monotonic -> jump there */ /* @@ -173,10 +173,14 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) add r4,r4,r7 lis r5,NSEC_PER_SEC@h ori r5,r5,NSEC_PER_SEC@l - cmpli cr0,r4,r5 + cmpl cr0,r4,r5 + cmpli cr1,r4,0 blt 1f subf r4,r5,r4 addi r3,r3,1 +1: bge cr1,1f + addi r3,r3,-1 + add r4,r4,r5 1: stw r3,TSPC32_TV_SEC(r11) stw r4,TSPC32_TV_NSEC(r11) @@ -210,7 +214,7 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) /* Check for supported clock IDs */ cmpwi cr0,r3,CLOCK_REALTIME cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f li r3,0 diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S index e67eda0f8cd..6393e4137bc 100644 --- a/arch/powerpc/kernel/vdso64/datapage.S +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -80,5 +80,6 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) bl V_LOCAL_FUNC(__get_datapage) ld r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 + blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index d371c02a8c0..1a89094715c 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -1,4 +1,5 @@ -/* + + /* * Userland implementation of gettimeofday() for 64 bits processes in a * ppc64 kernel for use in the vDSO * @@ -68,7 +69,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) /* Check for supported clock IDs */ cmpwi cr0,r3,CLOCK_REALTIME cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f mflr r12 /* r12 saves lr */ @@ -84,16 +85,17 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - lis r7,0x3b9a /* r7 = 1000000000 = NSEC_PER_SEC */ - ori r7,r7,0xca00 + lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ + ori r7,r7,16960 rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ std r5,TSPC64_TV_SEC(r11) /* store sec in tv */ subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* nsec = (xsec * NSEC_PER_SEC) / + mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / * XSEC_PER_SEC */ rldicl r0,r0,44,20 + mulli r0,r0,1000 /* nsec = usec * 1000 */ std r0,TSPC64_TV_NSEC(r11) /* store nsec in tp */ mtlr r12 @@ -106,15 +108,16 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) 50: bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - lis r7,0x3b9a /* r7 = 1000000000 = NSEC_PER_SEC */ - ori r7,r7,0xca00 + lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ + ori r7,r7,16960 rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* nsec = (xsec * NSEC_PER_SEC) / + mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / * XSEC_PER_SEC */ rldicl r6,r0,44,20 + mulli r6,r6,1000 /* nsec = usec * 1000 */ /* now we must fixup using wall to monotonic. We need to snapshot * that value and do the counter trick again. Fortunately, we still @@ -123,8 +126,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) * can be used */ - lwz r4,WTOM_CLOCK_SEC(r9) - lwz r7,WTOM_CLOCK_NSEC(r9) + lwa r4,WTOM_CLOCK_SEC(r3) + lwa r7,WTOM_CLOCK_NSEC(r3) /* We now have our result in r4,r7. We create a fake dependency * on that result and re-check the counter @@ -144,10 +147,14 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) add r7,r7,r6 lis r9,NSEC_PER_SEC@h ori r9,r9,NSEC_PER_SEC@l - cmpli cr0,r7,r9 + cmpl cr0,r7,r9 + cmpli cr1,r7,0 blt 1f subf r7,r9,r7 addi r4,r4,1 +1: bge cr1,1f + addi r4,r4,-1 + add r7,r7,r9 1: std r4,TSPC64_TV_SEC(r11) std r7,TSPC64_TV_NSEC(r11) @@ -181,7 +188,7 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) /* Check for supported clock IDs */ cmpwi cr0,r3,CLOCK_REALTIME cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f li r3,0 diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index 01090e9ce0c..a58daa15368 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -42,13 +42,6 @@ #include "irq.h" #include "call_pci.h" -/* This maps virtual irq numbers to real irqs */ -unsigned int virt_irq_to_real_map[NR_IRQS]; - -/* The next available virtual irq number */ -/* Note: the pcnet32 driver assumes irq numbers < 2 aren't valid. :( */ -static int next_virtual_irq = 2; - static long Pci_Interrupt_Count; static long Pci_Event_Count; @@ -350,26 +343,14 @@ static hw_irq_controller iSeries_IRQ_handler = { int __init iSeries_allocate_IRQ(HvBusNumber busNumber, HvSubBusNumber subBusNumber, HvAgentId deviceId) { - unsigned int realirq, virtirq; + int virtirq; + unsigned int realirq; u8 idsel = (deviceId >> 4); u8 function = deviceId & 7; - virtirq = next_virtual_irq++; realirq = ((busNumber - 1) << 6) + ((idsel - 1) << 3) + function; - virt_irq_to_real_map[virtirq] = realirq; + virtirq = virt_irq_create_mapping(realirq); irq_desc[virtirq].handler = &iSeries_IRQ_handler; return virtirq; } - -int virt_irq_create_mapping(unsigned int real_irq) -{ - BUG(); /* Don't call this on iSeries, yet */ - - return 0; -} - -void virt_irq_init(void) -{ - return; -} diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index 6a29f301436..da26639190d 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -39,7 +39,6 @@ #include <asm/sections.h> #include <asm/iommu.h> #include <asm/firmware.h> -#include <asm/systemcfg.h> #include <asm/system.h> #include <asm/time.h> #include <asm/paca.h> @@ -548,8 +547,6 @@ static unsigned long __init build_iSeries_Memory_Map(void) */ static void __init iSeries_setup_arch(void) { - unsigned procIx = get_paca()->lppaca.dyn_hv_phys_proc_index; - if (get_paca()->lppaca.shared_proc) { ppc_md.idle_loop = iseries_shared_idle; printk(KERN_INFO "Using shared processor idle loop\n"); @@ -565,9 +562,6 @@ static void __init iSeries_setup_arch(void) itVpdAreas.xSlicMaxLogicalProcs); printk("Max physical processors = %d\n", itVpdAreas.xSlicMaxPhysicalProcs); - - _systemcfg->processor = xIoHriProcessorVpd[procIx].xPVR; - printk("Processor version = %x\n", _systemcfg->processor); } static void iSeries_show_cpuinfo(struct seq_file *m) diff --git a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c index 5947b21a858..feb0a94e781 100644 --- a/arch/powerpc/platforms/powermac/time.c +++ b/arch/powerpc/platforms/powermac/time.c @@ -102,7 +102,7 @@ static unsigned long from_rtc_time(struct rtc_time *tm) static unsigned long cuda_get_time(void) { struct adb_request req; - unsigned long now; + unsigned int now; if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0) return 0; @@ -113,7 +113,7 @@ static unsigned long cuda_get_time(void) req.reply_len); now = (req.reply[3] << 24) + (req.reply[4] << 16) + (req.reply[5] << 8) + req.reply[6]; - return now - RTC_OFFSET; + return ((unsigned long)now) - RTC_OFFSET; } #define cuda_get_rtc_time(tm) to_rtc_time(cuda_get_time(), (tm)) @@ -146,7 +146,7 @@ static int cuda_set_rtc_time(struct rtc_time *tm) static unsigned long pmu_get_time(void) { struct adb_request req; - unsigned long now; + unsigned int now; if (pmu_request(&req, NULL, 1, PMU_READ_RTC) < 0) return 0; @@ -156,7 +156,7 @@ static unsigned long pmu_get_time(void) req.reply_len); now = (req.reply[0] << 24) + (req.reply[1] << 16) + (req.reply[2] << 8) + req.reply[3]; - return now - RTC_OFFSET; + return ((unsigned long)now) - RTC_OFFSET; } #define pmu_get_rtc_time(tm) to_rtc_time(pmu_get_time(), (tm)) @@ -199,6 +199,7 @@ static unsigned long smu_get_time(void) #define smu_set_rtc_time(tm, spin) 0 #endif +/* Can't be __init, it's called when suspending and resuming */ unsigned long pmac_get_boot_time(void) { /* Get the time from the RTC, used only at boot time */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index e7ca5b1f591..06d5ef50121 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -4,4 +4,7 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_SCANLOG) += scanlog.o -obj-$(CONFIG_EEH) += eeh.o eeh_event.o +obj-$(CONFIG_EEH) += eeh.o eeh_event.o + +obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o +obj-$(CONFIG_HVCS) += hvcserver.o diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c new file mode 100644 index 00000000000..138e128a388 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvconsole.c @@ -0,0 +1,74 @@ +/* + * hvconsole.c + * Copyright (C) 2004 Hollis Blanchard, IBM Corporation + * Copyright (C) 2004 IBM Corporation + * + * Additional Author(s): + * Ryan S. Arnold <rsa@us.ibm.com> + * + * LPAR console support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/hvcall.h> +#include <asm/hvconsole.h> + +/** + * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper + * @vtermno: The vtermno or unit_address of the adapter from which to fetch the + * data. + * @buf: The character buffer into which to put the character data fetched from + * firmware. + * @count: not used? + */ +int hvc_get_chars(uint32_t vtermno, char *buf, int count) +{ + unsigned long got; + + if (plpar_hcall(H_GET_TERM_CHAR, vtermno, 0, 0, 0, &got, + (unsigned long *)buf, (unsigned long *)buf+1) == H_Success) + return got; + return 0; +} + +EXPORT_SYMBOL(hvc_get_chars); + + +/** + * hvc_put_chars: send characters to firmware for denoted vterm adapter + * @vtermno: The vtermno or unit_address of the adapter from which the data + * originated. + * @buf: The character buffer that contains the character data to send to + * firmware. + * @count: Send this number of characters. + */ +int hvc_put_chars(uint32_t vtermno, const char *buf, int count) +{ + unsigned long *lbuf = (unsigned long *) buf; + long ret; + + ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0], + lbuf[1]); + if (ret == H_Success) + return count; + if (ret == H_Busy) + return 0; + return -EIO; +} + +EXPORT_SYMBOL(hvc_put_chars); diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c new file mode 100644 index 00000000000..4d584172055 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvcserver.c @@ -0,0 +1,251 @@ +/* + * hvcserver.c + * Copyright (C) 2004 Ryan S Arnold, IBM Corporation + * + * PPC64 virtual I/O console server support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/slab.h> + +#include <asm/hvcall.h> +#include <asm/hvcserver.h> +#include <asm/io.h> + +#define HVCS_ARCH_VERSION "1.0.0" + +MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>"); +MODULE_DESCRIPTION("IBM hvcs ppc64 API"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(HVCS_ARCH_VERSION); + +/* + * Convert arch specific return codes into relevant errnos. The hvcs + * functions aren't performance sensitive, so this conversion isn't an + * issue. + */ +int hvcs_convert(long to_convert) +{ + switch (to_convert) { + case H_Success: + return 0; + case H_Parameter: + return -EINVAL; + case H_Hardware: + return -EIO; + case H_Busy: + case H_LongBusyOrder1msec: + case H_LongBusyOrder10msec: + case H_LongBusyOrder100msec: + case H_LongBusyOrder1sec: + case H_LongBusyOrder10sec: + case H_LongBusyOrder100sec: + return -EBUSY; + case H_Function: /* fall through */ + default: + return -EPERM; + } +} + +/** + * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info + * @head: list_head pointer for an allocated list of partner info structs to + * free. + * + * This function is used to free the partner info list that was returned by + * calling hvcs_get_partner_info(). + */ +int hvcs_free_partner_info(struct list_head *head) +{ + struct hvcs_partner_info *pi; + struct list_head *element; + + if (!head) + return -EINVAL; + + while (!list_empty(head)) { + element = head->next; + pi = list_entry(element, struct hvcs_partner_info, node); + list_del(element); + kfree(pi); + } + + return 0; +} +EXPORT_SYMBOL(hvcs_free_partner_info); + +/* Helper function for hvcs_get_partner_info */ +int hvcs_next_partner(uint32_t unit_address, + unsigned long last_p_partition_ID, + unsigned long last_p_unit_address, unsigned long *pi_buff) + +{ + long retval; + retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address, + last_p_partition_ID, + last_p_unit_address, virt_to_phys(pi_buff)); + return hvcs_convert(retval); +} + +/** + * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter + * @unit_address: The unit_address of the vty-server adapter for which this + * function is fetching partner info. + * @head: An initialized list_head pointer to an empty list to use to return the + * list of partner info fetched from the hypervisor to the caller. + * @pi_buff: A page sized buffer pre-allocated prior to calling this function + * that is to be used to be used by firmware as an iterator to keep track + * of the partner info retrieval. + * + * This function returns non-zero on success, or if there is no partner info. + * + * The pi_buff is pre-allocated prior to calling this function because this + * function may be called with a spin_lock held and kmalloc of a page is not + * recommended as GFP_ATOMIC. + * + * The first long of this buffer is used to store a partner unit address. The + * second long is used to store a partner partition ID and starting at + * pi_buff[2] is the 79 character Converged Location Code (diff size than the + * unsigned longs, hence the casting mumbo jumbo you see later). + * + * Invocation of this function should always be followed by an invocation of + * hvcs_free_partner_info() using a pointer to the SAME list head instance + * that was passed as a parameter to this function. + */ +int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head, + unsigned long *pi_buff) +{ + /* + * Dealt with as longs because of the hcall interface even though the + * values are uint32_t. + */ + unsigned long last_p_partition_ID; + unsigned long last_p_unit_address; + struct hvcs_partner_info *next_partner_info = NULL; + int more = 1; + int retval; + + memset(pi_buff, 0x00, PAGE_SIZE); + /* invalid parameters */ + if (!head || !pi_buff) + return -EINVAL; + + last_p_partition_ID = last_p_unit_address = ~0UL; + INIT_LIST_HEAD(head); + + do { + retval = hvcs_next_partner(unit_address, last_p_partition_ID, + last_p_unit_address, pi_buff); + if (retval) { + /* + * Don't indicate that we've failed if we have + * any list elements. + */ + if (!list_empty(head)) + return 0; + return retval; + } + + last_p_partition_ID = pi_buff[0]; + last_p_unit_address = pi_buff[1]; + + /* This indicates that there are no further partners */ + if (last_p_partition_ID == ~0UL + && last_p_unit_address == ~0UL) + break; + + /* This is a very small struct and will be freed soon in + * hvcs_free_partner_info(). */ + next_partner_info = kmalloc(sizeof(struct hvcs_partner_info), + GFP_ATOMIC); + + if (!next_partner_info) { + printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to" + " allocate partner info struct.\n"); + hvcs_free_partner_info(head); + return -ENOMEM; + } + + next_partner_info->unit_address + = (unsigned int)last_p_unit_address; + next_partner_info->partition_ID + = (unsigned int)last_p_partition_ID; + + /* copy the Null-term char too */ + strncpy(&next_partner_info->location_code[0], + (char *)&pi_buff[2], + strlen((char *)&pi_buff[2]) + 1); + + list_add_tail(&(next_partner_info->node), head); + next_partner_info = NULL; + + } while (more); + + return 0; +} +EXPORT_SYMBOL(hvcs_get_partner_info); + +/** + * hvcs_register_connection - establish a connection between this vty-server and + * a vty. + * @unit_address: The unit address of the vty-server adapter that is to be + * establish a connection. + * @p_partition_ID: The partition ID of the vty adapter that is to be connected. + * @p_unit_address: The unit address of the vty adapter to which the vty-server + * is to be connected. + * + * If this function is called once and -EINVAL is returned it may + * indicate that the partner info needs to be refreshed for the + * target unit address at which point the caller must invoke + * hvcs_get_partner_info() and then call this function again. If, + * for a second time, -EINVAL is returned then it indicates that + * there is probably already a partner connection registered to a + * different vty-server adapter. It is also possible that a second + * -EINVAL may indicate that one of the parms is not valid, for + * instance if the link was removed between the vty-server adapter + * and the vty adapter that you are trying to open. Don't shoot the + * messenger. Firmware implemented it this way. + */ +int hvcs_register_connection( uint32_t unit_address, + uint32_t p_partition_ID, uint32_t p_unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address, + p_partition_ID, p_unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_register_connection); + +/** + * hvcs_free_connection - free the connection between a vty-server and vty + * @unit_address: The unit address of the vty-server that is to have its + * connection severed. + * + * This function is used to free the partner connection between a vty-server + * adapter and a vty adapter. + * + * If -EBUSY is returned continue to call this function until 0 is returned. + */ +int hvcs_free_connection(uint32_t unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_FREE_VTERM, unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_free_connection); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 31990829310..b9d9732b2e0 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -200,14 +200,12 @@ static void __init pSeries_setup_arch(void) if (ppc64_interrupt_controller == IC_OPEN_PIC) { ppc_md.init_IRQ = pSeries_init_mpic; ppc_md.get_irq = mpic_get_irq; - ppc_md.cpu_irq_down = mpic_teardown_this_cpu; /* Allocate the mpic now, so that find_and_init_phbs() can * fill the ISUs */ pSeries_setup_mpic(); } else { ppc_md.init_IRQ = xics_init_IRQ; ppc_md.get_irq = xics_get_irq; - ppc_md.cpu_irq_down = xics_teardown_cpu; } #ifdef CONFIG_SMP @@ -595,6 +593,27 @@ static int pSeries_pci_probe_mode(struct pci_bus *bus) return PCI_PROBE_NORMAL; } +#ifdef CONFIG_KEXEC +static void pseries_kexec_cpu_down(int crash_shutdown, int secondary) +{ + /* Don't risk a hypervisor call if we're crashing */ + if (!crash_shutdown) { + unsigned long vpa = __pa(&get_paca()->lppaca); + + if (unregister_vpa(hard_smp_processor_id(), vpa)) { + printk("VPA deregistration of cpu %u (hw_cpu_id %d) " + "failed\n", smp_processor_id(), + hard_smp_processor_id()); + } + } + + if (ppc64_interrupt_controller == IC_OPEN_PIC) + mpic_teardown_this_cpu(secondary); + else + xics_teardown_cpu(secondary); +} +#endif + struct machdep_calls __initdata pSeries_md = { .probe = pSeries_probe, .setup_arch = pSeries_setup_arch, @@ -617,4 +636,7 @@ struct machdep_calls __initdata pSeries_md = { .check_legacy_ioport = pSeries_check_legacy_ioport, .system_reset_exception = pSeries_system_reset_exception, .machine_check_exception = pSeries_machine_check_exception, +#ifdef CONFIG_KEXEC + .kexec_cpu_down = pseries_kexec_cpu_down, +#endif }; |