From d7aacaddcac3971e33cf52d7e610c06696cb347f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 8 Jul 2009 13:21:31 +0200 Subject: Driver Core: Add platform device arch data V3 Allow architecture specific data in struct platform_device V3. With this patch struct pdev_archdata is added to struct platform_device, similar to struct dev_archdata in found in struct device. Useful for architecture code that needs to keep extra data associated with each platform device. Struct pdev_archdata is different from dev.platform_data, the convention is that dev.platform_data points to driver-specific data. It may or may not be required by the driver. The format of this depends on driver but is the same across architectures. The structure pdev_archdata is a place for architecture specific data. This data is handled by architecture specific code (for example runtime PM), and since it is architecture specific it should _never_ be touched by device driver code. Exactly like struct dev_archdata but for platform devices. [rjw: This change is for power management mostly and that's why it goes through the suspend tree.] Signed-off-by: Magnus Damm Acked-by: Kevin Hilman Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- arch/powerpc/include/asm/device.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 7d2277cef09..e3e06e0f7fc 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -30,4 +30,7 @@ dev_archdata_get_node(const struct dev_archdata *ad) return ad->of_node; } +struct pdev_archdata { +}; + #endif /* _ASM_POWERPC_DEVICE_H */ -- cgit v1.2.3-70-g09d2 From 6a12235c7d2d75c7d94b9afcaaecd422ff845ce0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 10:25:58 +0100 Subject: agp: kill phys_to_gart() and gart_to_phys() There seems to be no reason for these -- they're a 1:1 mapping on all platforms. Signed-off-by: David Woodhouse --- arch/alpha/include/asm/agp.h | 4 ---- arch/ia64/include/asm/agp.h | 4 ---- arch/parisc/include/asm/agp.h | 4 ---- arch/powerpc/include/asm/agp.h | 4 ---- arch/sparc/include/asm/agp.h | 4 ---- arch/x86/include/asm/agp.h | 4 ---- drivers/char/agp/agp.h | 3 --- drivers/char/agp/ali-agp.c | 4 ++-- drivers/char/agp/amd-k7-agp.c | 8 ++++---- drivers/char/agp/amd64-agp.c | 6 +++--- drivers/char/agp/ati-agp.c | 6 +++--- drivers/char/agp/backend.c | 2 +- drivers/char/agp/efficeon-agp.c | 4 ++-- drivers/char/agp/generic.c | 6 +++--- drivers/char/agp/hp-agp.c | 4 ++-- drivers/char/agp/i460-agp.c | 4 ++-- drivers/char/agp/intel-agp.c | 7 +++---- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/sgi-agp.c | 2 +- drivers/char/agp/sworks-agp.c | 8 ++++---- drivers/char/agp/uninorth-agp.c | 2 +- 21 files changed, 32 insertions(+), 60 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h index 26c17913529..a94d48b8677 100644 --- a/arch/alpha/include/asm/agp.h +++ b/arch/alpha/include/asm/agp.h @@ -9,10 +9,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/ia64/include/asm/agp.h b/arch/ia64/include/asm/agp.h index c11fdd8ab4d..01d09c401c5 100644 --- a/arch/ia64/include/asm/agp.h +++ b/arch/ia64/include/asm/agp.h @@ -17,10 +17,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/parisc/include/asm/agp.h b/arch/parisc/include/asm/agp.h index 9651660da63..d226ffa8fc1 100644 --- a/arch/parisc/include/asm/agp.h +++ b/arch/parisc/include/asm/agp.h @@ -11,10 +11,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/powerpc/include/asm/agp.h b/arch/powerpc/include/asm/agp.h index 86455c4c31e..416e12c2d50 100644 --- a/arch/powerpc/include/asm/agp.h +++ b/arch/powerpc/include/asm/agp.h @@ -8,10 +8,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/sparc/include/asm/agp.h b/arch/sparc/include/asm/agp.h index c2456870b05..70f52c1661b 100644 --- a/arch/sparc/include/asm/agp.h +++ b/arch/sparc/include/asm/agp.h @@ -7,10 +7,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b..eec2a70d437 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -22,10 +22,6 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 4c6e5079d87..d6f36c004d9 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -318,9 +318,6 @@ void agp3_generic_cleanup(void); #define AGP_GENERIC_SIZES_ENTRIES 11 extern const struct aper_size_info_16 agp3_generic_sizes[]; -#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x))) -#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x))) - extern int agp_off; extern int agp_try_unsupported_boot; diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 201ef3ffd48..d2ce68f27e4 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -152,7 +152,7 @@ static struct page *m1541_alloc_page(struct agp_bridge_data *bridge) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN )); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN )); return page; } @@ -180,7 +180,7 @@ static void m1541_destroy_page(struct page *page, int flags) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN)); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN)); } agp_generic_destroy_page(page, flags); } diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index 542a87895ae..73dbf40c874 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -44,7 +44,7 @@ static int amd_create_page_map(struct amd_page_map *page_map) #ifndef CONFIG_X86 SetPageReserved(virt_to_page(page_map->real)); global_cache_flush(); - page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), + page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), PAGE_SIZE); if (page_map->remapped == NULL) { ClearPageReserved(virt_to_page(page_map->real)); @@ -160,7 +160,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -173,7 +173,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } @@ -325,7 +325,7 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type) addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_generic_mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. */ diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index e85a5b3e952..2fb2e6cc322 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -79,7 +79,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); @@ -178,7 +178,7 @@ static const struct aper_size_info_32 amd_8151_sizes[7] = static int amd_8151_configure(void) { - unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); + unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; /* Configure AGP regs in each x86-64 host bridge. */ @@ -558,7 +558,7 @@ static void __devexit agp_amd64_remove(struct pci_dev *pdev) { struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - release_mem_region(virt_to_gart(bridge->gatt_table_real), + release_mem_region(virt_to_phys(bridge->gatt_table_real), amd64_aperture_sizes[bridge->aperture_size_idx].size); agp_remove_bridge(bridge); agp_put_bridge(bridge); diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 59ebd60c1b6..3b2ecbe86eb 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -302,7 +302,7 @@ static int ati_insert_memory(struct agp_memory * mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } @@ -360,7 +360,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Write out the size register */ current_size = A_SIZE_LVL2(agp_bridge->current_size); @@ -390,7 +390,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 343f102090a..ad87753f6de 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -159,7 +159,7 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) goto err_out_nounmap; } } else { - bridge->scratch_page_dma = phys_to_gart(page_to_phys(page)); + bridge->scratch_page_dma = page_to_phys(page); } bridge->scratch_page = bridge->driver->mask_memory(bridge, diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 35d50f2861b..793f39ea961 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -67,7 +67,7 @@ static const struct gatt_mask efficeon_generic_masks[] = /* This function does the same thing as mask_memory() for this chipset... */ static inline unsigned long efficeon_mask_memory(struct page *page) { - unsigned long addr = phys_to_gart(page_to_phys(page)); + unsigned long addr = page_to_phys(page); return addr | 0x00000001; } @@ -226,7 +226,7 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) efficeon_private.l1_table[index] = page; - value = virt_to_gart((unsigned long *)page) | pati | present | index; + value = virt_to_phys((unsigned long *)page) | pati | present | index; pci_write_config_dword(agp_bridge->dev, EFFICEON_ATTPAGE, value); diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 28f0208c66a..c50543966eb 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -988,7 +988,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) set_memory_uc((unsigned long)table, 1 << page_order); bridge->gatt_table = (void *)table; #else - bridge->gatt_table = ioremap_nocache(virt_to_gart(table), + bridge->gatt_table = ioremap_nocache(virt_to_phys(table), (PAGE_SIZE * (1 << page_order))); bridge->driver->cache_flush(); #endif @@ -1001,7 +1001,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) return -ENOMEM; } - bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real); + bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real); /* AK: bogus, should encode addresses > 4GB */ for (i = 0; i < num_entries; i++) { @@ -1142,7 +1142,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type), bridge->gatt_table+j); } diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 64dbf4b1cf2..501e293e5ad 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -107,7 +107,7 @@ static int __init hp_zx1_ioc_shared(void) hp->gart_size = HP_ZX1_GART_SIZE; hp->gatt_entries = hp->gart_size / hp->io_page_size; - hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); + hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)]; if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) { @@ -246,7 +246,7 @@ hp_zx1_configure (void) agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS); if (hp->io_pdir_owner) { - writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); + writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); readl(hp->ioc_regs+HP_ZX1_PDIR_BASE); writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG); readl(hp->ioc_regs+HP_ZX1_TCNFG); diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 54191f86053..e763d3312ce 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -325,7 +325,7 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, io_page_size = 1UL << I460_IO_PAGE_SHIFT; for (i = 0, j = io_pg_start; i < mem->page_count; i++) { - paddr = phys_to_gart(page_to_phys(mem->pages[i])); + paddr = page_to_phys(mem->pages[i]); for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size) WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type)); } @@ -382,7 +382,7 @@ static int i460_alloc_large_page (struct lp_desc *lp) return -ENOMEM; } - lp->paddr = phys_to_gart(page_to_phys(lp->page)); + lp->paddr = page_to_phys(lp->page); lp->refcount = 0; atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); return 0; diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index d8c80d8be5e..aa8889e8afc 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -288,7 +288,7 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.gtt+j); } @@ -470,8 +470,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, global_cache_flush(); for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), - mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); @@ -977,7 +976,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index cedacee30ec..7e36d2b4f9d 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), agp_bridge->gatt_table+nvidia_private.pg_offset+j); } diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index 0d47fa84740..0d426ae39c8 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -190,7 +190,7 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { table[j] = bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type); } diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index 07259952fc3..13acaaf64ed 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -155,7 +155,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Create a fake scratch directory */ for (i = 0; i < 1024; i++) { writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); - writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); } retval = serverworks_create_gatt_pages(value->num_entries / 1024); @@ -167,7 +167,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -179,7 +179,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++) - writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); return 0; } @@ -350,7 +350,7 @@ static int serverworks_insert_memory(struct agp_memory *mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = SVRWRKS_GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mem->type), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } serverworks_tlbflush(mem); diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index f192c3b9ad4..2e993112ab8 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -431,7 +431,7 @@ static int uninorth_create_gatt_table(struct agp_bridge_data *bridge) bridge->gatt_table_real = (u32 *) table; bridge->gatt_table = (u32 *)table; - bridge->gatt_bus_addr = virt_to_gart(table); + bridge->gatt_bus_addr = virt_to_phys(table); for (i = 0; i < num_entries; i++) bridge->gatt_table[i] = 0; -- cgit v1.2.3-70-g09d2 From a42548a18866e87092db93b771e6c5b060d78401 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:29 +0200 Subject: cputime: Optimize jiffies_to_cputime(1) For powerpc with CONFIG_VIRT_CPU_ACCOUNTING jiffies_to_cputime(1) is not compile time constant and run time calculations are quite expensive. To optimize we use precomputed value. For all other architectures is is preprocessor definition. Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-5-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/cputime.h | 1 + arch/powerpc/include/asm/cputime.h | 13 +++++++++++++ arch/powerpc/kernel/time.c | 4 ++++ arch/s390/include/asm/cputime.h | 1 + include/asm-generic/cputime.h | 1 + kernel/itimer.c | 4 ++-- kernel/posix-cpu-timers.c | 6 +++--- kernel/sched.c | 9 ++++----- 8 files changed, 29 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index d20b998cb91..7fa8a859466 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -30,6 +30,7 @@ typedef u64 cputime_t; typedef u64 cputime64_t; #define cputime_zero ((cputime_t)0) +#define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_max ((~((cputime_t)0) >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index f42e623030e..fa19f3fe05f 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -18,6 +18,9 @@ #ifndef CONFIG_VIRT_CPU_ACCOUNTING #include +#ifdef __KERNEL__ +static inline void setup_cputime_one_jiffy(void) { } +#endif #else #include @@ -48,6 +51,11 @@ typedef u64 cputime64_t; #ifdef __KERNEL__ +/* + * One jiffy in timebase units computed during initialization + */ +extern cputime_t cputime_one_jiffy; + /* * Convert cputime <-> jiffies */ @@ -89,6 +97,11 @@ static inline cputime_t jiffies_to_cputime(const unsigned long jif) return ct; } +static inline void setup_cputime_one_jiffy(void) +{ + cputime_one_jiffy = jiffies_to_cputime(1); +} + static inline cputime64_t jiffies64_to_cputime64(const u64 jif) { cputime_t ct; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceea..211d7b0cd37 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -193,6 +193,8 @@ EXPORT_SYMBOL(__cputime_clockt_factor); DEFINE_PER_CPU(unsigned long, cputime_last_delta); DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta); +cputime_t cputime_one_jiffy; + static void calc_cputime_factors(void) { struct div_result res; @@ -500,6 +502,7 @@ static int __init iSeries_tb_recal(void) tb_to_xs = divres.result_low; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; vdso_data->tb_to_xs = tb_to_xs; + setup_cputime_one_jiffy(); } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -945,6 +948,7 @@ void __init time_init(void) tb_ticks_per_usec = ppc_tb_freq / 1000000; tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); calc_cputime_factors(); + setup_cputime_one_jiffy(); /* * Calculate the length of each tick in ns. It will not be diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 7a3817a656d..24b1244aadb 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -42,6 +42,7 @@ __div(unsigned long long n, unsigned int base) #endif /* __s390x__ */ #define cputime_zero (0ULL) +#define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_max ((~0UL >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 1c1fa422d18..ca0f239f0e1 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -7,6 +7,7 @@ typedef unsigned long cputime_t; #define cputime_zero (0UL) +#define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_max ((~0UL >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) diff --git a/kernel/itimer.c b/kernel/itimer.c index 21adff7b2a1..8078a32d3b1 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -64,7 +64,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, if (cputime_le(cval, t)) /* about to fire */ - cval = jiffies_to_cputime(1); + cval = cputime_one_jiffy; else cval = cputime_sub(cval, t); } @@ -161,7 +161,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, if (!cputime_eq(cval, cputime_zero) || !cputime_eq(nval, cputime_zero)) { if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, jiffies_to_cputime(1)); + nval = cputime_add(nval, cputime_one_jiffy); set_process_cpu_timer(tsk, clock_id, &nval, &cval); } it->expires = nval; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 69c92374355..18bdde6f676 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1086,7 +1086,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, it->error += it->incr_error; if (it->error >= onecputick) { it->expires = cputime_sub(it->expires, - jiffies_to_cputime(1)); + cputime_one_jiffy); it->error -= onecputick; } } else @@ -1461,7 +1461,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ - *oldval = jiffies_to_cputime(1); + *oldval = cputime_one_jiffy; } else { *oldval = cputime_sub(*oldval, now.cpu); } @@ -1712,7 +1712,7 @@ static __init int init_posix_cpu_timers(void) register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); - cputime_to_timespec(jiffies_to_cputime(1), &ts); + cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; WARN_ON(ts.tv_sec != 0); diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273..8f977d5cc51 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5031,17 +5031,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* -- cgit v1.2.3-70-g09d2 From 0d2d3e38f72e400f602dade3f0ddffe0b3b9d4df Mon Sep 17 00:00:00 2001 From: Geoff Thorpe Date: Tue, 7 Jul 2009 15:23:56 +0000 Subject: powerpc: expose the multi-bit ops that underlie single-bit ops. The bitops.h functions that operate on a single bit in a bitfield are implemented by operating on the corresponding word location. In all cases the inner logic is valid if the mask being applied has more than one bit set, so this patch exposes those inner operations. Indeed, set_bits() was already available, but it duplicated code from set_bit() (rather than making the latter a wrapper) - it was also missing the PPC405_ERR77() workaround and the "volatile" address qualifier present in other APIs. This corrects that, and exposes the other multi-bit equivalents. One advantage of these multi-bit forms is that they allow word-sized variables to essentially be their own spinlocks, eg. very useful for state machines where an atomic "flags" variable can obviate the need for any additional locking. Signed-off-by: Geoff Thorpe Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/bitops.h | 196 ++++++++++++-------------------------- 1 file changed, 62 insertions(+), 134 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 897eade3afb..56f2f2ea563 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -56,174 +56,102 @@ #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) +/* Macro for generating the ***_bits() functions */ +#define DEFINE_BITOP(fn, op, prefix, postfix) \ +static __inline__ void fn(unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ + prefix \ +"1:" PPC_LLARX "%0,0,%3\n" \ + stringify_in_c(op) "%0,%0,%2\n" \ + PPC405_ERR77(0,%3) \ + PPC_STLCX "%0,0,%3\n" \ + "bne- 1b\n" \ + postfix \ + : "=&r" (old), "+m" (*p) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ +} + +DEFINE_BITOP(set_bits, or, "", "") +DEFINE_BITOP(clear_bits, andc, "", "") +DEFINE_BITOP(clear_bits_unlock, andc, LWSYNC_ON_SMP, "") +DEFINE_BITOP(change_bits, xor, "", "") + static __inline__ void set_bit(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # set_bit\n" - "or %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc" ); + set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)); } static __inline__ void clear_bit(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # clear_bit\n" - "andc %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc" ); + clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)); } static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # clear_bit_unlock\n" - "andc %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc", "memory"); + clear_bits_unlock(BITOP_MASK(nr), addr + BITOP_WORD(nr)); } static __inline__ void change_bit(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # change_bit\n" - "xor %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc" ); + change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)); +} + +/* Like DEFINE_BITOP(), with changes to the arguments to 'op' and the output + * operands. */ +#define DEFINE_TESTOP(fn, op, prefix, postfix) \ +static __inline__ unsigned long fn( \ + unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old, t; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ + prefix \ +"1:" PPC_LLARX "%0,0,%3\n" \ + stringify_in_c(op) "%1,%0,%2\n" \ + PPC405_ERR77(0,%3) \ + PPC_STLCX "%1,0,%3\n" \ + "bne- 1b\n" \ + postfix \ + : "=&r" (old), "=&r" (t) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ + return (old & mask); \ } +DEFINE_TESTOP(test_and_set_bits, or, LWSYNC_ON_SMP, ISYNC_ON_SMP) +DEFINE_TESTOP(test_and_set_bits_lock, or, "", ISYNC_ON_SMP) +DEFINE_TESTOP(test_and_clear_bits, andc, LWSYNC_ON_SMP, ISYNC_ON_SMP) +DEFINE_TESTOP(test_and_change_bits, xor, LWSYNC_ON_SMP, ISYNC_ON_SMP) + static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit\n" - "or %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; + return test_and_set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0; } static __inline__ int test_and_set_bit_lock(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit_lock\n" - "or %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; + return test_and_set_bits_lock(BITOP_MASK(nr), + addr + BITOP_WORD(nr)) != 0; } static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # test_and_clear_bit\n" - "andc %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; + return test_and_clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0; } static __inline__ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # test_and_change_bit\n" - "xor %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; -} - -static __inline__ void set_bits(unsigned long mask, unsigned long *addr) -{ - unsigned long old; - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # set_bits\n" - "or %0,%0,%2\n" - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*addr) - : "r" (mask), "r" (addr) - : "cc"); + return test_and_change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0; } #include -- cgit v1.2.3-70-g09d2 From 30d0b3682887a81f0335b42f20116fd40d743371 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jul 2009 20:53:51 +0000 Subject: powerpc: Move 64bit VDSO to improve context switch performance On 64bit applications the VDSO is the only thing in segment 0. Since the VDSO is position independent we can remove the hint and let get_unmapped_area pick an area. This will mean the vdso will be near other mmaps and will share an SLB entry: 10000000-10001000 r-xp 00000000 08:06 5778459 /root/context_switch_64 10010000-10011000 r--p 00000000 08:06 5778459 /root/context_switch_64 10011000-10012000 rw-p 00001000 08:06 5778459 /root/context_switch_64 fffa92ae000-fffa92b0000 rw-p 00000000 00:00 0 fffa92b0000-fffa9453000 r-xp 00000000 08:06 4334051 /lib64/power6/libc-2.9.so fffa9453000-fffa9462000 ---p 001a3000 08:06 4334051 /lib64/power6/libc-2.9.so fffa9462000-fffa9466000 r--p 001a2000 08:06 4334051 /lib64/power6/libc-2.9.so fffa9466000-fffa947c000 rw-p 001a6000 08:06 4334051 /lib64/power6/libc-2.9.so fffa947c000-fffa9480000 rw-p 00000000 00:00 0 fffa9480000-fffa94a8000 r-xp 00000000 08:06 4333852 /lib64/ld-2.9.so fffa94b3000-fffa94b4000 rw-p 00000000 00:00 0 fffa94b4000-fffa94b7000 r-xp 00000000 00:00 0 [vdso] <----- here I am fffa94b7000-fffa94b8000 r--p 00027000 08:06 4333852 /lib64/ld-2.9.so fffa94b8000-fffa94bb000 rw-p 00028000 08:06 4333852 /lib64/ld-2.9.so fffa94bb000-fffa94bc000 rw-p 00000000 00:00 0 fffe4c10000-fffe4c25000 rw-p 00000000 00:00 0 [stack] On a microbenchmark that bounces a token between two 64bit processes over pipes and calls gettimeofday each iteration (to access the VDSO), our context switch rate goes from 268k to 277k ctx switches/sec (tested on a 4GHz POWER6). Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/vdso.h | 3 +-- arch/powerpc/kernel/vdso.c | 7 ++++++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 26fc449bd98..dc0419b66f1 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -7,9 +7,8 @@ #define VDSO32_LBASE 0x100000 #define VDSO64_LBASE 0x100000 -/* Default map addresses */ +/* Default map addresses for 32bit vDSO */ #define VDSO32_MBASE VDSO32_LBASE -#define VDSO64_MBASE VDSO64_LBASE #define VDSO_VERSION_STRING LINUX_2.6.15 diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index ad06d5c75b1..a0abce251d0 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -203,7 +203,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } else { vdso_pagelist = vdso64_pagelist; vdso_pages = vdso64_pages; - vdso_base = VDSO64_MBASE; + /* + * On 64bit we don't have a preferred map address. This + * allows get_unmapped_area to find an area near other mmaps + * and most likely share a SLB entry. + */ + vdso_base = 0; } #else vdso_pagelist = vdso32_pagelist; -- cgit v1.2.3-70-g09d2 From 8aa34ab8b2dc96ca6c4feecfb87ed13f0d40ef98 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:52:52 +0000 Subject: powerpc: Rename exception.h to exception-64s.h The file include/asm/exception.h contains definitions that are specific to exception handling on 64-bit server type processors. This renames the file to exception-64s.h to reflect that fact and avoid confusion. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64s.h | 279 +++++++++++++++++++++++++++++ arch/powerpc/include/asm/exception.h | 279 ----------------------------- arch/powerpc/kernel/exceptions-64s.S | 2 + arch/powerpc/kernel/head_64.S | 1 - arch/powerpc/platforms/iseries/exception.h | 2 +- 5 files changed, 282 insertions(+), 281 deletions(-) create mode 100644 arch/powerpc/include/asm/exception-64s.h delete mode 100644 arch/powerpc/include/asm/exception.h (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h new file mode 100644 index 00000000000..d3d4534e3c7 --- /dev/null +++ b/arch/powerpc/include/asm/exception-64s.h @@ -0,0 +1,279 @@ +#ifndef _ASM_POWERPC_EXCEPTION_H +#define _ASM_POWERPC_EXCEPTION_H +/* + * Extracted from head_64.S + * + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP + * Copyright (C) 1996 Cort Dougan + * Adapted for Power Macintosh by Paul Mackerras. + * Low-level exception handlers and MMU support + * rewritten by Paul Mackerras. + * Copyright (C) 1996 Paul Mackerras. + * + * Adapted for 64bit PowerPC by Dave Engebretsen, Peter Bergner, and + * Mike Corrigan {engebret|bergner|mikejc}@us.ibm.com + * + * This file contains the low-level support and setup for the + * PowerPC-64 platform, including trap and interrupt dispatch. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* + * The following macros define the code that appears as + * the prologue to each of the exception handlers. They + * are split into two parts to allow a single kernel binary + * to be used for pSeries and iSeries. + * + * We make as much of the exception code common between native + * exception handlers (including pSeries LPAR) and iSeries LPAR + * implementations as possible. + */ + +#define EX_R9 0 +#define EX_R10 8 +#define EX_R11 16 +#define EX_R12 24 +#define EX_R13 32 +#define EX_SRR0 40 +#define EX_DAR 48 +#define EX_DSISR 56 +#define EX_CCR 60 +#define EX_R3 64 +#define EX_LR 72 + +/* + * We're short on space and time in the exception prolog, so we can't + * use the normal SET_REG_IMMEDIATE macro. Normally we just need the + * low halfword of the address, but for Kdump we need the whole low + * word. + */ +#define LOAD_HANDLER(reg, label) \ + addi reg,reg,(label)-_stext; /* virt addr of handler ... */ + +#define EXCEPTION_PROLOG_1(area) \ + mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ + std r9,area+EX_R9(r13); /* save r9 - r12 */ \ + std r10,area+EX_R10(r13); \ + std r11,area+EX_R11(r13); \ + std r12,area+EX_R12(r13); \ + mfspr r9,SPRN_SPRG1; \ + std r9,area+EX_R13(r13); \ + mfcr r9 + +#define EXCEPTION_PROLOG_PSERIES(area, label) \ + EXCEPTION_PROLOG_1(area); \ + ld r12,PACAKBASE(r13); /* get high part of &label */ \ + ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ + mfspr r11,SPRN_SRR0; /* save SRR0 */ \ + LOAD_HANDLER(r12,label) \ + mtspr SPRN_SRR0,r12; \ + mfspr r12,SPRN_SRR1; /* and SRR1 */ \ + mtspr SPRN_SRR1,r10; \ + rfid; \ + b . /* prevent speculative execution */ + +/* + * The common exception prolog is used for all except a few exceptions + * such as a segment miss on a kernel address. We have to be prepared + * to take another exception from the point where we first touch the + * kernel stack onwards. + * + * On entry r13 points to the paca, r9-r13 are saved in the paca, + * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and + * SRR1, and relocation is on. + */ +#define EXCEPTION_PROLOG_COMMON(n, area) \ + andi. r10,r12,MSR_PR; /* See if coming from user */ \ + mr r10,r1; /* Save r1 */ \ + subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \ + beq- 1f; \ + ld r1,PACAKSAVE(r13); /* kernel stack to use */ \ +1: cmpdi cr1,r1,0; /* check if r1 is in userspace */ \ + bge- cr1,2f; /* abort if it is */ \ + b 3f; \ +2: li r1,(n); /* will be reloaded later */ \ + sth r1,PACA_TRAP_SAVE(r13); \ + b bad_stack; \ +3: std r9,_CCR(r1); /* save CR in stackframe */ \ + std r11,_NIP(r1); /* save SRR0 in stackframe */ \ + std r12,_MSR(r1); /* save SRR1 in stackframe */ \ + std r10,0(r1); /* make stack chain pointer */ \ + std r0,GPR0(r1); /* save r0 in stackframe */ \ + std r10,GPR1(r1); /* save r1 in stackframe */ \ + ACCOUNT_CPU_USER_ENTRY(r9, r10); \ + std r2,GPR2(r1); /* save r2 in stackframe */ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ + ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ + ld r10,area+EX_R10(r13); \ + std r9,GPR9(r1); \ + std r10,GPR10(r1); \ + ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \ + ld r10,area+EX_R12(r13); \ + ld r11,area+EX_R13(r13); \ + std r9,GPR11(r1); \ + std r10,GPR12(r1); \ + std r11,GPR13(r1); \ + ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ + mflr r9; /* save LR in stackframe */ \ + std r9,_LINK(r1); \ + mfctr r10; /* save CTR in stackframe */ \ + std r10,_CTR(r1); \ + lbz r10,PACASOFTIRQEN(r13); \ + mfspr r11,SPRN_XER; /* save XER in stackframe */ \ + std r10,SOFTE(r1); \ + std r11,_XER(r1); \ + li r9,(n)+1; \ + std r9,_TRAP(r1); /* set trap number */ \ + li r10,0; \ + ld r11,exception_marker@toc(r2); \ + std r10,RESULT(r1); /* clear regs->result */ \ + std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ + +/* + * Exception vectors. + */ +#define STD_EXCEPTION_PSERIES(n, label) \ + . = n; \ + .globl label##_pSeries; \ +label##_pSeries: \ + HMT_MEDIUM; \ + mtspr SPRN_SPRG1,r13; /* save r13 */ \ + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) + +#define HSTD_EXCEPTION_PSERIES(n, label) \ + . = n; \ + .globl label##_pSeries; \ +label##_pSeries: \ + HMT_MEDIUM; \ + mtspr SPRN_SPRG1,r20; /* save r20 */ \ + mfspr r20,SPRN_HSRR0; /* copy HSRR0 to SRR0 */ \ + mtspr SPRN_SRR0,r20; \ + mfspr r20,SPRN_HSRR1; /* copy HSRR0 to SRR0 */ \ + mtspr SPRN_SRR1,r20; \ + mfspr r20,SPRN_SPRG1; /* restore r20 */ \ + mtspr SPRN_SPRG1,r13; /* save r13 */ \ + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) + + +#define MASKABLE_EXCEPTION_PSERIES(n, label) \ + . = n; \ + .globl label##_pSeries; \ +label##_pSeries: \ + HMT_MEDIUM; \ + mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ + std r9,PACA_EXGEN+EX_R9(r13); /* save r9, r10 */ \ + std r10,PACA_EXGEN+EX_R10(r13); \ + lbz r10,PACASOFTIRQEN(r13); \ + mfcr r9; \ + cmpwi r10,0; \ + beq masked_interrupt; \ + mfspr r10,SPRN_SPRG1; \ + std r10,PACA_EXGEN+EX_R13(r13); \ + std r11,PACA_EXGEN+EX_R11(r13); \ + std r12,PACA_EXGEN+EX_R12(r13); \ + ld r12,PACAKBASE(r13); /* get high part of &label */ \ + ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ + mfspr r11,SPRN_SRR0; /* save SRR0 */ \ + LOAD_HANDLER(r12,label##_common) \ + mtspr SPRN_SRR0,r12; \ + mfspr r12,SPRN_SRR1; /* and SRR1 */ \ + mtspr SPRN_SRR1,r10; \ + rfid; \ + b . /* prevent speculative execution */ + +#ifdef CONFIG_PPC_ISERIES +#define DISABLE_INTS \ + li r11,0; \ + stb r11,PACASOFTIRQEN(r13); \ +BEGIN_FW_FTR_SECTION; \ + stb r11,PACAHARDIRQEN(r13); \ +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES); \ + TRACE_DISABLE_INTS; \ +BEGIN_FW_FTR_SECTION; \ + mfmsr r10; \ + ori r10,r10,MSR_EE; \ + mtmsrd r10,1; \ +END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) +#else +#define DISABLE_INTS \ + li r11,0; \ + stb r11,PACASOFTIRQEN(r13); \ + stb r11,PACAHARDIRQEN(r13); \ + TRACE_DISABLE_INTS +#endif /* CONFIG_PPC_ISERIES */ + +#define ENABLE_INTS \ + ld r12,_MSR(r1); \ + mfmsr r11; \ + rlwimi r11,r12,0,MSR_EE; \ + mtmsrd r11,1 + +#define STD_EXCEPTION_COMMON(trap, label, hdlr) \ + .align 7; \ + .globl label##_common; \ +label##_common: \ + EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ + DISABLE_INTS; \ + bl .save_nvgprs; \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + bl hdlr; \ + b .ret_from_except + +/* + * Like STD_EXCEPTION_COMMON, but for exceptions that can occur + * in the idle task and therefore need the special idle handling. + */ +#define STD_EXCEPTION_COMMON_IDLE(trap, label, hdlr) \ + .align 7; \ + .globl label##_common; \ +label##_common: \ + EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ + FINISH_NAP; \ + DISABLE_INTS; \ + bl .save_nvgprs; \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + bl hdlr; \ + b .ret_from_except + +#define STD_EXCEPTION_COMMON_LITE(trap, label, hdlr) \ + .align 7; \ + .globl label##_common; \ +label##_common: \ + EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ + FINISH_NAP; \ + DISABLE_INTS; \ +BEGIN_FTR_SECTION \ + bl .ppc64_runlatch_on; \ +END_FTR_SECTION_IFSET(CPU_FTR_CTRL) \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + bl hdlr; \ + b .ret_from_except_lite + +/* + * When the idle code in power4_idle puts the CPU into NAP mode, + * it has to do so in a loop, and relies on the external interrupt + * and decrementer interrupt entry code to get it out of the loop. + * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags + * to signal that it is in the loop and needs help to get out. + */ +#ifdef CONFIG_PPC_970_NAP +#define FINISH_NAP \ +BEGIN_FTR_SECTION \ + clrrdi r11,r1,THREAD_SHIFT; \ + ld r9,TI_LOCAL_FLAGS(r11); \ + andi. r10,r9,_TLF_NAPPING; \ + bnel power4_fixup_nap; \ +END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) +#else +#define FINISH_NAP +#endif + +#endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/exception.h b/arch/powerpc/include/asm/exception.h deleted file mode 100644 index d3d4534e3c7..00000000000 --- a/arch/powerpc/include/asm/exception.h +++ /dev/null @@ -1,279 +0,0 @@ -#ifndef _ASM_POWERPC_EXCEPTION_H -#define _ASM_POWERPC_EXCEPTION_H -/* - * Extracted from head_64.S - * - * PowerPC version - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP - * Copyright (C) 1996 Cort Dougan - * Adapted for Power Macintosh by Paul Mackerras. - * Low-level exception handlers and MMU support - * rewritten by Paul Mackerras. - * Copyright (C) 1996 Paul Mackerras. - * - * Adapted for 64bit PowerPC by Dave Engebretsen, Peter Bergner, and - * Mike Corrigan {engebret|bergner|mikejc}@us.ibm.com - * - * This file contains the low-level support and setup for the - * PowerPC-64 platform, including trap and interrupt dispatch. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -/* - * The following macros define the code that appears as - * the prologue to each of the exception handlers. They - * are split into two parts to allow a single kernel binary - * to be used for pSeries and iSeries. - * - * We make as much of the exception code common between native - * exception handlers (including pSeries LPAR) and iSeries LPAR - * implementations as possible. - */ - -#define EX_R9 0 -#define EX_R10 8 -#define EX_R11 16 -#define EX_R12 24 -#define EX_R13 32 -#define EX_SRR0 40 -#define EX_DAR 48 -#define EX_DSISR 56 -#define EX_CCR 60 -#define EX_R3 64 -#define EX_LR 72 - -/* - * We're short on space and time in the exception prolog, so we can't - * use the normal SET_REG_IMMEDIATE macro. Normally we just need the - * low halfword of the address, but for Kdump we need the whole low - * word. - */ -#define LOAD_HANDLER(reg, label) \ - addi reg,reg,(label)-_stext; /* virt addr of handler ... */ - -#define EXCEPTION_PROLOG_1(area) \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ - std r9,area+EX_R9(r13); /* save r9 - r12 */ \ - std r10,area+EX_R10(r13); \ - std r11,area+EX_R11(r13); \ - std r12,area+EX_R12(r13); \ - mfspr r9,SPRN_SPRG1; \ - std r9,area+EX_R13(r13); \ - mfcr r9 - -#define EXCEPTION_PROLOG_PSERIES(area, label) \ - EXCEPTION_PROLOG_1(area); \ - ld r12,PACAKBASE(r13); /* get high part of &label */ \ - ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ - mfspr r11,SPRN_SRR0; /* save SRR0 */ \ - LOAD_HANDLER(r12,label) \ - mtspr SPRN_SRR0,r12; \ - mfspr r12,SPRN_SRR1; /* and SRR1 */ \ - mtspr SPRN_SRR1,r10; \ - rfid; \ - b . /* prevent speculative execution */ - -/* - * The common exception prolog is used for all except a few exceptions - * such as a segment miss on a kernel address. We have to be prepared - * to take another exception from the point where we first touch the - * kernel stack onwards. - * - * On entry r13 points to the paca, r9-r13 are saved in the paca, - * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and - * SRR1, and relocation is on. - */ -#define EXCEPTION_PROLOG_COMMON(n, area) \ - andi. r10,r12,MSR_PR; /* See if coming from user */ \ - mr r10,r1; /* Save r1 */ \ - subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \ - beq- 1f; \ - ld r1,PACAKSAVE(r13); /* kernel stack to use */ \ -1: cmpdi cr1,r1,0; /* check if r1 is in userspace */ \ - bge- cr1,2f; /* abort if it is */ \ - b 3f; \ -2: li r1,(n); /* will be reloaded later */ \ - sth r1,PACA_TRAP_SAVE(r13); \ - b bad_stack; \ -3: std r9,_CCR(r1); /* save CR in stackframe */ \ - std r11,_NIP(r1); /* save SRR0 in stackframe */ \ - std r12,_MSR(r1); /* save SRR1 in stackframe */ \ - std r10,0(r1); /* make stack chain pointer */ \ - std r0,GPR0(r1); /* save r0 in stackframe */ \ - std r10,GPR1(r1); /* save r1 in stackframe */ \ - ACCOUNT_CPU_USER_ENTRY(r9, r10); \ - std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ - ld r10,area+EX_R10(r13); \ - std r9,GPR9(r1); \ - std r10,GPR10(r1); \ - ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \ - ld r10,area+EX_R12(r13); \ - ld r11,area+EX_R13(r13); \ - std r9,GPR11(r1); \ - std r10,GPR12(r1); \ - std r11,GPR13(r1); \ - ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ - mflr r9; /* save LR in stackframe */ \ - std r9,_LINK(r1); \ - mfctr r10; /* save CTR in stackframe */ \ - std r10,_CTR(r1); \ - lbz r10,PACASOFTIRQEN(r13); \ - mfspr r11,SPRN_XER; /* save XER in stackframe */ \ - std r10,SOFTE(r1); \ - std r11,_XER(r1); \ - li r9,(n)+1; \ - std r9,_TRAP(r1); /* set trap number */ \ - li r10,0; \ - ld r11,exception_marker@toc(r2); \ - std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ - -/* - * Exception vectors. - */ -#define STD_EXCEPTION_PSERIES(n, label) \ - . = n; \ - .globl label##_pSeries; \ -label##_pSeries: \ - HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) - -#define HSTD_EXCEPTION_PSERIES(n, label) \ - . = n; \ - .globl label##_pSeries; \ -label##_pSeries: \ - HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r20; /* save r20 */ \ - mfspr r20,SPRN_HSRR0; /* copy HSRR0 to SRR0 */ \ - mtspr SPRN_SRR0,r20; \ - mfspr r20,SPRN_HSRR1; /* copy HSRR0 to SRR0 */ \ - mtspr SPRN_SRR1,r20; \ - mfspr r20,SPRN_SPRG1; /* restore r20 */ \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) - - -#define MASKABLE_EXCEPTION_PSERIES(n, label) \ - . = n; \ - .globl label##_pSeries; \ -label##_pSeries: \ - HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ - std r9,PACA_EXGEN+EX_R9(r13); /* save r9, r10 */ \ - std r10,PACA_EXGEN+EX_R10(r13); \ - lbz r10,PACASOFTIRQEN(r13); \ - mfcr r9; \ - cmpwi r10,0; \ - beq masked_interrupt; \ - mfspr r10,SPRN_SPRG1; \ - std r10,PACA_EXGEN+EX_R13(r13); \ - std r11,PACA_EXGEN+EX_R11(r13); \ - std r12,PACA_EXGEN+EX_R12(r13); \ - ld r12,PACAKBASE(r13); /* get high part of &label */ \ - ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ - mfspr r11,SPRN_SRR0; /* save SRR0 */ \ - LOAD_HANDLER(r12,label##_common) \ - mtspr SPRN_SRR0,r12; \ - mfspr r12,SPRN_SRR1; /* and SRR1 */ \ - mtspr SPRN_SRR1,r10; \ - rfid; \ - b . /* prevent speculative execution */ - -#ifdef CONFIG_PPC_ISERIES -#define DISABLE_INTS \ - li r11,0; \ - stb r11,PACASOFTIRQEN(r13); \ -BEGIN_FW_FTR_SECTION; \ - stb r11,PACAHARDIRQEN(r13); \ -END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES); \ - TRACE_DISABLE_INTS; \ -BEGIN_FW_FTR_SECTION; \ - mfmsr r10; \ - ori r10,r10,MSR_EE; \ - mtmsrd r10,1; \ -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#else -#define DISABLE_INTS \ - li r11,0; \ - stb r11,PACASOFTIRQEN(r13); \ - stb r11,PACAHARDIRQEN(r13); \ - TRACE_DISABLE_INTS -#endif /* CONFIG_PPC_ISERIES */ - -#define ENABLE_INTS \ - ld r12,_MSR(r1); \ - mfmsr r11; \ - rlwimi r11,r12,0,MSR_EE; \ - mtmsrd r11,1 - -#define STD_EXCEPTION_COMMON(trap, label, hdlr) \ - .align 7; \ - .globl label##_common; \ -label##_common: \ - EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ - DISABLE_INTS; \ - bl .save_nvgprs; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b .ret_from_except - -/* - * Like STD_EXCEPTION_COMMON, but for exceptions that can occur - * in the idle task and therefore need the special idle handling. - */ -#define STD_EXCEPTION_COMMON_IDLE(trap, label, hdlr) \ - .align 7; \ - .globl label##_common; \ -label##_common: \ - EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ - FINISH_NAP; \ - DISABLE_INTS; \ - bl .save_nvgprs; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b .ret_from_except - -#define STD_EXCEPTION_COMMON_LITE(trap, label, hdlr) \ - .align 7; \ - .globl label##_common; \ -label##_common: \ - EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ - FINISH_NAP; \ - DISABLE_INTS; \ -BEGIN_FTR_SECTION \ - bl .ppc64_runlatch_on; \ -END_FTR_SECTION_IFSET(CPU_FTR_CTRL) \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b .ret_from_except_lite - -/* - * When the idle code in power4_idle puts the CPU into NAP mode, - * it has to do so in a loop, and relies on the external interrupt - * and decrementer interrupt entry code to get it out of the loop. - * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags - * to signal that it is in the loop and needs help to get out. - */ -#ifdef CONFIG_PPC_970_NAP -#define FINISH_NAP \ -BEGIN_FTR_SECTION \ - clrrdi r11,r1,THREAD_SHIFT; \ - ld r9,TI_LOCAL_FLAGS(r11); \ - andi. r10,r9,_TLF_NAPPING; \ - bnel power4_fixup_nap; \ -END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) -#else -#define FINISH_NAP -#endif - -#endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index eb898112e57..72644cf22ca 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -12,6 +12,8 @@ * */ +#include + /* * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 012505ebd9f..9196ef36d43 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -36,7 +36,6 @@ #include #include #include -#include #include /* The physical memory is layed out such that the secondary processor diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h index ced45a8fa1a..e26eb86ac73 100644 --- a/arch/powerpc/platforms/iseries/exception.h +++ b/arch/powerpc/platforms/iseries/exception.h @@ -24,7 +24,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include +#include #define EXCEPTION_PROLOG_ISERIES_1 \ mfmsr r10; \ -- cgit v1.2.3-70-g09d2 From ee43eb788b3a06425fffb912677e2e1c8b00dd3b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:52:54 +0000 Subject: powerpc: Use names rather than numbers for SPRGs (v2) The kernel uses SPRG registers for various purposes, typically in low level assembly code as scratch registers or to hold per-cpu global infos such as the PACA or the current thread_info pointer. We want to be able to easily shuffle the usage of those registers as some implementations have specific constraints realted to some of them, for example, some have userspace readable aliases, etc.. and the current choice isn't always the best. This patch should not change any code generation, and replaces the usage of SPRN_SPRGn everywhere in the kernel with a named replacement and adds documentation next to the definition of the names as to what those are used for on each processor family. The only parts that still use the original numbers are bits of KVM or suspend/resume code that just blindly needs to save/restore all the SPRGs. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64s.h | 18 ++--- arch/powerpc/include/asm/reg.h | 113 ++++++++++++++++++++++++++ arch/powerpc/kernel/cpu_setup_6xx.S | 2 +- arch/powerpc/kernel/entry_32.S | 20 ++--- arch/powerpc/kernel/entry_64.S | 4 +- arch/powerpc/kernel/exceptions-64s.S | 44 ++++------ arch/powerpc/kernel/fpu.S | 2 +- arch/powerpc/kernel/head_32.S | 40 +++++----- arch/powerpc/kernel/head_40x.S | 124 ++++++++++++++--------------- arch/powerpc/kernel/head_44x.S | 56 ++++++------- arch/powerpc/kernel/head_64.S | 14 ++-- arch/powerpc/kernel/head_8xx.S | 13 +-- arch/powerpc/kernel/head_booke.h | 50 +++++------- arch/powerpc/kernel/head_fsl_booke.S | 60 +++++++------- arch/powerpc/kernel/setup_64.c | 4 +- arch/powerpc/kernel/vector.S | 2 +- arch/powerpc/kvm/booke_interrupts.S | 18 ++--- arch/powerpc/mm/hash_low_32.S | 4 +- arch/powerpc/platforms/iseries/exception.S | 28 +++---- arch/powerpc/platforms/iseries/exception.h | 4 +- 20 files changed, 356 insertions(+), 264 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index d3d4534e3c7..773e380b5fe 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -57,12 +57,12 @@ addi reg,reg,(label)-_stext; /* virt addr of handler ... */ #define EXCEPTION_PROLOG_1(area) \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ + mfspr r13,SPRN_SPRG_PACA; /* get paca address into r13 */ \ std r9,area+EX_R9(r13); /* save r9 - r12 */ \ std r10,area+EX_R10(r13); \ std r11,area+EX_R11(r13); \ std r12,area+EX_R12(r13); \ - mfspr r9,SPRN_SPRG1; \ + mfspr r9,SPRN_SPRG_SCRATCH0; \ std r9,area+EX_R13(r13); \ mfcr r9 @@ -144,7 +144,7 @@ .globl label##_pSeries; \ label##_pSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) #define HSTD_EXCEPTION_PSERIES(n, label) \ @@ -152,13 +152,13 @@ label##_pSeries: \ .globl label##_pSeries; \ label##_pSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r20; /* save r20 */ \ + mtspr SPRN_SPRG_SCRATCH0,r20; /* save r20 */ \ mfspr r20,SPRN_HSRR0; /* copy HSRR0 to SRR0 */ \ mtspr SPRN_SRR0,r20; \ mfspr r20,SPRN_HSRR1; /* copy HSRR0 to SRR0 */ \ mtspr SPRN_SRR1,r20; \ - mfspr r20,SPRN_SPRG1; /* restore r20 */ \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mfspr r20,SPRN_SPRG_SCRATCH0; /* restore r20 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) @@ -167,15 +167,15 @@ label##_pSeries: \ .globl label##_pSeries; \ label##_pSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ + mfspr r13,SPRN_SPRG_PACA; /* get paca address into r13 */ \ std r9,PACA_EXGEN+EX_R9(r13); /* save r9, r10 */ \ std r10,PACA_EXGEN+EX_R10(r13); \ lbz r10,PACASOFTIRQEN(r13); \ mfcr r9; \ cmpwi r10,0; \ beq masked_interrupt; \ - mfspr r10,SPRN_SPRG1; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ std r10,PACA_EXGEN+EX_R13(r13); \ std r11,PACA_EXGEN+EX_R11(r13); \ std r12,PACA_EXGEN+EX_R12(r13); \ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1170267736d..a8179cc99ac 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -645,6 +645,119 @@ #define MMCR0_PMC2_LOADMISSTIME 0x5 #endif +/* + * SPRG usage: + * + * All 64-bit: + * - SPRG3 stores PACA pointer + * + * 64-bit server: + * - SPRG0 unused (reserved for HV on Power4) + * - SPRG1 scratch for exception vectors + * - SPRG2 scratch for exception vectors + * + * All 32-bit: + * - SPRG3 current thread_info pointer + * (virtual on BookE, physical on others) + * + * 32-bit classic: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors + * - SPRG2 indicator that we are in RTAS + * - SPRG4 (603 only) pseudo TLB LRU data + * + * 32-bit 40x: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors + * - SPRG2 scratch for exception vectors + * - SPRG4 scratch for exception vectors (not 403) + * - SPRG5 scratch for exception vectors (not 403) + * - SPRG6 scratch for exception vectors (not 403) + * - SPRG7 scratch for exception vectors (not 403) + * + * 32-bit 440 and FSL BookE: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors (*) + * - SPRG2 scratch for crit interrupts handler + * - SPRG4 scratch for exception vectors + * - SPRG5 scratch for exception vectors + * - SPRG6 scratch for machine check handler + * - SPRG7 scratch for exception vectors + * - SPRG9 scratch for debug vectors (e500 only) + * + * Additionally, BookE separates "read" and "write" + * of those registers. That allows to use the userspace + * readable variant for reads, which can avoid a fault + * with KVM type virtualization. + * + * (*) Under KVM, the host SPRG1 is used to point to + * the current VCPU data structure + * + * 32-bit 8xx: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors + * - SPRG2 apparently unused but initialized + * + */ +#ifdef CONFIG_PPC64 +#define SPRN_SPRG_PACA SPRN_SPRG3 +#else +#define SPRN_SPRG_THREAD SPRN_SPRG3 +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG1 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG2 +#endif + +#ifdef CONFIG_PPC_BOOK3S_32 +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_RTAS SPRN_SPRG2 +#define SPRN_SPRG_603_LRU SPRN_SPRG4 +#endif + +#ifdef CONFIG_40x +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_SCRATCH2 SPRN_SPRG2 +#define SPRN_SPRG_SCRATCH3 SPRN_SPRG4 +#define SPRN_SPRG_SCRATCH4 SPRN_SPRG5 +#define SPRN_SPRG_SCRATCH5 SPRN_SPRG6 +#define SPRN_SPRG_SCRATCH6 SPRN_SPRG7 +#endif + +#ifdef CONFIG_BOOKE +#define SPRN_SPRG_RSCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_WSCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_RSCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_WSCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_RSCRATCH_CRIT SPRN_SPRG2 +#define SPRN_SPRG_WSCRATCH_CRIT SPRN_SPRG2 +#define SPRN_SPRG_RSCRATCH2 SPRN_SPRG4R +#define SPRN_SPRG_WSCRATCH2 SPRN_SPRG4W +#define SPRN_SPRG_RSCRATCH3 SPRN_SPRG5R +#define SPRN_SPRG_WSCRATCH3 SPRN_SPRG5W +#define SPRN_SPRG_RSCRATCH_MC SPRN_SPRG6R +#define SPRN_SPRG_WSCRATCH_MC SPRN_SPRG6W +#define SPRN_SPRG_RSCRATCH4 SPRN_SPRG7R +#define SPRN_SPRG_WSCRATCH4 SPRN_SPRG7W +#ifdef CONFIG_E200 +#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG6R +#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG6W +#else +#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG9 +#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG9 +#endif +#define SPRN_SPRG_RVCPU SPRN_SPRG1 +#define SPRN_SPRG_WVCPU SPRN_SPRG1 +#endif + +#ifdef CONFIG_8xx +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 +#endif + /* * An mtfsf instruction with the L bit set. On CPUs that support this a * full 64bits of FPSCR is restored and on other CPUs the L bit is ignored. diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index 1e9949e6885..55cba4a8a95 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -21,7 +21,7 @@ _GLOBAL(__setup_cpu_603) mflr r4 BEGIN_MMU_FTR_SECTION li r10,0 - mtspr SPRN_SPRG4,r10 /* init SW LRU tracking */ + mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) BEGIN_FTR_SECTION bl __init_fpu_registers diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3cadba60a4b..1175a8539e6 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -88,7 +88,7 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,_SRR1(r11) - mfspr r8,SPRN_SPRG3 + mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,SAVED_KSP_LIMIT(r11) rlwimi r0,r1,0,0,(31-THREAD_SHIFT) @@ -108,7 +108,7 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,crit_srr1@l(0) - mfspr r8,SPRN_SPRG3 + mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,saved_ksp_limit@l(0) rlwimi r0,r1,0,0,(31-THREAD_SHIFT) @@ -138,7 +138,7 @@ transfer_to_handler: mfspr r2,SPRN_XER stw r12,_CTR(r11) stw r2,_XER(r11) - mfspr r12,SPRN_SPRG3 + mfspr r12,SPRN_SPRG_THREAD addi r2,r12,-THREAD tovirt(r2,r2) /* set r2 to current */ beq 2f /* if from user, fix up THREAD.regs */ @@ -680,7 +680,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) tophys(r0,r4) CLR_TOP32(r0) - mtspr SPRN_SPRG3,r0 /* Update current THREAD phys addr */ + mtspr SPRN_SPRG_THREAD,r0 /* Update current THREAD phys addr */ lwz r1,KSP(r4) /* Load new stack pointer */ /* save the old current 'last' for return value */ @@ -1057,7 +1057,7 @@ exc_exit_restart_end: #ifdef CONFIG_40x .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lis r10,saved_ksp_limit@ha; lwz r10,saved_ksp_limit@l(r10); tovirt(r9,r9); @@ -1074,7 +1074,7 @@ ret_from_crit_exc: #ifdef CONFIG_BOOKE .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); @@ -1083,7 +1083,7 @@ ret_from_crit_exc: .globl ret_from_debug_exc ret_from_debug_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) lwz r9,THREAD_INFO-THREAD(r9) @@ -1097,7 +1097,7 @@ ret_from_debug_exc: .globl ret_from_mcheck_exc ret_from_mcheck_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); @@ -1255,7 +1255,7 @@ _GLOBAL(enter_rtas) MTMSRD(r0) /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 - mtspr SPRN_SPRG2,r7 + mtspr SPRN_SPRG_RTAS,r7 mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI @@ -1265,7 +1265,7 @@ _GLOBAL(enter_rtas) FIX_SRR1(r9,r0) addi r1,r1,INT_FRAME_SIZE li r0,0 - mtspr SPRN_SPRG2,r0 + mtspr SPRN_SPRG_RTAS,r0 mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI /* return to caller */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 43e073477c3..dbf0e311561 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -762,7 +762,7 @@ _GLOBAL(enter_rtas) _STATIC(rtas_return_loc) /* relocation is off at this point */ - mfspr r4,SPRN_SPRG3 /* Get PACA */ + mfspr r4,SPRN_SPRG_PACA /* Get PACA */ clrldi r4,r4,2 /* convert to realmode address */ bcl 20,31,$+4 @@ -793,7 +793,7 @@ _STATIC(rtas_restore_regs) REST_8GPRS(14, r1) /* Restore the non-volatiles */ REST_10GPRS(22, r1) /* ditto */ - mfspr r13,SPRN_SPRG3 + mfspr r13,SPRN_SPRG_PACA ld r4,_CCR(r1) mtcr r4 diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 72644cf22ca..4e9640cc056 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -24,18 +24,6 @@ * 0x8000 - : Early init and support code */ - -/* - * SPRG Usage - * - * Register Definition - * - * SPRG0 reserved for hypervisor - * SPRG1 temp - used to save gpr - * SPRG2 temp - used to save gpr - * SPRG3 virt addr of paca - */ - /* * This is the start of the interrupt handlers for pSeries * This code runs with relocation off. @@ -53,16 +41,16 @@ __start_interrupts: . = 0x200 _machine_check_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) . = 0x300 .globl data_access_pSeries data_access_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 + mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG2,r12 + mtspr SPRN_SPRG_SCRATCH1,r12 mfspr r13,SPRN_DAR mfspr r12,SPRN_DSISR srdi r13,r13,60 @@ -71,7 +59,7 @@ BEGIN_FTR_SECTION cmpwi r13,0x2c beq do_stab_bolted_pSeries mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 END_FTR_SECTION_IFCLR(CPU_FTR_SLB) EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common) @@ -79,8 +67,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB) .globl data_access_slb_pSeries data_access_slb_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ @@ -93,7 +81,7 @@ data_access_slb_pSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) mfspr r12,SPRN_SRR1 /* and SRR1 */ #ifndef CONFIG_RELOCATABLE @@ -117,8 +105,8 @@ data_access_slb_pSeries: .globl instruction_access_slb_pSeries instruction_access_slb_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ @@ -131,7 +119,7 @@ instruction_access_slb_pSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) mfspr r12,SPRN_SRR1 /* and SRR1 */ #ifndef CONFIG_RELOCATABLE @@ -161,7 +149,7 @@ BEGIN_FTR_SECTION beq- 1f END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) mr r9,r13 - mfspr r13,SPRN_SPRG3 + mfspr r13,SPRN_SPRG_PACA mfspr r11,SPRN_SRR0 ld r12,PACAKBASE(r13) ld r10,PACAKMSR(r13) @@ -230,14 +218,14 @@ masked_interrupt: rotldi r10,r10,16 mtspr SPRN_SRR1,r10 ld r10,PACA_EXGEN+EX_R10(r13) - mfspr r13,SPRN_SPRG1 + mfspr r13,SPRN_SPRG_SCRATCH0 rfid b . .align 7 do_stab_bolted_pSeries: mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) #ifdef CONFIG_PPC_PSERIES @@ -248,14 +236,14 @@ do_stab_bolted_pSeries: .align 7 system_reset_fwnmi: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) .globl machine_check_fwnmi .align 7 machine_check_fwnmi: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) #endif /* CONFIG_PPC_PSERIES */ @@ -270,7 +258,7 @@ slb_miss_user_pseries: std r10,PACA_EXGEN+EX_R10(r13) std r11,PACA_EXGEN+EX_R11(r13) std r12,PACA_EXGEN+EX_R12(r13) - mfspr r10,SPRG1 + mfspr r10,SPRG_SCRATCH0 ld r11,PACA_EXSLB+EX_R9(r13) ld r12,PACA_EXSLB+EX_R3(r13) std r10,PACA_EXGEN+EX_R13(r13) diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 2436df33c6f..fc8f5b14019 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -91,7 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif /* CONFIG_SMP */ /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index fc213294275..829c3fe7c5a 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -244,8 +244,8 @@ __secondary_hold_acknowledge: * task's thread_struct. */ #define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; \ - mtspr SPRN_SPRG1,r11; \ + mtspr SPRN_SPRG_SCRATCH0,r10; \ + mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; \ EXCEPTION_PROLOG_1; \ EXCEPTION_PROLOG_2 @@ -255,7 +255,7 @@ __secondary_hold_acknowledge: andi. r11,r11,MSR_PR; \ tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ - mfspr r11,SPRN_SPRG3; \ + mfspr r11,SPRN_SPRG_THREAD; \ lwz r11,THREAD_INFO-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ @@ -267,9 +267,9 @@ __secondary_hold_acknowledge: stw r10,_CCR(r11); /* save registers */ \ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ @@ -355,11 +355,11 @@ i##n: \ * -- paulus. */ . = 0x200 - mtspr SPRN_SPRG0,r10 - mtspr SPRN_SPRG1,r11 + mtspr SPRN_SPRG_SCRATCH0,r10 + mtspr SPRN_SPRG_SCRATCH1,r11 mfcr r10 #ifdef CONFIG_PPC_CHRP - mfspr r11,SPRN_SPRG2 + mfspr r11,SPRN_SPRG_RTAS cmpwi 0,r11,0 bne 7f #endif /* CONFIG_PPC_CHRP */ @@ -367,7 +367,7 @@ i##n: \ 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP - mfspr r4,SPRN_SPRG2 + mfspr r4,SPRN_SPRG_RTAS cmpwi cr1,r4,0 bne cr1,1f #endif @@ -485,7 +485,7 @@ InstructionTLBMiss: mfspr r3,SPRN_IMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ lwz r2,PGDIR(r2) bge- 112f @@ -559,7 +559,7 @@ DataLoadTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ lwz r2,PGDIR(r2) bge- 112f @@ -598,12 +598,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtcrf 0x80,r2 BEGIN_MMU_FTR_SECTION li r0,1 - mfspr r1,SPRN_SPRG4 + mfspr r1,SPRN_SPRG_603_LRU rlwinm r2,r3,20,27,31 /* Get Address bits 15:19 */ slw r0,r0,r2 xor r1,r0,r1 srw r0,r1,r2 - mtspr SPRN_SPRG4,r1 + mtspr SPRN_SPRG_603_LRU,r1 mfspr r2,SPRN_SRR1 rlwimi r2,r0,31-14,14,14 mtspr SPRN_SRR1,r2 @@ -643,7 +643,7 @@ DataStoreTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */ lwz r2,PGDIR(r2) bge- 112f @@ -678,12 +678,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtcrf 0x80,r2 BEGIN_MMU_FTR_SECTION li r0,1 - mfspr r1,SPRN_SPRG4 + mfspr r1,SPRN_SPRG_603_LRU rlwinm r2,r3,20,27,31 /* Get Address bits 15:19 */ slw r0,r0,r2 xor r1,r0,r1 srw r0,r1,r2 - mtspr SPRN_SPRG4,r1 + mtspr SPRN_SPRG_603_LRU,r1 mfspr r2,SPRN_SRR1 rlwimi r2,r0,31-14,14,14 mtspr SPRN_SRR1,r2 @@ -864,9 +864,9 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ CLR_TOP32(r4) - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 - mtspr SPRN_SPRG2,r3 /* 0 => not in RTAS */ + mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -947,9 +947,9 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ CLR_TOP32(r4) - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 - mtspr SPRN_SPRG2,r3 /* 0 => not in RTAS */ + mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ /* stack */ lis r1,init_thread_union@ha diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 0c96911d429..a90625f9b48 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -103,21 +103,21 @@ _ENTRY(saved_ksp_limit) /* * Exception vector entry code. This code runs with address translation - * turned off (i.e. using physical addresses). We assume SPRG3 has the - * physical address of the current task thread_struct. + * turned off (i.e. using physical addresses). We assume SPRG_THREAD has + * the physical address of the current task thread_struct. * Note that we have to have decremented r1 before we write to any fields * of the exception frame, since a critical interrupt could occur at any * time, and it will write to the area immediately below the current r1. */ #define NORMAL_EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; /* save two registers to work with */\ - mtspr SPRN_SPRG1,r11; \ - mtspr SPRN_SPRG2,r1; \ + mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\ + mtspr SPRN_SPRG_SCRATCH1,r11; \ + mtspr SPRN_SPRG_SCRATCH2,r1; \ mfcr r10; /* save CR in r10 for now */\ mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ andi. r11,r11,MSR_PR; \ beq 1f; \ - mfspr r1,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ addi r1,r1,THREAD_SIZE; \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ @@ -125,13 +125,13 @@ _ENTRY(saved_ksp_limit) stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ - mfspr r10,SPRN_SPRG2; \ + mfspr r10,SPRN_SPRG_SCRATCH2; \ mfspr r12,SPRN_SRR0; \ stw r10,GPR1(r11); \ mfspr r9,SPRN_SRR1; \ @@ -160,7 +160,7 @@ _ENTRY(saved_ksp_limit) lwz r11,critirq_ctx@l(r11); \ beq 1f; \ /* COMING FROM USER MODE */ \ - mfspr r11,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ 1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ tophys(r11,r11); \ @@ -265,8 +265,8 @@ label: * and exit. Otherwise, we call heavywight functions to do the work. */ START_EXCEPTION(0x0300, DataStorage) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -275,12 +275,12 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif /* First, check if it was a zone fault (which means a user @@ -308,7 +308,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -355,15 +355,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 PPC405_ERR77_SYNC rfi /* Should sync shadow TLBs */ b . /* prevent prefetch past rfi */ @@ -380,15 +380,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b DataAccess /* @@ -466,8 +466,8 @@ label: * load TLB entries from the page table if they exist. */ START_EXCEPTION(0x1100, DTLBMiss) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -476,12 +476,12 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif mfspr r10, SPRN_DEAR /* Get faulting address */ @@ -500,7 +500,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -550,15 +550,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b DataAccess /* 0x1200 - Instruction TLB Miss Exception @@ -566,8 +566,8 @@ label: * registers and bailout to a different point. */ START_EXCEPTION(0x1200, ITLBMiss) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -576,12 +576,12 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif mfspr r10, SPRN_SRR0 /* Get faulting address */ @@ -600,7 +600,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -650,15 +650,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b InstructionAccess EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_EE) @@ -803,15 +803,15 @@ finish_tlb_load: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 PPC405_ERR77_SYNC rfi /* Should sync shadow TLBs */ b . /* prevent prefetch past rfi */ @@ -835,7 +835,7 @@ start_here: /* ptr to phys current thread */ tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@ha diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 18d8a1677c4..656cfb2d666 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -239,7 +239,7 @@ skpinv: addi r4,r4,1 /* Increment */ /* ptr to current thread */ addi r4,r2,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@h @@ -350,12 +350,12 @@ interrupt_base: /* Data TLB Error Interrupt */ START_EXCEPTION(DataTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -374,7 +374,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) /* Load PID into MMUCR TID */ @@ -446,12 +446,12 @@ tlb_44x_patch_hwater_D: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b DataStorage /* Instruction TLB Error Interrupt */ @@ -461,12 +461,12 @@ tlb_44x_patch_hwater_D: * to a different point. */ START_EXCEPTION(InstructionTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -485,7 +485,7 @@ tlb_44x_patch_hwater_D: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) /* Load PID into MMUCR TID */ @@ -542,12 +542,12 @@ tlb_44x_patch_hwater_I: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage /* Debug Interrupt */ @@ -593,12 +593,12 @@ finish_tlb_load: /* Done...restore registers and get out of here. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 rfi /* Force context change */ /* diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 9196ef36d43..0552f01041a 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -195,7 +195,7 @@ _GLOBAL(generic_secondary_smp_init) mr r3,r24 /* not found, copy phys to r3 */ b .kexec_wait /* next kernel might do better */ -2: mtspr SPRN_SPRG3,r13 /* Save vaddr of paca in SPRG3 */ +2: mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG */ /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 3: HMT_LOW @@ -484,7 +484,7 @@ _GLOBAL(pmac_secondary_start) LOAD_REG_ADDR(r4,paca) /* Get base vaddr of paca array */ mulli r13,r24,PACA_SIZE /* Calculate vaddr of right paca */ add r13,r13,r4 /* for this processor. */ - mtspr SPRN_SPRG3,r13 /* Save vaddr of paca in SPRG3 */ + mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG*/ /* Create a temp kernel stack for use before relocation is on. */ ld r1,PACAEMERGSP(r13) @@ -502,10 +502,10 @@ _GLOBAL(pmac_secondary_start) * 1. Processor number * 2. Segment table pointer (virtual address) * On entry the following are set: - * r1 = stack pointer. vaddr for iSeries, raddr (temp stack) for pSeries - * r24 = cpu# (in Linux terms) - * r13 = paca virtual address - * SPRG3 = paca virtual address + * r1 = stack pointer. vaddr for iSeries, raddr (temp stack) for pSeries + * r24 = cpu# (in Linux terms) + * r13 = paca virtual address + * SPRG_PACA = paca virtual address */ .globl __secondary_start __secondary_start: @@ -641,7 +641,7 @@ _INIT_STATIC(start_here_multiplatform) /* Restore parameters passed from prom_init/kexec */ mr r3,r31 - bl .early_setup /* also sets r13 and SPRG3 */ + bl .early_setup /* also sets r13 and SPRG_PACA */ LOAD_REG_ADDR(r3, .start_here_common) ld r4,PACAKMSR(r13) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 52ff8c53b93..6ded19d0189 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -110,8 +110,8 @@ turn_on_mmu: * task's thread_struct. */ #define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; \ - mtspr SPRN_SPRG1,r11; \ + mtspr SPRN_SPRG_SCRATCH0,r10; \ + mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; \ EXCEPTION_PROLOG_1; \ EXCEPTION_PROLOG_2 @@ -121,7 +121,7 @@ turn_on_mmu: andi. r11,r11,MSR_PR; \ tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ - mfspr r11,SPRN_SPRG3; \ + mfspr r11,SPRN_SPRG_THREAD; \ lwz r11,THREAD_INFO-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ @@ -133,9 +133,9 @@ turn_on_mmu: stw r10,_CCR(r11); /* save registers */ \ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ @@ -603,8 +603,9 @@ start_here: /* ptr to phys current thread */ tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 + /* XXX What is that for ? SPRG2 appears otherwise unused on 8xx */ mtspr SPRN_SPRG2,r3 /* 0 => r1 has kernel sp */ /* stack */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 5f9febc8d14..50504ae39cb 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -20,14 +20,14 @@ #endif #define NORMAL_EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; /* save two registers to work with */\ - mtspr SPRN_SPRG1,r11; \ - mtspr SPRN_SPRG4W,r1; \ + mtspr SPRN_SPRG_WSCRATCH0,r10;/* save two registers to work with */\ + mtspr SPRN_SPRG_WSCRATCH1,r11; \ + mtspr SPRN_SPRG_WSCRATCH2,r1; \ mfcr r10; /* save CR in r10 for now */\ mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ andi. r11,r11,MSR_PR; \ beq 1f; \ - mfspr r1,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ ALLOC_STACK_FRAME(r1, THREAD_SIZE); \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ @@ -35,13 +35,13 @@ stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_RSCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_RSCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ - mfspr r10,SPRN_SPRG4R; \ + mfspr r10,SPRN_SPRG_RSCRATCH2; \ mfspr r12,SPRN_SRR0; \ stw r10,GPR1(r11); \ mfspr r9,SPRN_SRR1; \ @@ -69,21 +69,11 @@ * providing configurations that micro-optimize space usage. */ -/* CRIT_SPRG only used in critical exception handling */ -#define CRIT_SPRG SPRN_SPRG2 -/* MCHECK_SPRG only used in machine check exception handling */ -#define MCHECK_SPRG SPRN_SPRG6W - -#define MCHECK_STACK_BASE mcheckirq_ctx +#define MC_STACK_BASE mcheckirq_ctx #define CRIT_STACK_BASE critirq_ctx /* only on e500mc/e200 */ -#define DEBUG_STACK_BASE dbgirq_ctx -#ifdef CONFIG_E200 -#define DEBUG_SPRG SPRN_SPRG6W -#else -#define DEBUG_SPRG SPRN_SPRG9 -#endif +#define DBG_STACK_BASE dbgirq_ctx #define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE) @@ -110,7 +100,7 @@ * critical/machine check exception stack at low physical addresses. */ #define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, exc_level_srr0, exc_level_srr1) \ - mtspr exc_level##_SPRG,r8; \ + mtspr SPRN_SPRG_WSCRATCH_##exc_level,r8; \ BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \ stw r9,GPR9(r8); /* save various registers */\ mfcr r9; /* save CR in r9 for now */\ @@ -119,7 +109,7 @@ stw r9,_CCR(r8); /* save CR on stack */\ mfspr r10,exc_level_srr1; /* check whether user or kernel */\ andi. r10,r10,MSR_PR; \ - mfspr r11,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ beq 1f; \ @@ -140,7 +130,7 @@ lwz r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11); \ stw r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8); \ mr r11,r8; \ -2: mfspr r8,exc_level##_SPRG; \ +2: mfspr r8,SPRN_SPRG_RSCRATCH_##exc_level; \ stw r12,GPR12(r11); /* save various registers */\ mflr r10; \ stw r10,_LINK(r11); \ @@ -161,9 +151,9 @@ #define CRITICAL_EXCEPTION_PROLOG \ EXC_LEVEL_EXCEPTION_PROLOG(CRIT, SPRN_CSRR0, SPRN_CSRR1) #define DEBUG_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(DEBUG, SPRN_DSRR0, SPRN_DSRR1) + EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1) #define MCHECK_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(MCHECK, SPRN_MCSRR0, SPRN_MCSRR1) + EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1) /* * Exception vectors. @@ -282,13 +272,13 @@ label: mtspr SPRN_DSRR1,r9; \ lwz r9,GPR9(r11); \ lwz r12,GPR12(r11); \ - mtspr DEBUG_SPRG,r8; \ - BOOKE_LOAD_EXC_LEVEL_STACK(DEBUG); /* r8 points to the debug stack */ \ + mtspr SPRN_SPRG_WSCRATCH_DBG,r8; \ + BOOKE_LOAD_EXC_LEVEL_STACK(DBG); /* r8 points to the debug stack */ \ lwz r10,GPR10(r8); \ lwz r11,GPR11(r8); \ - mfspr r8,DEBUG_SPRG; \ + mfspr r8,SPRN_SPRG_RSCRATCH_DBG; \ \ - PPC_RFDI; \ + PPC_RFDI; \ b .; \ \ /* continue normal handling for a debug exception... */ \ @@ -335,11 +325,11 @@ label: mtspr SPRN_CSRR1,r9; \ lwz r9,GPR9(r11); \ lwz r12,GPR12(r11); \ - mtspr CRIT_SPRG,r8; \ + mtspr SPRN_SPRG_WSCRATCH_CRIT,r8; \ BOOKE_LOAD_EXC_LEVEL_STACK(CRIT); /* r8 points to the debug stack */ \ lwz r10,GPR10(r8); \ lwz r11,GPR11(r8); \ - mfspr r8,CRIT_SPRG; \ + mfspr r8,SPRN_SPRG_RSCRATCH_CRIT; \ \ rfci; \ b .; \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 5bdcc06d294..eca80482ae7 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -361,7 +361,7 @@ skpinv: addi r6,r6,1 /* Increment */ /* ptr to current thread */ addi r4,r2,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@h @@ -532,12 +532,12 @@ interrupt_base: /* Data TLB Error Interrupt */ START_EXCEPTION(DataTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -557,7 +557,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: @@ -598,12 +598,12 @@ interrupt_base: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b DataStorage /* Instruction TLB Error Interrupt */ @@ -613,12 +613,12 @@ interrupt_base: * to a different point. */ START_EXCEPTION(InstructionTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -638,7 +638,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: @@ -666,12 +666,12 @@ interrupt_base: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage #ifdef CONFIG_SPE @@ -790,12 +790,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) tlbwe /* Done...restore registers and get out of here. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 rfi /* Force context change */ #ifdef CONFIG_SPE @@ -839,7 +839,7 @@ load_up_spe: #endif /* !CONFIG_SMP */ /* enable use of SPE after return */ oris r9,r9,MSR_SPE@h - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ li r4,1 li r10,THREAD_ACC stw r4,THREAD_USED_SPE(r5) @@ -1118,7 +1118,7 @@ __secondary_start: /* ptr to current thread */ addi r4,r2,THREAD /* address of our thread_struct */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* Setup the defaults for TLB entries */ li r4,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1f6816003eb..91b89b8d63d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -142,11 +142,11 @@ early_param("smt-enabled", early_smt_enabled); #define check_smt_enabled() #endif /* CONFIG_SMP */ -/* Put the paca pointer into r13 and SPRG3 */ +/* Put the paca pointer into r13 and SPRG_PACA */ void __init setup_paca(int cpu) { local_paca = &paca[cpu]; - mtspr(SPRN_SPRG3, local_paca); + mtspr(SPRN_SPRG_PACA, local_paca); } /* diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index ea4d64644d0..67b6916f0e9 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -65,7 +65,7 @@ _GLOBAL(load_up_altivec) 1: /* enable use of VMX after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ oris r9,r9,MSR_VEC@h #else ld r4,PACACURRENT(r13) diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index d0c6f841bbd..380a78cf484 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -56,8 +56,8 @@ .macro KVM_HANDLER ivor_nr _GLOBAL(kvmppc_handler_\ivor_nr) /* Get pointer to vcpu and record exit number. */ - mtspr SPRN_SPRG0, r4 - mfspr r4, SPRN_SPRG1 + mtspr SPRN_SPRG_WSCRATCH0, r4 + mfspr r4, SPRN_SPRG_RVCPU stw r5, VCPU_GPR(r5)(r4) stw r6, VCPU_GPR(r6)(r4) mfctr r5 @@ -95,7 +95,7 @@ _GLOBAL(kvmppc_handler_len) /* Registers: - * SPRG0: guest r4 + * SPRG_SCRATCH0: guest r4 * r4: vcpu pointer * r5: KVM exit number */ @@ -181,7 +181,7 @@ _GLOBAL(kvmppc_resume_host) stw r3, VCPU_LR(r4) mfxer r3 stw r3, VCPU_XER(r4) - mfspr r3, SPRN_SPRG0 + mfspr r3, SPRN_SPRG_RSCRATCH0 stw r3, VCPU_GPR(r4)(r4) mfspr r3, SPRN_SRR0 stw r3, VCPU_PC(r4) @@ -374,7 +374,7 @@ lightweight_exit: mtspr SPRN_IVPR, r8 /* Save vcpu pointer for the exception handlers. */ - mtspr SPRN_SPRG1, r4 + mtspr SPRN_SPRG_WVCPU, r4 /* Can't switch the stack pointer until after IVPR is switched, * because host interrupt handlers would get confused. */ @@ -384,13 +384,13 @@ lightweight_exit: /* Host interrupt handlers may have clobbered these guest-readable * SPRGs, so we need to reload them here with the guest's values. */ lwz r3, VCPU_SPRG4(r4) - mtspr SPRN_SPRG4, r3 + mtspr SPRN_SPRG4W, r3 lwz r3, VCPU_SPRG5(r4) - mtspr SPRN_SPRG5, r3 + mtspr SPRN_SPRG5W, r3 lwz r3, VCPU_SPRG6(r4) - mtspr SPRN_SPRG6, r3 + mtspr SPRN_SPRG6W, r3 lwz r3, VCPU_SPRG7(r4) - mtspr SPRN_SPRG7, r3 + mtspr SPRN_SPRG7W, r3 #ifdef CONFIG_KVM_EXIT_TIMING /* save enter time */ diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 14af8cedab7..b13d58932bf 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -40,7 +40,7 @@ mmu_hash_lock: * The address is in r4, and r3 contains an access flag: * _PAGE_RW (0x400) if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. - * SPRG3 contains the physical address of the current task's thread. + * SPRG_THREAD contains the physical address of the current task's thread. * * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE @@ -68,7 +68,7 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S index 2f581521eb9..2b807597923 100644 --- a/arch/powerpc/platforms/iseries/exception.S +++ b/arch/powerpc/platforms/iseries/exception.S @@ -47,7 +47,7 @@ system_reset_iSeries: LOAD_REG_ADDR(r13, paca) mulli r0,r23,PACA_SIZE add r13,r13,r0 - mtspr SPRN_SPRG3,r13 /* Save it away for the future */ + mtspr SPRN_SPRG_PACA,r13 /* Save it away for the future */ mfmsr r24 ori r24,r24,MSR_RI mtmsrd r24 /* RI on */ @@ -116,7 +116,7 @@ iSeries_secondary_smp_loop: #endif /* CONFIG_SMP */ li r0,-1 /* r0=-1 indicates a Hypervisor call */ sc /* Invoke the hypervisor via a system call */ - mfspr r13,SPRN_SPRG3 /* Put r13 back ???? */ + mfspr r13,SPRN_SPRG_PACA /* Put r13 back ???? */ b 2b /* If SMP not configured, secondaries * loop forever */ @@ -126,9 +126,9 @@ iSeries_secondary_smp_loop: .globl data_access_iSeries data_access_iSeries: - mtspr SPRN_SPRG1,r13 + mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG2,r12 + mtspr SPRN_SPRG_SCRATCH1,r12 mfspr r13,SPRN_DAR mfspr r12,SPRN_DSISR srdi r13,r13,60 @@ -137,7 +137,7 @@ BEGIN_FTR_SECTION cmpwi r13,0x2c beq .do_stab_bolted_iSeries mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 END_FTR_SECTION_IFCLR(CPU_FTR_SLB) EXCEPTION_PROLOG_1(PACA_EXGEN) EXCEPTION_PROLOG_ISERIES_1 @@ -145,15 +145,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB) .do_stab_bolted_iSeries: mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 EXCEPTION_PROLOG_1(PACA_EXSLB) EXCEPTION_PROLOG_ISERIES_1 b .do_stab_bolted .globl data_access_slb_iSeries data_access_slb_iSeries: - mtspr SPRN_SPRG1,r13 /* save r13 */ - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR std r9,PACA_EXSLB+EX_R9(r13) @@ -165,7 +165,7 @@ data_access_slb_iSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) ld r12,PACALPPACAPTR(r13) ld r12,LPPACASRR1(r12) @@ -175,8 +175,8 @@ data_access_slb_iSeries: .globl instruction_access_slb_iSeries instruction_access_slb_iSeries: - mtspr SPRN_SPRG1,r13 /* save r13 */ - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) ld r3,PACALPPACAPTR(r13) ld r3,LPPACASRR0(r3) /* get SRR0 value */ @@ -189,7 +189,7 @@ instruction_access_slb_iSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) ld r12,PACALPPACAPTR(r13) ld r12,LPPACASRR1(r12) @@ -200,7 +200,7 @@ slb_miss_user_iseries: std r10,PACA_EXGEN+EX_R10(r13) std r11,PACA_EXGEN+EX_R11(r13) std r12,PACA_EXGEN+EX_R12(r13) - mfspr r10,SPRG1 + mfspr r10,SPRG_SCRATCH0 ld r11,PACA_EXSLB+EX_R9(r13) ld r12,PACA_EXSLB+EX_R3(r13) std r10,PACA_EXGEN+EX_R13(r13) @@ -221,7 +221,7 @@ slb_miss_user_iseries: .globl system_call_iSeries system_call_iSeries: mr r9,r13 - mfspr r13,SPRN_SPRG3 + mfspr r13,SPRN_SPRG_PACA EXCEPTION_PROLOG_ISERIES_1 b system_call_common diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h index e26eb86ac73..bae3fba5ad8 100644 --- a/arch/powerpc/platforms/iseries/exception.h +++ b/arch/powerpc/platforms/iseries/exception.h @@ -38,7 +38,7 @@ .globl label##_iSeries; \ label##_iSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_1(area); \ EXCEPTION_PROLOG_ISERIES_1; \ b label##_common @@ -47,7 +47,7 @@ label##_iSeries: \ .globl label##_iSeries; \ label##_iSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_1(PACA_EXGEN); \ lbz r10,PACASOFTIRQEN(r13); \ cmpwi 0,r10,0; \ -- cgit v1.2.3-70-g09d2 From c5a8c0c99f67ae8a784faafbaaea1529825796e2 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 16 Jul 2009 19:36:57 +0000 Subject: powerpc: Remove use of a second scratch SPRG in STAB code The STAB code used on Power3 and RS/64 uses a second scratch SPRG to save a GPR in order to decide whether to go to do_stab_bolted_* or to handle a normal data access exception. This prevents our scheme of freeing SPRG3 which is user visible for user uses since we cannot use SPRG0 which, on RS/64, seems to be read-only for supervisor mode (like POWER4). This reworks the STAB exception entry to use the PACA as temporary storage instead. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64s.h | 7 ++++-- arch/powerpc/include/asm/reg.h | 3 +-- arch/powerpc/kernel/exceptions-64s.S | 38 ++++++++++++++++++++---------- arch/powerpc/platforms/iseries/exception.S | 37 +++++++++++++++++++---------- 4 files changed, 55 insertions(+), 30 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 773e380b5fe..a98653b2623 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -66,8 +66,7 @@ std r9,area+EX_R13(r13); \ mfcr r9 -#define EXCEPTION_PROLOG_PSERIES(area, label) \ - EXCEPTION_PROLOG_1(area); \ +#define EXCEPTION_PROLOG_PSERIES_1(label) \ ld r12,PACAKBASE(r13); /* get high part of &label */ \ ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ mfspr r11,SPRN_SRR0; /* save SRR0 */ \ @@ -78,6 +77,10 @@ rfid; \ b . /* prevent speculative execution */ +#define EXCEPTION_PROLOG_PSERIES(area, label) \ + EXCEPTION_PROLOG_1(area); \ + EXCEPTION_PROLOG_PSERIES_1(label); + /* * The common exception prolog is used for all except a few exceptions * such as a segment miss on a kernel address. We have to be prepared diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a8179cc99ac..d17af2b3d4c 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -654,7 +654,7 @@ * 64-bit server: * - SPRG0 unused (reserved for HV on Power4) * - SPRG1 scratch for exception vectors - * - SPRG2 scratch for exception vectors + * - SPRG2 unused * * All 32-bit: * - SPRG3 current thread_info pointer @@ -707,7 +707,6 @@ #ifdef CONFIG_PPC_BOOK3S_64 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG1 -#define SPRN_SPRG_SCRATCH1 SPRN_SPRG2 #endif #ifdef CONFIG_PPC_BOOK3S_32 diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4e9640cc056..50f2ad36ed0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -50,18 +50,28 @@ data_access_pSeries: HMT_MEDIUM mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG_SCRATCH1,r12 - mfspr r13,SPRN_DAR - mfspr r12,SPRN_DSISR - srdi r13,r13,60 - rlwimi r13,r12,16,0x20 - mfcr r12 - cmpwi r13,0x2c + mfspr r13,SPRN_SPRG_PACA + std r9,PACA_EXSLB+EX_R9(r13) + std r10,PACA_EXSLB+EX_R10(r13) + mfspr r10,SPRN_DAR + mfspr r9,SPRN_DSISR + srdi r10,r10,60 + rlwimi r10,r9,16,0x20 + mfcr r9 + cmpwi r10,0x2c beq do_stab_bolted_pSeries - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 -END_FTR_SECTION_IFCLR(CPU_FTR_SLB) + ld r10,PACA_EXSLB+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + ld r11,PACA_EXSLB+EX_R9(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfspr r12,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R9(r13) + std r12,PACA_EXGEN+EX_R13(r13) + EXCEPTION_PROLOG_PSERIES_1(data_access_common) +FTR_SECTION_ELSE EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB) . = 0x380 .globl data_access_slb_pSeries @@ -224,9 +234,11 @@ masked_interrupt: .align 7 do_stab_bolted_pSeries: - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 - EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) + std r11,PACA_EXSLB+EX_R11(r13) + std r12,PACA_EXSLB+EX_R12(r13) + mfspr r10,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXSLB+EX_R13(r13) + EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted) #ifdef CONFIG_PPC_PSERIES /* diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S index 2b807597923..5369653dcf6 100644 --- a/arch/powerpc/platforms/iseries/exception.S +++ b/arch/powerpc/platforms/iseries/exception.S @@ -128,25 +128,36 @@ iSeries_secondary_smp_loop: data_access_iSeries: mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG_SCRATCH1,r12 - mfspr r13,SPRN_DAR - mfspr r12,SPRN_DSISR - srdi r13,r13,60 - rlwimi r13,r12,16,0x20 - mfcr r12 - cmpwi r13,0x2c + mfspr r13,SPRN_SPRG_PACA + std r9,PACA_EXSLB+EX_R9(r13) + std r10,PACA_EXSLB+EX_R10(r13) + mfspr r10,SPRN_DAR + mfspr r9,SPRN_DSISR + srdi r10,r10,60 + rlwimi r10,r9,16,0x20 + mfcr r9 + cmpwi r10,0x2c beq .do_stab_bolted_iSeries - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 -END_FTR_SECTION_IFCLR(CPU_FTR_SLB) + ld r10,PACA_EXSLB+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + ld r11,PACA_EXSLB+EX_R9(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfspr r12,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R9(r13) + std r12,PACA_EXGEN+EX_R13(r13) + EXCEPTION_PROLOG_ISERIES_1 +FTR_SECTION_ELSE EXCEPTION_PROLOG_1(PACA_EXGEN) EXCEPTION_PROLOG_ISERIES_1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB) b data_access_common .do_stab_bolted_iSeries: - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 - EXCEPTION_PROLOG_1(PACA_EXSLB) + std r11,PACA_EXSLB+EX_R11(r13) + std r12,PACA_EXSLB+EX_R12(r13) + mfspr r10,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXSLB+EX_R13(r13) EXCEPTION_PROLOG_ISERIES_1 b .do_stab_bolted -- cgit v1.2.3-70-g09d2 From 063517bea114d4cb57bf582353d0a99b82775a63 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:52:56 +0000 Subject: powerpc: Change PACA from SPRG3 to SPRG1 This change the SPRG used to store the PACA on ppc64 from SPRG3 to SPRG1. SPRG3 is user readable on most processors and we want to use it for other things. We change the scratch SPRG used by exception vectors from SRPG1 to SPRG2. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/reg.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index d17af2b3d4c..2cedbb42761 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -649,12 +649,12 @@ * SPRG usage: * * All 64-bit: - * - SPRG3 stores PACA pointer + * - SPRG1 stores PACA pointer * * 64-bit server: * - SPRG0 unused (reserved for HV on Power4) - * - SPRG1 scratch for exception vectors - * - SPRG2 unused + * - SPRG2 scratch for exception vectors + * - SPRG3 unused (user visible) * * All 32-bit: * - SPRG3 current thread_info pointer @@ -700,13 +700,13 @@ * */ #ifdef CONFIG_PPC64 -#define SPRN_SPRG_PACA SPRN_SPRG3 +#define SPRN_SPRG_PACA SPRN_SPRG1 #else #define SPRN_SPRG_THREAD SPRN_SPRG3 #endif #ifdef CONFIG_PPC_BOOK3S_64 -#define SPRN_SPRG_SCRATCH0 SPRN_SPRG1 +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG2 #endif #ifdef CONFIG_PPC_BOOK3S_32 -- cgit v1.2.3-70-g09d2 From dd90bbd5fb763ab8924135a30956030c7a7b94fc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Jul 2009 11:54:32 +1000 Subject: powerpc: Add compat_sys_truncate The truncate syscall has a signed long parameter, so when using a 32- bit userspace with a 64-bit kernel the argument is zero-extended instead of sign-extended. Adding the compat_sys_truncate function fixes the issue. This was noticed during an LSB truncate test failure. The test was checking for the correct error number set when truncate is called with a length of -1. The test can be found at: http://bzr.linuxfoundation.org/lsb/devel/runtime-test?cmd=inventory;rev=stewb%40linux-foundation.org-20090626205411-sfb23cc0tjj7jzgm;path=modules/vsx-pcts/tset/POSIX.os/files/truncate/ BenH: Added compat_sys_ftruncate() as well, same issue. Signed-off-by: Chase Douglas Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/systbl.h | 4 ++-- arch/powerpc/kernel/sys_ppc32.c | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 370600ca276..ed24bd92fe4 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -95,8 +95,8 @@ SYSCALL(reboot) SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir) SYSCALL_SPU(mmap) SYSCALL_SPU(munmap) -SYSCALL_SPU(truncate) -SYSCALL_SPU(ftruncate) +COMPAT_SYS_SPU(truncate) +COMPAT_SYS_SPU(ftruncate) SYSCALL_SPU(fchmod) SYSCALL_SPU(fchown) COMPAT_SYS_SPU(getpriority) diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index bb1cfcfdbbb..1cc5e9e5da9 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -343,6 +343,18 @@ off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin) return sys_lseek(fd, (int)offset, origin); } +long compat_sys_truncate(const char __user * path, u32 length) +{ + /* sign extend length */ + return sys_truncate(path, (int)length); +} + +long compat_sys_ftruncate(int fd, u32 length) +{ + /* sign extend length */ + return sys_ftruncate(fd, (int)length); +} + /* Note: it is necessary to treat bufsiz as an unsigned int, * with the corresponding cast to a signed int to insure that the * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) -- cgit v1.2.3-70-g09d2 From 7d60b02cc7e6d67b498eed9ecb58010f61422325 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:04 +0000 Subject: powerpc/mm: Fix misplaced #endif in pgtable-ppc64-64k.h A misplaced #endif causes more definitions than intended to be protected by #ifndef __ASSEMBLY__. This breaks upcoming 64-bit BookE support patch when using 64k pages. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc64-64k.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h index 6cc085b945a..90533ddcd70 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h +++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h @@ -10,10 +10,10 @@ #define PGD_INDEX_SIZE 4 #ifndef __ASSEMBLY__ - #define PTE_TABLE_SIZE (sizeof(real_pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) +#endif /* __ASSEMBLY__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) @@ -32,8 +32,6 @@ #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#endif /* __ASSEMBLY__ */ - /* Bits to mask out from a PMD to get to the PTE page */ #define PMD_MASKED_BITS 0x1ff /* Bits to mask out from a PGD/PUD to get to the PMD page */ -- cgit v1.2.3-70-g09d2 From fcce810986b3f32a8322faf240f8cc5560a4c463 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:10 +0000 Subject: powerpc/mm: Add HW threads support to no_hash TLB management The current "no hash" MMU context management code is written with the assumption that one CPU == one TLB. This is not the case on implementations that support HW multithreading, where several linux CPUs can share the same TLB. This adds some basic support for this to our context management and our TLB flushing code. It also cleans up the optional debugging output a bit Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/cputhreads.h | 16 ++++++ arch/powerpc/mm/mmu_context_nohash.c | 93 +++++++++++++++++++++++------------ arch/powerpc/mm/tlb_nohash.c | 10 +++- 3 files changed, 86 insertions(+), 33 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index fb11b0c459b..a8e18447c62 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -5,6 +5,15 @@ /* * Mapping of threads to cores + * + * Note: This implementation is limited to a power of 2 number of + * threads per core and the same number for each core in the system + * (though it would work if some processors had less threads as long + * as the CPU numbers are still allocated, just not brought offline). + * + * However, the API allows for a different implementation in the future + * if needed, as long as you only use the functions and not the variables + * directly. */ #ifdef CONFIG_SMP @@ -67,5 +76,12 @@ static inline int cpu_first_thread_in_core(int cpu) return cpu & ~(threads_per_core - 1); } +static inline int cpu_last_thread_in_core(int cpu) +{ + return cpu | (threads_per_core - 1); +} + + + #endif /* _ASM_POWERPC_CPUTHREADS_H */ diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index b1a727def15..834436d6d6b 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -25,10 +25,20 @@ * also clear mm->cpu_vm_mask bits when processes are migrated */ -#undef DEBUG -#define DEBUG_STEAL_ONLY -#undef DEBUG_MAP_CONSISTENCY -/*#define DEBUG_CLAMP_LAST_CONTEXT 15 */ +#define DEBUG_MAP_CONSISTENCY +#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif #include #include @@ -71,7 +81,7 @@ static DEFINE_SPINLOCK(context_lock); static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; - unsigned int cpu, max; + unsigned int cpu, max, i; max = last_context - first_context; @@ -89,15 +99,22 @@ static unsigned int steal_context_smp(unsigned int id) id = first_context; continue; } - pr_devel("[%d] steal context %d from mm @%p\n", - smp_processor_id(), id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Mark this mm has having no context anymore */ mm->context.id = MMU_NO_CONTEXT; - /* Mark it stale on all CPUs that used this mm */ - for_each_cpu(cpu, mm_cpumask(mm)) - __set_bit(id, stale_map[cpu]); + /* Mark it stale on all CPUs that used this mm. For threaded + * implementations, we set it on all threads on each core + * represented in the mask. A future implementation will use + * a core map instead but this will do for now. + */ + for_each_cpu(cpu, mm_cpumask(mm)) { + for (i = cpu_first_thread_in_core(cpu); + i <= cpu_last_thread_in_core(cpu); i++) + __set_bit(id, stale_map[i]); + cpu = i - 1; + } return id; } @@ -126,7 +143,7 @@ static unsigned int steal_context_up(unsigned int id) /* Pick up the victim mm */ mm = context_mm[id]; - pr_devel("[%d] steal context %d from mm @%p\n", cpu, id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Flush the TLB for that context */ local_flush_tlb_mm(mm); @@ -179,19 +196,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* No lockless fast path .. yet */ spin_lock(&context_lock); -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n", - cpu, next, next->context.active, next->context.id); -#endif + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); #ifdef CONFIG_SMP /* Mark us active and the previous one not anymore */ next->context.active++; if (prev) { -#ifndef DEBUG_STEAL_ONLY - pr_devel(" old context %p active was: %d\n", - prev, prev->context.active); -#endif + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); WARN_ON(prev->context.active < 1); prev->context.active--; } @@ -201,8 +213,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* If we already have a valid assigned context, skip all that */ id = next->context.id; - if (likely(id != MMU_NO_CONTEXT)) + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif goto ctxt_ok; + } /* We really don't have a context, let's try to acquire one */ id = next_context; @@ -235,11 +253,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) next_context = id + 1; context_mm[id] = next; next->context.id = id; - -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] picked up new id %d, nrf is now %d\n", - cpu, id, nr_free_contexts); -#endif + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); context_check_map(); ctxt_ok: @@ -248,15 +262,20 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) * local TLB for it and unmark it before we use it */ if (test_bit(id, stale_map[cpu])) { - pr_devel("[%d] flushing stale context %d for mm @%p !\n", - cpu, id, next); + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_in_core(cpu), + cpu_last_thread_in_core(cpu)); + local_flush_tlb_mm(next); /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - __clear_bit(id, stale_map[cpu]); + for (cpu = cpu_first_thread_in_core(cpu); + cpu <= cpu_last_thread_in_core(cpu); cpu++) + __clear_bit(id, stale_map[cpu]); } /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); set_context(id, next->pgd); spin_unlock(&context_lock); } @@ -266,6 +285,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { + pr_hard("initing context for mm @%p\n", mm); + mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; @@ -305,7 +326,9 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; - +#ifdef CONFIG_HOTPLUG_CPU + struct task_struct *p; +#endif /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ @@ -324,8 +347,16 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); kfree(stale_map[cpu]); stale_map[cpu] = NULL; - break; -#endif + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + read_lock(&tasklist_lock); + for_each_process(p) { + if (p->mm) + cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm)); + } + read_unlock(&tasklist_lock); + break; +#endif /* CONFIG_HOTPLUG_CPU */ } return NOTIFY_OK; } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ad2eb4d34dd..d908e75cc3b 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -87,6 +87,12 @@ EXPORT_SYMBOL(local_flush_tlb_page); static DEFINE_SPINLOCK(tlbivax_lock); +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_thread_cpumask(smp_processor_id())); +} + struct tlb_flush_param { unsigned long addr; unsigned int pid; @@ -131,7 +137,7 @@ void flush_tlb_mm(struct mm_struct *mm) pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; - if (!cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { + if (!mm_is_core_local(mm)) { struct tlb_flush_param p = { .pid = pid }; /* Ignores smp_processor_id() even if set. */ smp_call_function_many(mm_cpumask(mm), @@ -153,7 +159,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) if (unlikely(pid == MMU_NO_CONTEXT)) goto bail; cpu_mask = mm_cpumask(vma->vm_mm); - if (!cpumask_equal(cpu_mask, cpumask_of(smp_processor_id()))) { + if (!mm_is_core_local(mm)) { /* If broadcast tlbivax is supported, use it */ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); -- cgit v1.2.3-70-g09d2 From 29c09e8fbaf65698c51aeffe34acc284a454a38f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:11 +0000 Subject: powerpc/mm: Add opcode definitions for tlbivax and tlbsrx. This adds the opcode definitions to ppc-opcode.h for the two instructions tlbivax and tlbsrx. as defined by Book3E 2.06 Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/ppc-opcode.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index b74f16d45cb..ef9aa84cac5 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -48,6 +48,8 @@ #define PPC_INST_TLBIE 0x7c000264 #define PPC_INST_TLBILX 0x7c000024 #define PPC_INST_WAIT 0x7c00007c +#define PPC_INST_TLBIVAX 0x7c000624 +#define PPC_INST_TLBSRX_DOT 0x7c0006a5 /* macros to insert fields into opcodes */ #define __PPC_RA(a) (((a) & 0x1f) << 16) @@ -76,6 +78,10 @@ __PPC_WC(w)) #define PPC_TLBIE(lp,a) stringify_in_c(.long PPC_INST_TLBIE | \ __PPC_RB(a) | __PPC_RS(lp)) +#define PPC_TLBSRX_DOT(a,b) stringify_in_c(.long PPC_INST_TLBSRX_DOT | \ + __PPC_RA(a) | __PPC_RB(b)) +#define PPC_TLBIVAX(a,b) stringify_in_c(.long PPC_INST_TLBIVAX | \ + __PPC_RA(a) | __PPC_RB(b)) /* * Define what the VSX XX1 form instructions will look like, then add -- cgit v1.2.3-70-g09d2 From 1fe1a21005c14ad772caeb9005580f473c4b6c57 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:12 +0000 Subject: powerpc/mm: Add more bit definitions for Book3E MMU registers This adds various additional bit definitions for various MMU related SPRs used on Book3E. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-book3e.h | 168 ++++++++++++++++++++++++---------- 1 file changed, 119 insertions(+), 49 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 7e74cff81d8..42a39b4aace 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -38,58 +38,128 @@ #define BOOK3E_PAGESZ_1TB 30 #define BOOK3E_PAGESZ_2TB 31 -#define MAS0_TLBSEL(x) ((x << 28) & 0x30000000) -#define MAS0_ESEL(x) ((x << 16) & 0x0FFF0000) -#define MAS0_NV(x) ((x) & 0x00000FFF) - -#define MAS1_VALID 0x80000000 -#define MAS1_IPROT 0x40000000 -#define MAS1_TID(x) ((x << 16) & 0x3FFF0000) -#define MAS1_IND 0x00002000 -#define MAS1_TS 0x00001000 -#define MAS1_TSIZE(x) ((x << 7) & 0x00000F80) - -#define MAS2_EPN 0xFFFFF000 -#define MAS2_X0 0x00000040 -#define MAS2_X1 0x00000020 -#define MAS2_W 0x00000010 -#define MAS2_I 0x00000008 -#define MAS2_M 0x00000004 -#define MAS2_G 0x00000002 -#define MAS2_E 0x00000001 +/* MAS registers bit definitions */ + +#define MAS0_TLBSEL(x) ((x << 28) & 0x30000000) +#define MAS0_ESEL(x) ((x << 16) & 0x0FFF0000) +#define MAS0_NV(x) ((x) & 0x00000FFF) +#define MAS0_HES 0x00004000 +#define MAS0_WQ_ALLWAYS 0x00000000 +#define MAS0_WQ_COND 0x00001000 +#define MAS0_WQ_CLR_RSRV 0x00002000 + +#define MAS1_VALID 0x80000000 +#define MAS1_IPROT 0x40000000 +#define MAS1_TID(x) ((x << 16) & 0x3FFF0000) +#define MAS1_IND 0x00002000 +#define MAS1_TS 0x00001000 +#define MAS1_TSIZE_MASK 0x00000f80 +#define MAS1_TSIZE_SHIFT 7 +#define MAS1_TSIZE(x) ((x << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) + +#define MAS2_EPN 0xFFFFF000 +#define MAS2_X0 0x00000040 +#define MAS2_X1 0x00000020 +#define MAS2_W 0x00000010 +#define MAS2_I 0x00000008 +#define MAS2_M 0x00000004 +#define MAS2_G 0x00000002 +#define MAS2_E 0x00000001 #define MAS2_EPN_MASK(size) (~0 << (size + 10)) #define MAS2_VAL(addr, size, flags) ((addr) & MAS2_EPN_MASK(size) | (flags)) -#define MAS3_RPN 0xFFFFF000 -#define MAS3_U0 0x00000200 -#define MAS3_U1 0x00000100 -#define MAS3_U2 0x00000080 -#define MAS3_U3 0x00000040 -#define MAS3_UX 0x00000020 -#define MAS3_SX 0x00000010 -#define MAS3_UW 0x00000008 -#define MAS3_SW 0x00000004 -#define MAS3_UR 0x00000002 -#define MAS3_SR 0x00000001 - -#define MAS4_TLBSELD(x) MAS0_TLBSEL(x) -#define MAS4_INDD 0x00008000 -#define MAS4_TSIZED(x) MAS1_TSIZE(x) -#define MAS4_X0D 0x00000040 -#define MAS4_X1D 0x00000020 -#define MAS4_WD 0x00000010 -#define MAS4_ID 0x00000008 -#define MAS4_MD 0x00000004 -#define MAS4_GD 0x00000002 -#define MAS4_ED 0x00000001 - -#define MAS6_SPID0 0x3FFF0000 -#define MAS6_SPID1 0x00007FFE -#define MAS6_ISIZE(x) MAS1_TSIZE(x) -#define MAS6_SAS 0x00000001 -#define MAS6_SPID MAS6_SPID0 - -#define MAS7_RPN 0xFFFFFFFF +#define MAS3_RPN 0xFFFFF000 +#define MAS3_U0 0x00000200 +#define MAS3_U1 0x00000100 +#define MAS3_U2 0x00000080 +#define MAS3_U3 0x00000040 +#define MAS3_UX 0x00000020 +#define MAS3_SX 0x00000010 +#define MAS3_UW 0x00000008 +#define MAS3_SW 0x00000004 +#define MAS3_UR 0x00000002 +#define MAS3_SR 0x00000001 +#define MAS3_SPSIZE 0x0000003e +#define MAS3_SPSIZE_SHIFT 1 + +#define MAS4_TLBSELD(x) MAS0_TLBSEL(x) +#define MAS4_INDD 0x00008000 /* Default IND */ +#define MAS4_TSIZED(x) MAS1_TSIZE(x) +#define MAS4_X0D 0x00000040 +#define MAS4_X1D 0x00000020 +#define MAS4_WD 0x00000010 +#define MAS4_ID 0x00000008 +#define MAS4_MD 0x00000004 +#define MAS4_GD 0x00000002 +#define MAS4_ED 0x00000001 +#define MAS4_WIMGED_MASK 0x0000001f /* Default WIMGE */ +#define MAS4_WIMGED_SHIFT 0 +#define MAS4_VLED MAS4_X1D /* Default VLE */ +#define MAS4_ACMD 0x000000c0 /* Default ACM */ +#define MAS4_ACMD_SHIFT 6 +#define MAS4_TSIZED_MASK 0x00000f80 /* Default TSIZE */ +#define MAS4_TSIZED_SHIFT 7 + +#define MAS6_SPID0 0x3FFF0000 +#define MAS6_SPID1 0x00007FFE +#define MAS6_ISIZE(x) MAS1_TSIZE(x) +#define MAS6_SAS 0x00000001 +#define MAS6_SPID MAS6_SPID0 +#define MAS6_SIND 0x00000002 /* Indirect page */ +#define MAS6_SIND_SHIFT 1 +#define MAS6_SPID_MASK 0x3fff0000 +#define MAS6_SPID_SHIFT 16 +#define MAS6_ISIZE_MASK 0x00000f80 +#define MAS6_ISIZE_SHIFT 7 + +#define MAS7_RPN 0xFFFFFFFF + +/* TLBnCFG encoding */ +#define TLBnCFG_N_ENTRY 0x00000fff /* number of entries */ +#define TLBnCFG_HES 0x00002000 /* HW select supported */ +#define TLBnCFG_IPROT 0x00008000 /* IPROT supported */ +#define TLBnCFG_GTWE 0x00010000 /* Guest can write */ +#define TLBnCFG_IND 0x00020000 /* IND entries supported */ +#define TLBnCFG_PT 0x00040000 /* Can load from page table */ +#define TLBnCFG_ASSOC 0xff000000 /* Associativity */ + +/* TLBnPS encoding */ +#define TLBnPS_4K 0x00000004 +#define TLBnPS_8K 0x00000008 +#define TLBnPS_16K 0x00000010 +#define TLBnPS_32K 0x00000020 +#define TLBnPS_64K 0x00000040 +#define TLBnPS_128K 0x00000080 +#define TLBnPS_256K 0x00000100 +#define TLBnPS_512K 0x00000200 +#define TLBnPS_1M 0x00000400 +#define TLBnPS_2M 0x00000800 +#define TLBnPS_4M 0x00001000 +#define TLBnPS_8M 0x00002000 +#define TLBnPS_16M 0x00004000 +#define TLBnPS_32M 0x00008000 +#define TLBnPS_64M 0x00010000 +#define TLBnPS_128M 0x00020000 +#define TLBnPS_256M 0x00040000 +#define TLBnPS_512M 0x00080000 +#define TLBnPS_1G 0x00100000 +#define TLBnPS_2G 0x00200000 +#define TLBnPS_4G 0x00400000 +#define TLBnPS_8G 0x00800000 +#define TLBnPS_16G 0x01000000 +#define TLBnPS_32G 0x02000000 +#define TLBnPS_64G 0x04000000 +#define TLBnPS_128G 0x08000000 +#define TLBnPS_256G 0x10000000 + +/* tlbilx action encoding */ +#define TLBILX_T_ALL 0 +#define TLBILX_T_TID 1 +#define TLBILX_T_FULLMATCH 3 +#define TLBILX_T_CLASS0 4 +#define TLBILX_T_CLASS1 5 +#define TLBILX_T_CLASS2 6 +#define TLBILX_T_CLASS3 7 #ifndef __ASSEMBLY__ -- cgit v1.2.3-70-g09d2 From 44c58ccc8dc25f78a4f641901f17092c93dd0458 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:20 +0000 Subject: powerpc: Modify some ppc_asm.h macros to accomodate 64-bits Book3E The way I intend to use tophys/tovirt on 64-bit BookE is different from the "trick" that we currently play for 32-bit BookE so change the condition of definition of these macros to make it so. Also, make sure we only use rfid and mtmsrd instead of rfi and mtmsr for 64-bit server processors, not all 64-bit processors. Signed-off-by: Benjamin Herrenschmidt Acked-by: Kumar Gala --- arch/powerpc/include/asm/ppc_asm.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index f9729529c20..dfae6e916df 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -375,8 +375,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #define PPC440EP_ERR42 #endif - -#if defined(CONFIG_BOOKE) +/* + * toreal/fromreal/tophys/tovirt macros. 32-bit BookE makes them + * keep the address intact to be compatible with code shared with + * 32-bit classic. + * + * On the other hand, I find it useful to have them behave as expected + * by their name (ie always do the addition) on 64-bit BookE + */ +#if defined(CONFIG_BOOKE) && !defined(CONFIG_PPC64) #define toreal(rd) #define fromreal(rd) @@ -426,10 +433,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) .previous #endif -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 #define RFI rfid #define MTMSRD(r) mtmsrd r - #else #define FIX_SRR1(ra, rb) #ifndef CONFIG_40x -- cgit v1.2.3-70-g09d2 From d4e167da4cb60910f6ac305aee03714937f70b71 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:24 +0000 Subject: powerpc/mm: Make low level TLB flush ops on BookE take additional args We need to pass down whether the page is direct or indirect and we'll need to pass the page size to _tlbil_va and _tlbivax_bcast We also add a new low level _tlbil_pid_noind() which does a TLB flush by PID but avoids flushing indirect entries if possible This implements those new prototypes but defines them with inlines or macros so that no additional arguments are actually passed on current processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/tlbflush.h | 11 ++++++++-- arch/powerpc/mm/mmu_decl.h | 16 +++++++++++--- arch/powerpc/mm/tlb_nohash.c | 42 +++++++++++++++++++++++++++---------- arch/powerpc/mm/tlb_nohash_low.S | 6 +++--- 4 files changed, 56 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index abbe3419d1d..d50a380b2b6 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -6,7 +6,7 @@ * * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page - * - local_flush_tlb_mm(mm) flushes the specified mm context on + * - local_flush_tlb_mm(mm, full) flushes the specified mm context on * the local processor * - local_flush_tlb_page(vma, vmaddr) flushes one page on the local processor * - flush_tlb_page_nohash(vma, vmaddr) flushes one page if SW loaded TLB @@ -29,7 +29,8 @@ * specific tlbie's */ -#include +struct vm_area_struct; +struct mm_struct; #define MMU_NO_CONTEXT ((unsigned int)-1) @@ -40,12 +41,18 @@ extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void local_flush_tlb_mm(struct mm_struct *mm); extern void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind); + #ifdef CONFIG_SMP extern void flush_tlb_mm(struct mm_struct *mm); extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind); #else #define flush_tlb_mm(mm) local_flush_tlb_mm(mm) #define flush_tlb_page(vma,addr) local_flush_tlb_page(vma,addr) +#define __flush_tlb_page(mm,addr,p,i) __local_flush_tlb_page(mm,addr,p,i) #endif #define flush_tlb_page_nohash(vma,addr) flush_tlb_page(vma,addr) diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d1f9c62dc17..3871dceee2d 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -36,21 +36,30 @@ static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); } +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) + #else /* CONFIG_40x || CONFIG_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif /* !(CONFIG_40x || CONFIG_8xx) */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ #ifdef CONFIG_8xx -static inline void _tlbil_va(unsigned long address, unsigned int pid) +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); } #else /* CONFIG_8xx */ -extern void _tlbil_va(unsigned long address, unsigned int pid); +extern void __tlbil_va(unsigned long address, unsigned int pid); +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + __tlbil_va(address, pid); +} #endif /* CONIFG_8xx */ /* @@ -58,7 +67,8 @@ extern void _tlbil_va(unsigned long address, unsigned int pid); * implementation. When that becomes the case, this will be * an extern. */ -static inline void _tlbivax_bcast(unsigned long address, unsigned int pid) +static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { BUG(); } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d908e75cc3b..761e8882416 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -67,18 +67,24 @@ void local_flush_tlb_mm(struct mm_struct *mm) } EXPORT_SYMBOL(local_flush_tlb_mm); -void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) { unsigned int pid; preempt_disable(); - pid = vma ? vma->vm_mm->context.id : 0; + pid = mm ? mm->context.id : 0; if (pid != MMU_NO_CONTEXT) - _tlbil_va(vmaddr, pid); + _tlbil_va(vmaddr, pid, tsize, ind); preempt_enable(); } -EXPORT_SYMBOL(local_flush_tlb_page); +void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + 0 /* tsize unused for now */, 0); +} +EXPORT_SYMBOL(local_flush_tlb_page); /* * And here are the SMP non-local implementations @@ -96,6 +102,8 @@ static int mm_is_core_local(struct mm_struct *mm) struct tlb_flush_param { unsigned long addr; unsigned int pid; + unsigned int tsize; + unsigned int ind; }; static void do_flush_tlb_mm_ipi(void *param) @@ -109,7 +117,7 @@ static void do_flush_tlb_page_ipi(void *param) { struct tlb_flush_param *p = param; - _tlbil_va(p->addr, p->pid); + _tlbil_va(p->addr, p->pid, p->tsize, p->ind); } @@ -149,37 +157,49 @@ void flush_tlb_mm(struct mm_struct *mm) } EXPORT_SYMBOL(flush_tlb_mm); -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) { struct cpumask *cpu_mask; unsigned int pid; preempt_disable(); - pid = vma ? vma->vm_mm->context.id : 0; + pid = mm ? mm->context.id : 0; if (unlikely(pid == MMU_NO_CONTEXT)) goto bail; - cpu_mask = mm_cpumask(vma->vm_mm); + cpu_mask = mm_cpumask(mm); if (!mm_is_core_local(mm)) { /* If broadcast tlbivax is supported, use it */ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); if (lock) spin_lock(&tlbivax_lock); - _tlbivax_bcast(vmaddr, pid); + _tlbivax_bcast(vmaddr, pid, tsize, ind); if (lock) spin_unlock(&tlbivax_lock); goto bail; } else { - struct tlb_flush_param p = { .pid = pid, .addr = vmaddr }; + struct tlb_flush_param p = { + .pid = pid, + .addr = vmaddr, + .tsize = tsize, + .ind = ind, + }; /* Ignores smp_processor_id() even if set in cpu_mask */ smp_call_function_many(cpu_mask, do_flush_tlb_page_ipi, &p, 1); } } - _tlbil_va(vmaddr, pid); + _tlbil_va(vmaddr, pid, tsize, ind); bail: preempt_enable(); } + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + 0 /* tsize unused for now */, 0); +} EXPORT_SYMBOL(flush_tlb_page); #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 3037911279b..c7d89a0adba 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -39,7 +39,7 @@ /* * 40x implementation needs only tlbil_va */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) /* We run the search with interrupts disabled because we have to change * the PID and I don't want to preempt when that happens. */ @@ -71,7 +71,7 @@ _GLOBAL(_tlbil_va) * 440 implementation uses tlbsx/we for tlbil_va and a full sweep * of the TLB for everything else. */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) mfspr r5,SPRN_MMUCR rlwimi r5,r4,0,24,31 /* Set TID */ @@ -170,7 +170,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX) * Flush MMU TLB for a particular address, but only on the local processor * (no broadcast) */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) mfmsr r10 wrteei 0 slwi r4,r4,16 -- cgit v1.2.3-70-g09d2 From 6f0ef0f505af1ce6e9756087a9d4cc3778bae8c6 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:26 +0000 Subject: powerpc/mm: Call mmu_context_init() from ppc64 Our 64-bit hash context handling has no init function, but 64-bit Book3E will use the common mmu_context_nohash.c code which does, so define an empty inline mmu_context_init() for 64-bit server and call it from our 64-bit setup_arch() Signed-off-by: Benjamin Herrenschmidt Acked-by: Kumar Gala --- arch/powerpc/include/asm/mmu_context.h | 7 ++++++- arch/powerpc/kernel/setup_64.c | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b7063669f97..8dffed31701 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -14,7 +14,6 @@ /* * Most if the context management is out of line */ -extern void mmu_context_init(void); extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm); extern void destroy_context(struct mm_struct *mm); @@ -23,6 +22,12 @@ extern void switch_stab(struct task_struct *tsk, struct mm_struct *mm); extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void mmu_context_init(void) { } +#else +extern void mmu_context_init(void); +#endif + /* * switch_mm is the entry point called from the architecture independent * code in kernel/sched.c diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 91b89b8d63d..325dc5b2e62 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -534,6 +534,10 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); + + /* Initialize the MMU context management stuff */ + mmu_context_init(); + ppc64_boot_msg(0x15, "Setup Done"); } -- cgit v1.2.3-70-g09d2 From cf54dc7cd4f9aab55cd3e1794b0b74c3c88cd1a0 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:28 +0000 Subject: powerpc: Move definitions of secondary CPU spinloop to header file Those definitions are currently declared extern in the .c file where they are used, move them to a header file instead. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/smp.h | 9 +++++++++ arch/powerpc/kernel/prom_init.c | 4 ---- arch/powerpc/kernel/setup_64.c | 3 --- arch/powerpc/platforms/85xx/smp.c | 1 - arch/powerpc/platforms/86xx/mpc86xx_smp.c | 1 - arch/powerpc/platforms/cell/smp.c | 2 -- arch/powerpc/platforms/pseries/smp.c | 2 -- 7 files changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index c25f73d1d84..e782f43ee66 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -148,6 +148,15 @@ extern struct smp_ops_t *smp_ops; extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi(cpumask_t mask); +/* Definitions relative to the secondary CPU spin loop + * and entry point. Not all of them exist on both 32 and + * 64-bit but defining them all here doesn't harm + */ +extern void generic_secondary_smp_init(void); +extern unsigned long __secondary_hold_spinloop; +extern unsigned long __secondary_hold_acknowledge; +extern char __secondary_hold; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a538824616f..d942404779c 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1259,10 +1259,6 @@ static void __init prom_initialize_tce_table(void) * * -- Cort */ -extern char __secondary_hold; -extern unsigned long __secondary_hold_spinloop; -extern unsigned long __secondary_hold_acknowledge; - /* * We want to reference the copy of __secondary_hold_* in the * 0 - 0x100 address range diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 325dc5b2e62..a6b6c4c9ae4 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -230,9 +230,6 @@ void early_setup_secondary(void) #endif /* CONFIG_SMP */ #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC) -extern unsigned long __secondary_hold_spinloop; -extern void generic_secondary_smp_init(void); - void smp_release_cpus(void) { unsigned long *ptr; diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 62c592ede64..9f526ba31c1 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -25,7 +25,6 @@ #include -extern volatile unsigned long __secondary_hold_acknowledge; extern void __early_start(void); #define BOOT_ENTRY_ADDR_UPPER 0 diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c index d84bbb508ee..eacea0e3fcc 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c @@ -27,7 +27,6 @@ #include "mpc86xx.h" extern void __secondary_start_mpc86xx(void); -extern unsigned long __secondary_hold_acknowledge; #define MCM_PORT_CONFIG_OFFSET 0x10 diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index bc97fada48c..f774530075b 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -58,8 +58,6 @@ */ static cpumask_t of_spin_map; -extern void generic_secondary_smp_init(unsigned long); - /** * smp_startup_cpu() - start the given cpu * diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 1f8f6cfb94f..440000cc713 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -56,8 +56,6 @@ */ static cpumask_t of_spin_map; -extern void generic_secondary_smp_init(unsigned long); - /** * smp_startup_cpu() - start the given cpu * -- cgit v1.2.3-70-g09d2 From c7cc58a1ad8dfe3c199d3b6ce50412b86dd3edaf Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:28 +0000 Subject: powerpc/mm: Rework & cleanup page table freeing code path That patch used to just add a hook to page table flushing but pulling that string brought out a whole bunch of issues, so it now does that and more: - We now make the RCU batching of page freeing SMP only, as I believe it was intended initially. We make a few more things compile to nothing on !CONFIG_SMP - Some macros are turned into functions, though that forced me to out of line a few stuffs due to unsolvable include depenencies, however it's probably better that way anyway, it's not -that- critical code path. - 32-bit didn't call pte_free_finish() on tlb_flush() which means that it wouldn't push out the batch to RCU for delayed freeing when a bunch of page tables have been freed, they would just stay in there until the batch gets full. 64-bit BookE will use that hook to maintain the virtually linear page tables or the indirect entries in the TLB when using the HW loader. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgalloc.h | 39 +++++++++++++++++++++++++++----------- arch/powerpc/include/asm/tlb.h | 38 +++---------------------------------- arch/powerpc/mm/pgtable.c | 10 ++++++++++ arch/powerpc/mm/tlb_hash32.c | 3 +++ arch/powerpc/mm/tlb_hash64.c | 15 +++++++++++++++ arch/powerpc/mm/tlb_nohash.c | 8 ++++++++ 6 files changed, 67 insertions(+), 46 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 1730e5e298d..34b080671f0 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -4,6 +4,15 @@ #include +#ifdef CONFIG_PPC_BOOK3E +extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); +#else /* CONFIG_PPC_BOOK3E */ +static inline void tlb_flush_pgtable(struct mmu_gather *tlb, + unsigned long address) +{ +} +#endif /* !CONFIG_PPC_BOOK3E */ + static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); @@ -35,19 +44,27 @@ static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, #include #endif -extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); - #ifdef CONFIG_SMP -#define __pte_free_tlb(tlb,ptepage,address) \ -do { \ - pgtable_page_dtor(ptepage); \ - pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \ - PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \ -} while (0) -#else -#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, (pte)) -#endif +extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); +extern void pte_free_finish(void); +#else /* CONFIG_SMP */ +static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) +{ + pgtable_free(pgf); +} +static inline void pte_free_finish(void) { } +#endif /* !CONFIG_SMP */ +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, + unsigned long address) +{ + pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage), + PTE_NONCACHE_NUM, + PTE_TABLE_SIZE-1); + tlb_flush_pgtable(tlb, address); + pgtable_page_dtor(ptepage); + pgtable_free_tlb(tlb, pgf); +} #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index e20ff7541f3..e2b428b0f7b 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -25,57 +25,25 @@ #include -struct mmu_gather; - #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) -#if !defined(CONFIG_PPC_STD_MMU) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - -#elif defined(__powerpc64__) - -extern void pte_free_finish(void); - -static inline void tlb_flush(struct mmu_gather *tlb) -{ - struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch); - - /* If there's a TLB batch pending, then we must flush it because the - * pages are going to be freed and we really don't want to have a CPU - * access a freed page because it has a stale TLB - */ - if (tlbbatch->index) - __flush_tlb_pending(tlbbatch); - - pte_free_finish(); -} - -#else - extern void tlb_flush(struct mmu_gather *tlb); -#endif - /* Get the generic bits... */ #include -#if !defined(CONFIG_PPC_STD_MMU) || defined(__powerpc64__) - -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) - -#else extern void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long address); static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, - unsigned long address) + unsigned long address) { +#ifdef CONFIG_PPC_STD_MMU_32 if (pte_val(*ptep) & _PAGE_HASHPTE) flush_hash_entry(tlb->mm, ptep, address); +#endif } -#endif #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_TLB_H */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 627767d6169..a65979a5f75 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,14 @@ #include #include +#ifdef CONFIG_SMP + +/* + * Handle batching of page table freeing on SMP. Page tables are + * queued up and send to be freed later by RCU in order to avoid + * freeing a page table page that is being walked without locks + */ + static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); static unsigned long pte_freelist_forced_free; @@ -116,6 +124,8 @@ void pte_free_finish(void) *batchp = NULL; } +#endif /* CONFIG_SMP */ + /* * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags() */ diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 65190587a36..8aaa8b7eb32 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -71,6 +71,9 @@ void tlb_flush(struct mmu_gather *tlb) */ _tlbia(); } + + /* Push out batch of freed page tables */ + pte_free_finish(); } /* diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 937eb90677d..8e35a606693 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -154,6 +154,21 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } +void tlb_flush(struct mmu_gather *tlb) +{ + struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch); + + /* If there's a TLB batch pending, then we must flush it because the + * pages are going to be freed and we really don't want to have a CPU + * access a freed page because it has a stale TLB + */ + if (tlbbatch->index) + __flush_tlb_pending(tlbbatch); + + /* Push out batch of freed page tables */ + pte_free_finish(); +} + /** * __flush_hash_table_range - Flush all HPTEs for a given address range * from the hash table (and the TLB). But keeps diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 761e8882416..6b43fc49f10 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -233,3 +233,11 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_mm(vma->vm_mm); } EXPORT_SYMBOL(flush_tlb_range); + +void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); + + /* Push out batch of freed page tables */ + pte_free_finish(); +} -- cgit v1.2.3-70-g09d2 From 0257c99cdfaca53a881339e1cbca638c61569b05 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:34 +0000 Subject: powerpc: Add SPR definitions for new 64-bit BookE This adds various SPRs defined on 64-bit BookE, along with changes to the definition of the base MSR values to add the values needed for 64-bit Book3E. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/reg.h | 10 +++------ arch/powerpc/include/asm/reg_booke.h | 42 +++++++++++++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 2cedbb42761..c8715331e1b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -98,19 +98,15 @@ #define MSR_RI __MASK(MSR_RI_LG) /* Recoverable Exception */ #define MSR_LE __MASK(MSR_LE_LG) /* Little Endian */ -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC_BOOK3S_64) +/* Server variant */ #define MSR_ MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_ISF |MSR_HV #define MSR_KERNEL MSR_ | MSR_SF - #define MSR_USER32 MSR_ | MSR_PR | MSR_EE #define MSR_USER64 MSR_USER32 | MSR_SF - -#else /* 32-bit */ +#elif defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_8xx) /* Default MSR for kernel mode. */ -#ifndef MSR_KERNEL /* reg_booke.h also defines this */ #define MSR_KERNEL (MSR_ME|MSR_RI|MSR_IR|MSR_DR) -#endif - #define MSR_USER (MSR_KERNEL|MSR_PR|MSR_EE) #endif diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 6bcf364cbb2..2c9c706e644 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -18,18 +18,26 @@ #define MSR_IS MSR_IR /* Instruction Space */ #define MSR_DS MSR_DR /* Data Space */ #define MSR_PMM (1<<2) /* Performance monitor mark bit */ +#define MSR_CM (1<<31) /* Computation Mode (0=32-bit, 1=64-bit) */ -/* Default MSR for kernel mode. */ -#if defined (CONFIG_40x) +#if defined(CONFIG_PPC_BOOK3E_64) +#define MSR_ MSR_ME | MSR_CE +#define MSR_KERNEL MSR_ | MSR_CM +#define MSR_USER32 MSR_ | MSR_PR | MSR_EE +#define MSR_USER64 MSR_USER32 | MSR_CM +#elif defined (CONFIG_40x) #define MSR_KERNEL (MSR_ME|MSR_RI|MSR_IR|MSR_DR|MSR_CE) -#elif defined(CONFIG_BOOKE) +#define MSR_USER (MSR_KERNEL|MSR_PR|MSR_EE) +#else #define MSR_KERNEL (MSR_ME|MSR_RI|MSR_CE) +#define MSR_USER (MSR_KERNEL|MSR_PR|MSR_EE) #endif /* Special Purpose Registers (SPRNs)*/ #define SPRN_DECAR 0x036 /* Decrementer Auto Reload Register */ #define SPRN_IVPR 0x03F /* Interrupt Vector Prefix Register */ #define SPRN_USPRG0 0x100 /* User Special Purpose Register General 0 */ +#define SPRN_SPRG3R 0x103 /* Special Purpose Register General 3 Read */ #define SPRN_SPRG4R 0x104 /* Special Purpose Register General 4 Read */ #define SPRN_SPRG5R 0x105 /* Special Purpose Register General 5 Read */ #define SPRN_SPRG6R 0x106 /* Special Purpose Register General 6 Read */ @@ -38,11 +46,18 @@ #define SPRN_SPRG5W 0x115 /* Special Purpose Register General 5 Write */ #define SPRN_SPRG6W 0x116 /* Special Purpose Register General 6 Write */ #define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */ +#define SPRN_EPCR 0x133 /* Embedded Processor Control Register */ #define SPRN_DBCR2 0x136 /* Debug Control Register 2 */ #define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */ #define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */ #define SPRN_DVC1 0x13E /* Data Value Compare Register 1 */ #define SPRN_DVC2 0x13F /* Data Value Compare Register 2 */ +#define SPRN_MAS8 0x155 /* MMU Assist Register 8 */ +#define SPRN_TLB0PS 0x158 /* TLB 0 Page Size Register */ +#define SPRN_MAS5_MAS6 0x15c /* MMU Assist Register 5 || 6 */ +#define SPRN_MAS8_MAS1 0x15d /* MMU Assist Register 8 || 1 */ +#define SPRN_MAS7_MAS3 0x174 /* MMU Assist Register 7 || 3 */ +#define SPRN_MAS0_MAS1 0x175 /* MMU Assist Register 0 || 1 */ #define SPRN_IVOR0 0x190 /* Interrupt Vector Offset Register 0 */ #define SPRN_IVOR1 0x191 /* Interrupt Vector Offset Register 1 */ #define SPRN_IVOR2 0x192 /* Interrupt Vector Offset Register 2 */ @@ -425,6 +440,27 @@ #define SGR_NORMAL 0 /* Speculative fetching allowed. */ #define SGR_GUARDED 1 /* Speculative fetching disallowed. */ +/* Bit definitions for EPCR */ +#define SPRN_EPCR_EXTGS 0x80000000 /* External Input interrupt + * directed to Guest state */ +#define SPRN_EPCR_DTLBGS 0x40000000 /* Data TLB Error interrupt + * directed to guest state */ +#define SPRN_EPCR_ITLBGS 0x20000000 /* Instr. TLB error interrupt + * directed to guest state */ +#define SPRN_EPCR_DSIGS 0x10000000 /* Data Storage interrupt + * directed to guest state */ +#define SPRN_EPCR_ISIGS 0x08000000 /* Instr. Storage interrupt + * directed to guest state */ +#define SPRN_EPCR_DUVD 0x04000000 /* Disable Hypervisor Debug */ +#define SPRN_EPCR_ICM 0x02000000 /* Interrupt computation mode + * (copied to MSR:CM on intr) */ +#define SPRN_EPCR_GICM 0x01000000 /* Guest Interrupt Comp. mode */ +#define SPRN_EPCR_DGTMI 0x00800000 /* Disable TLB Guest Management + * instructions */ +#define SPRN_EPCR_DMIUH 0x00400000 /* Disable MAS Interrupt updates + * for hypervisor */ + + /* * The IBM-403 is an even more odd special case, as it is much * older than the IBM-405 series. We put these down here incase someone -- cgit v1.2.3-70-g09d2 From 57e2a99f74b0d3720c97a6aadb57ae6aad3c61ea Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Jul 2009 11:59:34 +1000 Subject: powerpc: Add memory management headers for new 64-bit BookE This adds the PTE and pgtable format definitions, along with changes to the kernel memory map and other definitions related to implementing support for 64-bit Book3E. This also shields some asm-offset bits that are currently only relevant on 32-bit We also move the definition of the "linux" page size constants to the common mmu.h file and add a few sizes that are relevant to embedded processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-book3e.h | 27 ++++++++++++ arch/powerpc/include/asm/mmu-hash64.h | 20 --------- arch/powerpc/include/asm/mmu.h | 37 +++++++++++++++++ arch/powerpc/include/asm/page.h | 4 ++ arch/powerpc/include/asm/page_64.h | 10 +++++ arch/powerpc/include/asm/pgtable-ppc64.h | 61 ++++++++++++++++++++-------- arch/powerpc/include/asm/pte-book3e.h | 70 ++++++++++++++++++++++++++++++++ arch/powerpc/include/asm/pte-common.h | 3 ++ arch/powerpc/kernel/asm-offsets.c | 5 ++- arch/powerpc/mm/hugetlbpage.c | 8 +++- 10 files changed, 205 insertions(+), 40 deletions(-) create mode 100644 arch/powerpc/include/asm/pte-book3e.h (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 42a39b4aace..6ddbe48d07f 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -170,6 +170,33 @@ typedef struct { unsigned int active; unsigned long vdso_base; } mm_context_t; + +/* Page size definitions, common between 32 and 64-bit + * + * shift : is the "PAGE_SHIFT" value for that page size + * penc : is the pte encoding mask + * + */ +struct mmu_psize_def +{ + unsigned int shift; /* number of bits */ + unsigned int enc; /* PTE encoding */ +}; +extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + +/* The page sizes use the same names as 64-bit hash but are + * constants + */ +#if defined(CONFIG_PPC_4K_PAGES) +#define mmu_virtual_psize MMU_PAGE_4K +#elif defined(CONFIG_PPC_64K_PAGES) +#define mmu_virtual_psize MMU_PAGE_64K +#else +#error Unsupported page size +#endif + +extern int mmu_linear_psize; + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */ diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 98c104a0996..b537903b9fc 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -138,26 +138,6 @@ struct mmu_psize_def #endif /* __ASSEMBLY__ */ -/* - * The kernel use the constants below to index in the page sizes array. - * The use of fixed constants for this purpose is better for performances - * of the low level hash refill handlers. - * - * A non supported page size has a "shift" field set to 0 - * - * Any new page size being implemented can get a new entry in here. Whether - * the kernel will use it or not is a different matter though. The actual page - * size used by hugetlbfs is not defined here and may be made variable - */ - -#define MMU_PAGE_4K 0 /* 4K */ -#define MMU_PAGE_64K 1 /* 64K */ -#define MMU_PAGE_64K_AP 2 /* 64K Admixed (in a 4K segment) */ -#define MMU_PAGE_1M 3 /* 1M */ -#define MMU_PAGE_16M 4 /* 16M */ -#define MMU_PAGE_16G 5 /* 16G */ -#define MMU_PAGE_COUNT 6 - /* * Segment sizes. * These are the values used by hardware in the B field of diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index fb57ded592f..2fcfefc6089 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -17,6 +17,7 @@ #define MMU_FTR_TYPE_40x ASM_CONST(0x00000004) #define MMU_FTR_TYPE_44x ASM_CONST(0x00000008) #define MMU_FTR_TYPE_FSL_E ASM_CONST(0x00000010) +#define MMU_FTR_TYPE_3E ASM_CONST(0x00000020) /* * This is individual features @@ -73,6 +74,41 @@ extern void early_init_mmu_secondary(void); #endif /* !__ASSEMBLY__ */ +/* The kernel use the constants below to index in the page sizes array. + * The use of fixed constants for this purpose is better for performances + * of the low level hash refill handlers. + * + * A non supported page size has a "shift" field set to 0 + * + * Any new page size being implemented can get a new entry in here. Whether + * the kernel will use it or not is a different matter though. The actual page + * size used by hugetlbfs is not defined here and may be made variable + * + * Note: This array ended up being a false good idea as it's growing to the + * point where I wonder if we should replace it with something different, + * to think about, feedback welcome. --BenH. + */ + +/* There are #define as they have to be used in assembly + * + * WARNING: If you change this list, make sure to update the array of + * names currently in arch/powerpc/mm/hugetlbpage.c or bad things will + * happen + */ +#define MMU_PAGE_4K 0 +#define MMU_PAGE_16K 1 +#define MMU_PAGE_64K 2 +#define MMU_PAGE_64K_AP 3 /* "Admixed pages" (hash64 only) */ +#define MMU_PAGE_256K 4 +#define MMU_PAGE_1M 5 +#define MMU_PAGE_8M 6 +#define MMU_PAGE_16M 7 +#define MMU_PAGE_256M 8 +#define MMU_PAGE_1G 9 +#define MMU_PAGE_16G 10 +#define MMU_PAGE_64G 11 +#define MMU_PAGE_COUNT 12 + #if defined(CONFIG_PPC_STD_MMU_64) /* 64-bit classic hash table MMU */ @@ -94,5 +130,6 @@ extern void early_init_mmu_secondary(void); # include #endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 4940662ee87..ff24254990e 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -139,7 +139,11 @@ extern phys_addr_t kernstart_addr; * Don't compare things with KERNELBASE or PAGE_OFFSET to test for * "kernelness", use is_kernel_addr() - it should do what you want. */ +#ifdef CONFIG_PPC_BOOK3E_64 +#define is_kernel_addr(x) ((x) >= 0x8000000000000000ul) +#else #define is_kernel_addr(x) ((x) >= PAGE_OFFSET) +#endif #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 5817a3b747e..3f17b83f55a 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -135,12 +135,22 @@ extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, #endif /* __ASSEMBLY__ */ #else #define slice_init() +#ifdef CONFIG_PPC_STD_MMU_64 #define get_slice_psize(mm, addr) ((mm)->context.user_psize) #define slice_set_user_psize(mm, psize) \ do { \ (mm)->context.user_psize = (psize); \ (mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \ } while (0) +#else /* CONFIG_PPC_STD_MMU_64 */ +#ifdef CONFIG_PPC_64K_PAGES +#define get_slice_psize(mm, addr) MMU_PAGE_64K +#else /* CONFIG_PPC_64K_PAGES */ +#define get_slice_psize(mm, addr) MMU_PAGE_4K +#endif /* !CONFIG_PPC_64K_PAGES */ +#define slice_set_user_psize(mm, psize) do { BUG(); } while(0) +#endif /* !CONFIG_PPC_STD_MMU_64 */ + #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) #define slice_mm_new_context(mm) 1 diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 8cd083c6150..7254c5a3187 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -5,11 +5,6 @@ * the ppc64 hashed page table. */ -#ifndef __ASSEMBLY__ -#include -#include -#endif /* __ASSEMBLY__ */ - #ifdef CONFIG_PPC_64K_PAGES #include #else @@ -38,26 +33,46 @@ #endif /* - * Define the address range of the vmalloc VM area. + * Define the address range of the kernel non-linear virtual area + */ + +#ifdef CONFIG_PPC_BOOK3E +#define KERN_VIRT_START ASM_CONST(0x8000000000000000) +#else +#define KERN_VIRT_START ASM_CONST(0xD000000000000000) +#endif +#define KERN_VIRT_SIZE PGTABLE_RANGE + +/* + * The vmalloc space starts at the beginning of that region, and + * occupies half of it on hash CPUs and a quarter of it on Book3E */ -#define VMALLOC_START ASM_CONST(0xD000000000000000) -#define VMALLOC_SIZE (PGTABLE_RANGE >> 1) -#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) +#define VMALLOC_START KERN_VIRT_START +#ifdef CONFIG_PPC_BOOK3E +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 2) +#else +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) +#endif +#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* - * Define the address ranges for MMIO and IO space : + * The second half of the kernel virtual space is used for IO mappings, + * it's itself carved into the PIO region (ISA and PHB IO space) and + * the ioremap space * - * ISA_IO_BASE = VMALLOC_END, 64K reserved area + * ISA_IO_BASE = KERN_IO_START, 64K reserved area * PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE */ +#define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1)) #define FULL_IO_SIZE 0x80000000ul -#define ISA_IO_BASE (VMALLOC_END) -#define ISA_IO_END (VMALLOC_END + 0x10000ul) +#define ISA_IO_BASE (KERN_IO_START) +#define ISA_IO_END (KERN_IO_START + 0x10000ul) #define PHB_IO_BASE (ISA_IO_END) -#define PHB_IO_END (VMALLOC_END + FULL_IO_SIZE) +#define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) -#define IOREMAP_END (VMALLOC_START + PGTABLE_RANGE) +#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) + /* * Region IDs @@ -72,19 +87,28 @@ #define USER_REGION_ID (0UL) /* - * Defines the address of the vmemap area, in its own region + * Defines the address of the vmemap area, in its own region on + * hash table CPUs and after the vmalloc space on Book3E */ +#ifdef CONFIG_PPC_BOOK3E +#define VMEMMAP_BASE VMALLOC_END +#define VMEMMAP_END KERN_IO_START +#else #define VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) +#endif #define vmemmap ((struct page *)VMEMMAP_BASE) /* * Include the PTE bits definitions */ +#ifdef CONFIG_PPC_BOOK3S #include +#else +#include +#endif #include - #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN @@ -92,6 +116,9 @@ #ifndef __ASSEMBLY__ +#include +#include + /* * This is the default implementation of various PTE accessors, it's * used in all cases except Book3S with 64K pages where we have a diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h new file mode 100644 index 00000000000..1d27c77d770 --- /dev/null +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -0,0 +1,70 @@ +#ifndef _ASM_POWERPC_PTE_BOOK3E_H +#define _ASM_POWERPC_PTE_BOOK3E_H +#ifdef __KERNEL__ + +/* PTE bit definitions for processors compliant to the Book3E + * architecture 2.06 or later. The position of the PTE bits + * matches the HW definition of the optional Embedded Page Table + * category. + */ + +/* Architected bits */ +#define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ +#define _PAGE_FILE 0x000002 /* (!present only) software: pte holds file offset */ +#define _PAGE_SW1 0x000002 +#define _PAGE_BAP_SR 0x000004 +#define _PAGE_BAP_UR 0x000008 +#define _PAGE_BAP_SW 0x000010 +#define _PAGE_BAP_UW 0x000020 +#define _PAGE_BAP_SX 0x000040 +#define _PAGE_BAP_UX 0x000080 +#define _PAGE_PSIZE_MSK 0x000f00 +#define _PAGE_PSIZE_4K 0x000200 +#define _PAGE_PSIZE_64K 0x000600 +#define _PAGE_PSIZE_1M 0x000a00 +#define _PAGE_PSIZE_16M 0x000e00 +#define _PAGE_DIRTY 0x001000 /* C: page changed */ +#define _PAGE_SW0 0x002000 +#define _PAGE_U3 0x004000 +#define _PAGE_U2 0x008000 +#define _PAGE_U1 0x010000 +#define _PAGE_U0 0x020000 +#define _PAGE_ACCESSED 0x040000 +#define _PAGE_LENDIAN 0x080000 +#define _PAGE_GUARDED 0x100000 +#define _PAGE_COHERENT 0x200000 /* M: enforce memory coherence */ +#define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */ +#define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */ + +/* "Higher level" linux bit combinations */ +#define _PAGE_EXEC _PAGE_BAP_SX /* Can be executed from potentially */ +#define _PAGE_HWEXEC _PAGE_BAP_UX /* .. and was cache cleaned */ +#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ +#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) +#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ + +#define _PAGE_HASHPTE 0 +#define _PAGE_BUSY 0 + +#define _PAGE_SPECIAL _PAGE_SW0 + +/* Flags to be preserved on PTE modifications */ +#define _PAGE_HPTEFLAGS _PAGE_BUSY + +/* Base page size */ +#ifdef CONFIG_PPC_64K_PAGES +#define _PAGE_PSIZE _PAGE_PSIZE_64K +#define PTE_RPN_SHIFT (28) +#else +#define _PAGE_PSIZE _PAGE_PSIZE_4K +#define PTE_RPN_SHIFT (24) +#endif + +/* On 32-bit, we never clear the top part of the PTE */ +#ifdef CONFIG_PPC32 +#define _PTE_NONE_MASK 0xffffffff00000000ULL +#endif + +#endif /* __KERNEL__ */ +#endif /* _ASM_POWERPC_PTE_FSL_BOOKE_H */ diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index a7e210b6b48..8bb6464ba61 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -34,6 +34,9 @@ #ifndef _PAGE_4K_PFN #define _PAGE_4K_PFN 0 #endif +#ifndef _PAGE_SAO +#define _PAGE_SAO 0 +#endif #ifndef _PAGE_PSIZE #define _PAGE_PSIZE 0 #endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 561b6465231..0a9f30b5495 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -52,9 +52,11 @@ #include #endif +#ifdef CONFIG_PPC32 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) #include "head_booke.h" #endif +#endif #if defined(CONFIG_FSL_BOOKE) #include "../mm/mmu_decl.h" @@ -260,6 +262,7 @@ int main(void) DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8); #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_PPC32) #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); @@ -278,7 +281,7 @@ int main(void) DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1)); DEFINE(SAVED_KSP_LIMIT, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, saved_ksp_limit)); #endif - +#endif DEFINE(CLONE_VM, CLONE_VM); DEFINE(CLONE_UNTRACED, CLONE_UNTRACED); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index c46ef2ffa3d..90df6ffe3a4 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,8 +57,10 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ #define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { - "unused_4K", "hugepte_cache_64K", "unused_64K_AP", - "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G" + [MMU_PAGE_64K] = "hugepte_cache_64K", + [MMU_PAGE_1M] = "hugepte_cache_1M", + [MMU_PAGE_16M] = "hugepte_cache_16M", + [MMU_PAGE_16G] = "hugepte_cache_16G", }; /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() @@ -700,6 +702,8 @@ static void __init set_huge_psize(int psize) if (mmu_huge_psizes[psize] || mmu_psize_defs[psize].shift == PAGE_SHIFT) return; + if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) + return; hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); switch (mmu_psize_defs[psize].shift) { -- cgit v1.2.3-70-g09d2 From 13363ab9b9d040ebeace3a1a3a5ddcb13bf0d644 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:39 +0000 Subject: powerpc: Add definitions used by exception handling on 64-bit Book3E This adds various definitions and macros used by the exception and TLB miss handling on 64-bit BookE It also adds the definitions of the SPRGs used for various exception types Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64e.h | 201 +++++++++++++++++++++++++++++++ arch/powerpc/include/asm/reg.h | 19 +++ 2 files changed, 220 insertions(+) create mode 100644 arch/powerpc/include/asm/exception-64e.h (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h new file mode 100644 index 00000000000..94cb3d79d12 --- /dev/null +++ b/arch/powerpc/include/asm/exception-64e.h @@ -0,0 +1,201 @@ +/* + * Definitions for use by exception code on Book3-E + * + * Copyright (C) 2008 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _ASM_POWERPC_EXCEPTION_64E_H +#define _ASM_POWERPC_EXCEPTION_64E_H + +/* + * SPRGs usage an other considerations... + * + * Since TLB miss and other standard exceptions can be interrupted by + * critical exceptions which can themselves be interrupted by machine + * checks, and since the two later can themselves cause a TLB miss when + * hitting the linear mapping for the kernel stacks, we need to be a bit + * creative on how we use SPRGs. + * + * The base idea is that we have one SRPG reserved for critical and one + * for machine check interrupts. Those are used to save a GPR that can + * then be used to get the PACA, and store as much context as we need + * to save in there. That includes saving the SPRGs used by the TLB miss + * handler for linear mapping misses and the associated SRR0/1 due to + * the above re-entrancy issue. + * + * So here's the current usage pattern. It's done regardless of which + * SPRGs are user-readable though, thus we might have to change some of + * this later. In order to do that more easily, we use special constants + * for naming them + * + * WARNING: Some of these SPRGs are user readable. We need to do something + * about it as some point by making sure they can't be used to leak kernel + * critical data + */ + + +/* We are out of SPRGs so we save some things in the PACA. The normal + * exception frame is smaller than the CRIT or MC one though + */ +#define EX_R1 (0 * 8) +#define EX_CR (1 * 8) +#define EX_R10 (2 * 8) +#define EX_R11 (3 * 8) +#define EX_R14 (4 * 8) +#define EX_R15 (5 * 8) + +/* The TLB miss exception uses different slots */ + +#define EX_TLB_R10 ( 0 * 8) +#define EX_TLB_R11 ( 1 * 8) +#define EX_TLB_R12 ( 2 * 8) +#define EX_TLB_R13 ( 3 * 8) +#define EX_TLB_R14 ( 4 * 8) +#define EX_TLB_R15 ( 5 * 8) +#define EX_TLB_R16 ( 6 * 8) +#define EX_TLB_CR ( 7 * 8) +#define EX_TLB_DEAR ( 8 * 8) /* Level 0 and 2 only */ +#define EX_TLB_ESR ( 9 * 8) /* Level 0 and 2 only */ +#define EX_TLB_SRR0 (10 * 8) +#define EX_TLB_SRR1 (11 * 8) +#define EX_TLB_MMUCR0 (12 * 8) /* Level 0 */ +#define EX_TLB_MAS1 (12 * 8) /* Level 0 */ +#define EX_TLB_MAS2 (13 * 8) /* Level 0 */ +#ifdef CONFIG_BOOK3E_MMU_TLB_STATS +#define EX_TLB_R8 (14 * 8) +#define EX_TLB_R9 (15 * 8) +#define EX_TLB_LR (16 * 8) +#define EX_TLB_SIZE (17 * 8) +#else +#define EX_TLB_SIZE (14 * 8) +#endif + +#define START_EXCEPTION(label) \ + .globl exc_##label##_book3e; \ +exc_##label##_book3e: + +/* TLB miss exception prolog + * + * This prolog handles re-entrancy (up to 3 levels supported in the PACA + * though we currently don't test for overflow). It provides you with a + * re-entrancy safe working space of r10...r16 and CR with r12 being used + * as the exception area pointer in the PACA for that level of re-entrancy + * and r13 containing the PACA pointer. + * + * SRR0 and SRR1 are saved, but DEAR and ESR are not, since they don't apply + * as-is for instruction exceptions. It's up to the actual exception code + * to save them as well if required. + */ +#define TLB_MISS_PROLOG \ + mtspr SPRN_SPRG_TLB_SCRATCH,r12; \ + mfspr r12,SPRN_SPRG_TLB_EXFRAME; \ + std r10,EX_TLB_R10(r12); \ + mfcr r10; \ + std r11,EX_TLB_R11(r12); \ + mfspr r11,SPRN_SPRG_TLB_SCRATCH; \ + std r13,EX_TLB_R13(r12); \ + mfspr r13,SPRN_SPRG_PACA; \ + std r14,EX_TLB_R14(r12); \ + addi r14,r12,EX_TLB_SIZE; \ + std r15,EX_TLB_R15(r12); \ + mfspr r15,SPRN_SRR1; \ + std r16,EX_TLB_R16(r12); \ + mfspr r16,SPRN_SRR0; \ + std r10,EX_TLB_CR(r12); \ + std r11,EX_TLB_R12(r12); \ + mtspr SPRN_SPRG_TLB_EXFRAME,r14; \ + std r15,EX_TLB_SRR1(r12); \ + std r16,EX_TLB_SRR0(r12); \ + TLB_MISS_PROLOG_STATS + +/* And these are the matching epilogs that restores things + * + * There are 3 epilogs: + * + * - SUCCESS : Unwinds one level + * - ERROR : restore from level 0 and reset + * - ERROR_SPECIAL : restore from current level and reset + * + * Normal errors use ERROR, that is, they restore the initial fault context + * and trigger a fault. However, there is a special case for linear mapping + * errors. Those should basically never happen, but if they do happen, we + * want the error to point out the context that did that linear mapping + * fault, not the initial level 0 (basically, we got a bogus PGF or something + * like that). For userland errors on the linear mapping, there is no + * difference since those are always level 0 anyway + */ + +#define TLB_MISS_RESTORE(freg) \ + ld r14,EX_TLB_CR(r12); \ + ld r10,EX_TLB_R10(r12); \ + ld r15,EX_TLB_SRR0(r12); \ + ld r16,EX_TLB_SRR1(r12); \ + mtspr SPRN_SPRG_TLB_EXFRAME,freg; \ + ld r11,EX_TLB_R11(r12); \ + mtcr r14; \ + ld r13,EX_TLB_R13(r12); \ + ld r14,EX_TLB_R14(r12); \ + mtspr SPRN_SRR0,r15; \ + ld r15,EX_TLB_R15(r12); \ + mtspr SPRN_SRR1,r16; \ + TLB_MISS_RESTORE_STATS \ + ld r16,EX_TLB_R16(r12); \ + ld r12,EX_TLB_R12(r12); \ + +#define TLB_MISS_EPILOG_SUCCESS \ + TLB_MISS_RESTORE(r12) + +#define TLB_MISS_EPILOG_ERROR \ + addi r12,r13,PACA_EXTLB; \ + TLB_MISS_RESTORE(r12) + +#define TLB_MISS_EPILOG_ERROR_SPECIAL \ + addi r11,r13,PACA_EXTLB; \ + TLB_MISS_RESTORE(r11) + +#ifdef CONFIG_BOOK3E_MMU_TLB_STATS +#define TLB_MISS_PROLOG_STATS \ + mflr r10; \ + std r8,EX_TLB_R8(r12); \ + std r9,EX_TLB_R9(r12); \ + std r10,EX_TLB_LR(r12); +#define TLB_MISS_RESTORE_STATS \ + ld r16,EX_TLB_LR(r12); \ + ld r9,EX_TLB_R9(r12); \ + ld r8,EX_TLB_R8(r12); \ + mtlr r16; +#define TLB_MISS_STATS_D(name) \ + addi r9,r13,MMSTAT_DSTATS+name; \ + bl .tlb_stat_inc; +#define TLB_MISS_STATS_I(name) \ + addi r9,r13,MMSTAT_ISTATS+name; \ + bl .tlb_stat_inc; +#define TLB_MISS_STATS_X(name) \ + ld r8,PACA_EXTLB+EX_TLB_ESR(r13); \ + cmpdi cr2,r8,-1; \ + beq cr2,61f; \ + addi r9,r13,MMSTAT_DSTATS+name; \ + b 62f; \ +61: addi r9,r13,MMSTAT_ISTATS+name; \ +62: bl .tlb_stat_inc; +#define TLB_MISS_STATS_SAVE_INFO \ + std r14,EX_TLB_ESR(r12); /* save ESR */ \ + + +#else +#define TLB_MISS_PROLOG_STATS +#define TLB_MISS_RESTORE_STATS +#define TLB_MISS_STATS_D(name) +#define TLB_MISS_STATS_I(name) +#define TLB_MISS_STATS_X(name) +#define TLB_MISS_STATS_Y(name) +#define TLB_MISS_STATS_SAVE_INFO +#endif + + +#endif /* _ASM_POWERPC_EXCEPTION_64E_H */ + diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c8715331e1b..6315edc205d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -652,6 +652,16 @@ * - SPRG2 scratch for exception vectors * - SPRG3 unused (user visible) * + * 64-bit embedded + * - SPRG0 generic exception scratch + * - SPRG2 TLB exception stack + * - SPRG3 unused (user visible) + * - SPRG4 unused (user visible) + * - SPRG6 TLB miss scratch (user visible, sorry !) + * - SPRG7 critical exception scratch + * - SPRG8 machine check exception scratch + * - SPRG9 debug exception scratch + * * All 32-bit: * - SPRG3 current thread_info pointer * (virtual on BookE, physical on others) @@ -705,6 +715,15 @@ #define SPRN_SPRG_SCRATCH0 SPRN_SPRG2 #endif +#ifdef CONFIG_PPC_BOOK3E_64 +#define SPRN_SPRG_MC_SCRATCH SPRN_SPRG8 +#define SPRN_SPRG_CRIT_SCRATCH SPRN_SPRG7 +#define SPRN_SPRG_DBG_SCRATCH SPRN_SPRG9 +#define SPRN_SPRG_TLB_EXFRAME SPRN_SPRG2 +#define SPRN_SPRG_TLB_SCRATCH SPRN_SPRG6 +#define SPRN_SPRG_GEN_SCRATCH SPRN_SPRG0 +#endif + #ifdef CONFIG_PPC_BOOK3S_32 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 #define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 -- cgit v1.2.3-70-g09d2 From dce6670aaa7efece0558010b48d5ef9d421154be Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:42 +0000 Subject: powerpc: Add PACA fields specific to 64-bit Book3E processors This adds various fields in the PACA that are for use specifically by Book3E processors, such as exception save areas, current pgd pointer, special exceptions kernel stacks etc... Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/paca.h | 23 ++++++++++++++++++++--- arch/powerpc/kernel/asm-offsets.c | 14 ++++++++++++++ arch/powerpc/kernel/paca.c | 3 +++ 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index c8a3cbfe02f..b634456ea89 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -14,9 +14,11 @@ #define _ASM_POWERPC_PACA_H #ifdef __KERNEL__ -#include -#include -#include +#include +#include +#include +#include +#include register struct paca_struct *local_paca asm("r13"); @@ -91,6 +93,21 @@ struct paca_struct { u16 slb_cache[SLB_CACHE_ENTRIES]; #endif /* CONFIG_PPC_STD_MMU_64 */ +#ifdef CONFIG_PPC_BOOK3E + pgd_t *pgd; /* Current PGD */ + pgd_t *kernel_pgd; /* Kernel PGD */ + u64 exgen[8] __attribute__((aligned(0x80))); + u64 extlb[EX_TLB_SIZE*3] __attribute__((aligned(0x80))); + u64 exmc[8]; /* used for machine checks */ + u64 excrit[8]; /* used for crit interrupts */ + u64 exdbg[8]; /* used for debug interrupts */ + + /* Kernel stack pointers for use by special exceptions */ + void *mc_kstack; + void *crit_kstack; + void *dbg_kstack; +#endif /* CONFIG_PPC_BOOK3E */ + mm_context_t context; /* diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0a9f30b5495..b9e010d0fc9 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -140,6 +140,20 @@ int main(void) context.high_slices_psize)); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ + +#ifdef CONFIG_PPC_BOOK3E + DEFINE(PACAPGD, offsetof(struct paca_struct, pgd)); + DEFINE(PACA_KERNELPGD, offsetof(struct paca_struct, kernel_pgd)); + DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); + DEFINE(PACA_EXTLB, offsetof(struct paca_struct, extlb)); + DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc)); + DEFINE(PACA_EXCRIT, offsetof(struct paca_struct, excrit)); + DEFINE(PACA_EXDBG, offsetof(struct paca_struct, exdbg)); + DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack)); + DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack)); + DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack)); +#endif /* CONFIG_PPC_BOOK3E */ + #ifdef CONFIG_PPC_STD_MMU_64 DEFINE(PACASTABREAL, offsetof(struct paca_struct, stab_real)); DEFINE(PACASTABVIRT, offsetof(struct paca_struct, stab_addr)); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e9962c7f8a0..d16b1ea55d4 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -13,6 +13,7 @@ #include #include #include +#include /* This symbol is provided by the linker - let it fill in the paca * field correctly */ @@ -87,6 +88,8 @@ void __init initialise_pacas(void) #ifdef CONFIG_PPC_BOOK3S new_paca->lppaca_ptr = &lppaca[cpu]; +#else + new_paca->kernel_pgd = swapper_pg_dir; #endif new_paca->lock_token = 0x8000; new_paca->paca_index = cpu; -- cgit v1.2.3-70-g09d2 From 25d21ad6e799cccd097b9df2a2fefe19a7e1dfcf Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:47 +0000 Subject: powerpc: Add TLB management code for 64-bit Book3E This adds the TLB miss handler assembly, the low level TLB flush routines along with the necessary hook for dealing with our virtual page tables or indirect TLB entries that need to be flushes when PTE pages are freed. There is currently no support for hugetlbfs Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-40x.h | 3 + arch/powerpc/include/asm/mmu-44x.h | 6 + arch/powerpc/include/asm/mmu-8xx.h | 3 + arch/powerpc/include/asm/mmu-hash32.h | 6 + arch/powerpc/include/asm/mmu_context.h | 8 + arch/powerpc/kernel/setup_64.c | 4 + arch/powerpc/mm/mmu_decl.h | 14 +- arch/powerpc/mm/tlb_low_64e.S | 734 +++++++++++++++++++++++++++++++++ arch/powerpc/mm/tlb_nohash.c | 203 ++++++++- arch/powerpc/mm/tlb_nohash_low.S | 79 ++++ 10 files changed, 1055 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/mm/tlb_low_64e.S (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu-40x.h b/arch/powerpc/include/asm/mmu-40x.h index 776f415a36a..34916865eae 100644 --- a/arch/powerpc/include/asm/mmu-40x.h +++ b/arch/powerpc/include/asm/mmu-40x.h @@ -61,4 +61,7 @@ typedef struct { #endif /* !__ASSEMBLY__ */ +#define mmu_virtual_psize MMU_PAGE_4K +#define mmu_linear_psize MMU_PAGE_256M + #endif /* _ASM_POWERPC_MMU_40X_H_ */ diff --git a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h index 3c86576bfef..0372669383a 100644 --- a/arch/powerpc/include/asm/mmu-44x.h +++ b/arch/powerpc/include/asm/mmu-44x.h @@ -79,16 +79,22 @@ typedef struct { #if (PAGE_SHIFT == 12) #define PPC44x_TLBE_SIZE PPC44x_TLB_4K +#define mmu_virtual_psize MMU_PAGE_4K #elif (PAGE_SHIFT == 14) #define PPC44x_TLBE_SIZE PPC44x_TLB_16K +#define mmu_virtual_psize MMU_PAGE_16K #elif (PAGE_SHIFT == 16) #define PPC44x_TLBE_SIZE PPC44x_TLB_64K +#define mmu_virtual_psize MMU_PAGE_64K #elif (PAGE_SHIFT == 18) #define PPC44x_TLBE_SIZE PPC44x_TLB_256K +#define mmu_virtual_psize MMU_PAGE_256K #else #error "Unsupported PAGE_SIZE" #endif +#define mmu_linear_psize MMU_PAGE_256M + #define PPC44x_PGD_OFF_SHIFT (32 - PGDIR_SHIFT + PGD_T_LOG2) #define PPC44x_PGD_OFF_MASK_BIT (PGDIR_SHIFT - PGD_T_LOG2) #define PPC44x_PTE_ADD_SHIFT (32 - PGDIR_SHIFT + PTE_SHIFT + PTE_T_LOG2) diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h index 07865a35784..3d11d3ce79e 100644 --- a/arch/powerpc/include/asm/mmu-8xx.h +++ b/arch/powerpc/include/asm/mmu-8xx.h @@ -143,4 +143,7 @@ typedef struct { } mm_context_t; #endif /* !__ASSEMBLY__ */ +#define mmu_virtual_psize MMU_PAGE_4K +#define mmu_linear_psize MMU_PAGE_8M + #endif /* _ASM_POWERPC_MMU_8XX_H_ */ diff --git a/arch/powerpc/include/asm/mmu-hash32.h b/arch/powerpc/include/asm/mmu-hash32.h index 16b1a1e77e6..382fc689f20 100644 --- a/arch/powerpc/include/asm/mmu-hash32.h +++ b/arch/powerpc/include/asm/mmu-hash32.h @@ -80,4 +80,10 @@ typedef struct { #endif /* !__ASSEMBLY__ */ +/* We happily ignore the smaller BATs on 601, we don't actually use + * those definitions on hash32 at the moment anyway + */ +#define mmu_virtual_psize MMU_PAGE_4K +#define mmu_linear_psize MMU_PAGE_256M + #endif /* _ASM_POWERPC_MMU_HASH32_H_ */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 8dffed31701..b34e94d9443 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -43,6 +43,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, tsk->thread.pgdir = next->pgd; #endif /* CONFIG_PPC32 */ + /* 64-bit Book3E keeps track of current PGD in the PACA */ +#ifdef CONFIG_PPC_BOOK3E_64 + get_paca()->pgd = next->pgd; +#endif /* Nothing else to do if we aren't actually switching */ if (prev == next) return; @@ -89,6 +93,10 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { + /* 64-bit Book3E keeps track of current PGD in the PACA */ +#ifdef CONFIG_PPC_BOOK3E_64 + get_paca()->pgd = NULL; +#endif } #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a6b6c4c9ae4..65aced7b833 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "setup.h" @@ -147,6 +148,9 @@ void __init setup_paca(int cpu) { local_paca = &paca[cpu]; mtspr(SPRN_SPRG_PACA, local_paca); +#ifdef CONFIG_PPC_BOOK3E + mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb); +#endif } /* diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 3871dceee2d..5961c6b739d 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -41,7 +41,11 @@ static inline void _tlbil_pid(unsigned int pid) #else /* CONFIG_40x || CONFIG_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbil_pid_noind(unsigned int pid); +#else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) +#endif #endif /* !(CONFIG_40x || CONFIG_8xx) */ /* @@ -53,7 +57,10 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); } -#else /* CONFIG_8xx */ +#elif defined(CONFIG_PPC_BOOK3E) +extern void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else extern void __tlbil_va(unsigned long address, unsigned int pid); static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) @@ -67,11 +74,16 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, * implementation. When that becomes the case, this will be * an extern. */ +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { BUG(); } +#endif #else /* CONFIG_PPC_MMU_NOHASH */ diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S new file mode 100644 index 00000000000..10d524ded7b --- /dev/null +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -0,0 +1,734 @@ +/* + * Low leve TLB miss handlers for Book3E + * + * Copyright (C) 2008-2009 + * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PPC_64K_PAGES +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) +#else +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) +#endif +#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) +#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) +#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) + + +/********************************************************************** + * * + * TLB miss handling for Book3E with TLB reservation and HES support * + * * + **********************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* The page tables are mapped virtually linear. At this point, though, + * we don't know whether we are trying to fault in a first level + * virtual address or a virtual page table address. We can get that + * from bit 0x1 of the region ID which we have set for a page table + */ + andi. r10,r15,0x1 + bne- virt_page_table_tlb_miss + + std r14,EX_TLB_ESR(r12); /* save ESR */ + std r16,EX_TLB_DEAR(r12); /* save DEAR */ + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + li r11,_PAGE_PRESENT + oris r11,r11,_PAGE_ACCESSED@h + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r15,0 /* Check for user region */ + + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + rlwimi r11,r14,32-19,27,27 + rlwimi r11,r14,32-16,19,19 + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by writing a crazy value in ESR in our exception frame + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + li r11,_PAGE_PRESENT|_PAGE_HWEXEC /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + + cmpldi cr0,r15,0 /* Check for user region */ + std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of the first-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = faulting address + * r15 = region ID + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ +normal_tlb_miss: + /* So we first construct the page table address. We do that by + * shifting the bottom of the address (not the region ID) by + * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and + * or'ing the fourth high bit. + * + * NOTE: For 64K pages, we do things slightly differently in + * order to handle the weird page table format used by linux + */ + ori r10,r15,0x1 +#ifdef CONFIG_PPC_64K_PAGES + /* For the top bits, 16 bytes per PTE */ + rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 + /* Now create the bottom bits as 0 in position 0x8000 and + * the rest calculated for 8 bytes per PTE + */ + rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 + /* Insert the bottom bits in */ + rlwimi r14,r15,0,16,31 +#else + rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 +#endif + sldi r15,r10,60 + clrrdi r14,r14,3 + or r10,r15,r14 + + /* Set the TLB reservation and seach for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,r16) + ld r14,0(r10) + beq normal_tlb_miss_done + +finish_normal_tlb_miss: + /* Check if required permissions are met */ + andc. r15,r11,r14 + bne- normal_tlb_miss_access_fault + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + * + * TODO: mix up code below for better scheduling + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r11 + + /* Check page size, if not standard, update MAS1 */ + rldicl r11,r14,64-8,64-8 +#ifdef CONFIG_PPC_64K_PAGES + cmpldi cr0,r11,BOOK3E_PAGESZ_64K +#else + cmpldi cr0,r11,BOOK3E_PAGESZ_4K +#endif + beq- 1f + mfspr r11,SPRN_MAS1 + rlwimi r11,r14,31,21,24 + rlwinm r11,r11,0,21,19 + mtspr SPRN_MAS1,r11 +1: + /* Move RPN in position */ + rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + clrldi r15,r11,12 /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + andi. r11,r14,_PAGE_DIRTY + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: mtspr SPRN_MAS7_MAS3,r15 + + tlbwe + +normal_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +normal_tlb_miss_access_fault: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_HWEXEC + bne 1f + ld r14,EX_TLB_DEAR(r12) + ld r15,EX_TLB_ESR(r12) + mtspr SPRN_DEAR,r14 + mtspr SPRN_ESR,r15 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +1: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + */ + PPC_TLBSRX_DOT(0,r16) + beq virt_page_table_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + mtspr SPRN_MAS7_MAS3,r10 + + tlbwe + +virt_page_table_tlb_miss_done: + + /* We have overriden MAS2:EPN but currently our primary TLB miss + * handler will always restore it so that should not be an issue, + * if we ever optimize the primary handler to not write MAS2 on + * some cases, we'll have to restore MAS2:EPN here based on the + * original fault's DEAR. If we do that we have to modify the + * ITLB miss handler to also store SRR0 in the exception frame + * as DEAR. + * + * However, one nasty thing we did is we cleared the reservation + * (well, potentially we did). We do a trick here thus if we + * are not a level 0 exception (we interrupted the TLB miss) we + * offset the return address by -4 in order to replay the tlbsrx + * instruction there + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- 1f + ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) + addi r10,r11,-4 + std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) +1: + /* Return to caller, normal case */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retreive + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant informations in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + + +/************************************************************** + * * + * TLB miss handling for Book3E with hw page table support * + * * + **************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_htw) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_htw) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by keeping a crazy value for ESR in r14 + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = PGD pointer + * r14 = ESR + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will save/restore things for us + */ +htw_tlb_miss: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + * + * MAS1:IND should be already set based on MAS4 + */ + PPC_TLBSRX_DOT(0,r16) + beq htw_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- htw_tlb_miss_fault + + /* Get the PGD pointer */ + cmpldi cr0,r15,0 + beq- htw_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault + + /* Ok, we're all right, we can now create an indirect entry for + * a 1M or 256M page. + * + * The last trick is now that because we use "half" pages for + * the HTW (1M IND is 2K and 256M IND is 32K) we need to account + * for an added LSB bit to the RPN. For 64K pages, there is no + * problem as we already use 32K arrays (half PTE pages), but for + * 4K page we need to extract a bit from the virtual address and + * insert it into the "PA52" bit of the RPN. + */ +#ifndef CONFIG_PPC_64K_PAGES + rlwimi r15,r16,32-9,20,20 +#endif + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + */ +#ifdef CONFIG_PPC_64K_PAGES + ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) +#else + ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +#endif + mtspr SPRN_MAS7_MAS3,r10 + + tlbwe + +htw_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +htw_tlb_miss_fault: + /* We need to check if it was an instruction miss. We know this + * though because r14 would contain -1 + */ + cmpdi cr0,r14,-1 + beq 1f + mtspr SPRN_DEAR,r16 + mtspr SPRN_ESR,r14 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. We also assume the size is a multiple of 1G, thus + * we only use 1G pages for now. That might have to be changed in a + * final implementation, especially when dealing with hypervisors + */ + ld r11,PACATOC(r13) + ld r11,linear_map_top@got(r11) + ld r10,0(r11) + cmpld cr0,r10,r16 + bge tlb_load_linear_fault + + /* MAS1 need whole new setup. */ + li r15,(BOOK3E_PAGESZ_1GB< - * IBM Corp. + * Copyright 2008,2009 Ben Herrenschmidt + * IBM Corp. * * Derived from arch/ppc/mm/init.c: * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -34,12 +34,70 @@ #include #include #include +#include #include #include +#include #include "mmu_decl.h" +#ifdef CONFIG_PPC_BOOK3E +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_16K] = { + .shift = 14, + .enc = BOOK3E_PAGESZ_16K, + }, + [MMU_PAGE_64K] = { + .shift = 16, + .enc = BOOK3E_PAGESZ_64K, + }, + [MMU_PAGE_1M] = { + .shift = 20, + .enc = BOOK3E_PAGESZ_1M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif + +/* The variables below are currently only used on 64-bit Book3E + * though this will probably be made common with other nohash + * implementations at some point + */ +#ifdef CONFIG_PPC64 + +int mmu_linear_psize; /* Page size used for the linear mapping */ +int mmu_pte_psize; /* Page size used for PTE pages */ +int book3e_htw_enabled; /* Is HW tablewalk enabled ? */ +unsigned long linear_map_top; /* Top of linear mapping */ + +#endif /* CONFIG_PPC64 */ + /* * Base TLB flushing operations: * @@ -82,7 +140,7 @@ void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - 0 /* tsize unused for now */, 0); + mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(local_flush_tlb_page); @@ -198,7 +256,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - 0 /* tsize unused for now */, 0); + mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(flush_tlb_page); @@ -241,3 +299,140 @@ void tlb_flush(struct mmu_gather *tlb) /* Push out batch of freed page tables */ pte_free_finish(); } + +/* + * Below are functions specific to the 64-bit variant of Book3E though that + * may change in the future + */ + +#ifdef CONFIG_PPC64 + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].enc; + + if (book3e_htw_enabled) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + +#ifdef CONFIG_PPC_64K_PAGES + vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; +#else + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; +#endif + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +/* + * Early initialization of the MMU TLB code + */ +static void __early_init_mmu(int boot_cpu) +{ + extern unsigned int interrupt_base_book3e; + extern unsigned int exc_data_tlb_miss_htw_book3e; + extern unsigned int exc_instruction_tlb_miss_htw_book3e; + + unsigned int *ibase = &interrupt_base_book3e; + unsigned int mas4; + + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it + */ + mmu_linear_psize = MMU_PAGE_1G; + + + /* Check if HW tablewalk is present, and if yes, enable it by: + * + * - patching the TLB miss handlers to branch to the + * one dedicates to it + * + * - setting the global book3e_htw_enabled + * + * - Set MAS4:INDD and default page size + */ + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + if (boot_cpu) { + unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG); + + /* Check if HW loader is supported */ + if ((tlb0cfg & TLBnCFG_IND) && + (tlb0cfg & TLBnCFG_PT)) { + patch_branch(ibase + (0x1c0 / 4), + (unsigned long)&exc_data_tlb_miss_htw_book3e, 0); + patch_branch(ibase + (0x1e0 / 4), + (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0); + book3e_htw_enabled = 1; + } + pr_info("MMU: Book3E Page Tables %s\n", + book3e_htw_enabled ? "Enabled" : "Disabled"); + } + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + if (book3e_htw_enabled) { + mas4 |= mas4 | MAS4_INDD; +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_256M; +#else + mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_1M; +#endif + } else { +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; +#else + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; +#endif + mmu_pte_psize = mmu_virtual_psize; + } + mtspr(SPRN_MAS4, mas4); + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = lmb_end_of_DRAM(); + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +void __init early_init_mmu(void) +{ + __early_init_mmu(1); +} + +void __cpuinit early_init_mmu_secondary(void) +{ + __early_init_mmu(0); +} + +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index c7d89a0adba..7bcd9fbf6cc 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -191,6 +191,85 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr +#elif defined(CONFIG_PPC_BOOK3E) +/* + * New Book3E (>= 2.06) implementation + * + * Note: We may be able to get away without the interrupt masking stuff + * if we save/restore MAS6 on exceptions that might modify it + */ +_GLOBAL(_tlbil_pid) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_pid_noind) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + ori r4,r4,MAS6_SIND + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_all) + PPC_TLBILX_ALL(0,0) + msync + isync + blr + +_GLOBAL(_tlbil_va) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBILX_VA(0,r3) + msync + isync + wrtee r10 + blr + +_GLOBAL(_tlbivax_bcast) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBIVAX(0,r3) + eieio + tlbsync + sync + wrtee r10 + blr + +_GLOBAL(set_context) +#ifdef CONFIG_BDI_SWITCH + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is the second parameter. + */ + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + stw r4, 0x4(r5) +#endif + mtspr SPRN_PID,r3 + isync /* Force context change */ + blr #else #error Unsupported processor type ! #endif -- cgit v1.2.3-70-g09d2 From 32a74949b7337726e76d69f51c48715431126c6c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:58 +0000 Subject: powerpc/mm: Add support for SPARSEMEM_VMEMMAP on 64-bit Book3E The base TLB support didn't include support for SPARSEMEM_VMEMMAP, though we did carve out some virtual space for it, the necessary support code wasn't there. This implements it by using 16M pages for now, though the page size could easily be changed at runtime if necessary. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-book3e.h | 1 + arch/powerpc/include/asm/pgtable-ppc64.h | 3 +- arch/powerpc/mm/init_64.c | 55 ++++++++++++++++++++++++++++---- arch/powerpc/mm/mmu_decl.h | 7 +++- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/mm/tlb_nohash.c | 11 ++++++- 6 files changed, 68 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 6ddbe48d07f..d7458046936 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -196,6 +196,7 @@ extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; #endif extern int mmu_linear_psize; +extern int mmu_vmemmap_psize; #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 7254c5a3187..200ec2dfa03 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -46,6 +46,7 @@ /* * The vmalloc space starts at the beginning of that region, and * occupies half of it on hash CPUs and a quarter of it on Book3E + * (we keep a quarter for the virtual memmap) */ #define VMALLOC_START KERN_VIRT_START #ifdef CONFIG_PPC_BOOK3E @@ -83,7 +84,7 @@ #define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define VMEMMAP_REGION_ID (0xfUL) +#define VMEMMAP_REGION_ID (0xfUL) /* Server only */ #define USER_REGION_ID (0UL) /* diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 68a821add28..31582329cd6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -205,6 +205,47 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } +/* On hash-based CPUs, the vmemmap is bolted in the hash table. + * + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ + +#ifdef CONFIG_PPC_BOOK3E +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); +} +#else /* CONFIG_PPC_BOOK3E */ +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int mapped = htab_bolt_mapping(start, start + page_size, phys, + PAGE_KERNEL, mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} +#endif /* CONFIG_PPC_BOOK3E */ + int __meminit vmemmap_populate(struct page *start_page, unsigned long nr_pages, int node) { @@ -215,8 +256,11 @@ int __meminit vmemmap_populate(struct page *start_page, /* Align to the page size of the linear mapping. */ start = _ALIGN_DOWN(start, page_size); + pr_debug("vmemmap_populate page %p, %ld pages, node %d\n", + start_page, nr_pages, node); + pr_debug(" -> map %lx..%lx\n", start, end); + for (; start < end; start += page_size) { - int mapped; void *p; if (vmemmap_populated(start, page_size)) @@ -226,13 +270,10 @@ int __meminit vmemmap_populate(struct page *start_page, if (!p) return -ENOMEM; - pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", - start, p, __pa(p)); + pr_debug(" * %016lx..%016lx allocated at %p\n", + start, start + page_size, p); - mapped = htab_bolt_mapping(start, start + page_size, __pa(p), - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - BUG_ON(mapped < 0); + vmemmap_create_mapping(start, page_size, __pa(p)); } return 0; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 5961c6b739d..d2e5321d5ea 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -121,7 +121,12 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; -#endif + +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 +extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags); +#endif /* CONFIG_PPC64 */ extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 93ed1a3c872..853d5565eed 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -79,7 +79,7 @@ static void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -static int map_kernel_page(unsigned long ea, unsigned long pa, int flags) +int map_kernel_page(unsigned long ea, unsigned long pa, int flags) { pgd_t *pgdp; pud_t *pudp; diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d16100c9416..2fbc680c2c7 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -93,6 +93,7 @@ static inline int mmu_get_tsize(int psize) int mmu_linear_psize; /* Page size used for the linear mapping */ int mmu_pte_psize; /* Page size used for PTE pages */ +int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ int book3e_htw_enabled; /* Is HW tablewalk enabled ? */ unsigned long linear_map_top; /* Top of linear mapping */ @@ -356,10 +357,18 @@ static void __early_init_mmu(int boot_cpu) unsigned int mas4; /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) */ mmu_linear_psize = MMU_PAGE_1G; + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + */ + mmu_vmemmap_psize = MMU_PAGE_16M; /* Check if HW tablewalk is present, and if yes, enable it by: * -- cgit v1.2.3-70-g09d2 From 2d27cfd3286966c04d4192a9db5a6c7ea60eebf1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:59 +0000 Subject: powerpc: Remaining 64-bit Book3E support This contains all the bits that didn't fit in previous patches :-) This includes the actual exception handlers assembly, the changes to the kernel entry, other misc bits and wiring it all up in Kconfig. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hw_irq.h | 5 + arch/powerpc/include/asm/smp.h | 1 + arch/powerpc/kernel/Makefile | 10 +- arch/powerpc/kernel/cputable.c | 27 +- arch/powerpc/kernel/entry_64.S | 60 ++- arch/powerpc/kernel/exceptions-64e.S | 784 +++++++++++++++++++++++++++++++++ arch/powerpc/kernel/head_64.S | 68 ++- arch/powerpc/kernel/setup_64.c | 19 + arch/powerpc/mm/Makefile | 1 + arch/powerpc/platforms/Kconfig.cputype | 38 +- arch/powerpc/xmon/xmon.c | 2 +- 12 files changed, 993 insertions(+), 24 deletions(-) create mode 100644 arch/powerpc/kernel/exceptions-64e.S (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 52349ef1b3a..4c0747e8ed7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -472,7 +472,7 @@ config PPC_16K_PAGES bool "16k page size" if 44x config PPC_64K_PAGES - bool "64k page size" if 44x || PPC_STD_MMU_64 + bool "64k page size" if 44x || PPC_STD_MMU_64 || PPC_BOOK3E_64 select PPC_HAS_HASH_64K if PPC_STD_MMU_64 config PPC_256K_PAGES diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 8b505eaaa38..e73d554538d 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -49,8 +49,13 @@ extern void iseries_handle_interrupts(void); #define raw_irqs_disabled() (local_get_flags() == 0) #define raw_irqs_disabled_flags(flags) ((flags) == 0) +#ifdef CONFIG_PPC_BOOK3E +#define __hard_irq_enable() __asm__ __volatile__("wrteei 1": : :"memory"); +#define __hard_irq_disable() __asm__ __volatile__("wrteei 0": : :"memory"); +#else #define __hard_irq_enable() __mtmsrd(mfmsr() | MSR_EE, 1) #define __hard_irq_disable() __mtmsrd(mfmsr() & ~MSR_EE, 1) +#endif #define hard_irq_disable() \ do { \ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index e782f43ee66..c0d3b8af931 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -153,6 +153,7 @@ extern void arch_send_call_function_ipi(cpumask_t mask); * 64-bit but defining them all here doesn't harm */ extern void generic_secondary_smp_init(void); +extern void generic_secondary_thread_init(void); extern unsigned long __secondary_hold_spinloop; extern unsigned long __secondary_hold_acknowledge; extern char __secondary_hold; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b73396b9390..035946f9d5f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -33,10 +33,10 @@ obj-y := cputable.o ptrace.o syscalls.o \ obj-y += vdso32/ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ - paca.o cpu_setup_ppc970.o \ - cpu_setup_pa6t.o \ - firmware.o nvram_64.o + paca.o nvram_64.o firmware.o +obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj64-$(CONFIG_RELOCATABLE) += reloc_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o obj-$(CONFIG_PPC_970_NAP) += idle_power4.o @@ -63,8 +63,8 @@ obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_44x) += cpu_setup_44x.o obj-$(CONFIG_FSL_BOOKE) += cpu_setup_fsl_booke.o dbell.o -extra-$(CONFIG_PPC_STD_MMU) := head_32.o -extra-$(CONFIG_PPC64) := head_64.o +extra-y := head_$(CONFIG_WORD_SIZE).o +extra-$(CONFIG_PPC_BOOK3E_32) := head_new_booke.o extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 4a24a2fc457..f34ea37079b 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -93,7 +93,7 @@ extern void __restore_cpu_power7(void); PPC_FEATURE_BOOKE) static struct cpu_spec __initdata cpu_specs[] = { -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 { /* Power3 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00400000, @@ -508,7 +508,30 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_generic, .platform = "power4", } -#endif /* CONFIG_PPC64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_PPC_BOOK3E_64 + { /* This is a default entry to get going, to be replaced by + * a real one at some stage + */ +#define CPU_FTRS_BASE_BOOK3E (CPU_FTR_USE_TB | \ + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_SMT | \ + CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE) + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "Book3E", + .cpu_features = CPU_FTRS_BASE_BOOK3E, + .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTR_TYPE_3E | MMU_FTR_USE_TLBILX | + MMU_FTR_USE_TLBIVAX_BCAST | + MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 0, + .machine_check = machine_check_generic, + .platform = "power6", + }, +#endif + #ifdef CONFIG_PPC32 #if CLASSIC_PPC { /* 601 */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 1cb0f3d1714..66bcda34a6b 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -120,9 +120,15 @@ BEGIN_FW_FTR_SECTION 2: END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) #endif /* CONFIG_PPC_ISERIES */ + + /* Hard enable interrupts */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else mfmsr r11 ori r11,r11,MSR_EE mtmsrd r11,1 +#endif /* CONFIG_PPC_BOOK3E */ #ifdef SHOW_SYSCALLS bl .do_show_syscall @@ -168,15 +174,25 @@ syscall_exit: #endif clrrdi r12,r1,THREAD_SHIFT - /* disable interrupts so current_thread_info()->flags can't change, - and so that we don't get interrupted after loading SRR0/1. */ ld r8,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3S + /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI beq- unrecov_restore +#endif + + /* Disable interrupts so current_thread_info()->flags can't change, + * and so that we don't get interrupted after loading SRR0/1. + */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else mfmsr r10 rldicl r10,r10,48,1 rotldi r10,r10,16 mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ + ld r9,TI_FLAGS(r12) li r11,-_LAST_ERRNO andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) @@ -194,9 +210,13 @@ syscall_error_cont: * userspace and we take an exception after restoring r13, * we end up corrupting the userspace r13 value. */ +#ifdef CONFIG_PPC_BOOK3S + /* No MSR:RI on BookE */ li r12,MSR_RI andc r11,r10,r12 mtmsrd r11,1 /* clear MSR.RI */ +#endif /* CONFIG_PPC_BOOK3S */ + beq- 1f ACCOUNT_CPU_USER_EXIT(r11, r12) ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ @@ -206,7 +226,7 @@ syscall_error_cont: mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - rfid + RFI b . /* prevent speculative execution */ syscall_error: @@ -276,9 +296,13 @@ syscall_exit_work: beq .ret_from_except_lite /* Re-enable interrupts */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else mfmsr r10 ori r10,r10,MSR_EE mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD @@ -380,7 +404,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) and. r0,r0,r22 beq+ 1f andc r22,r22,r0 - mtmsrd r22 + MTMSRD(r22) isync 1: std r20,_NIP(r1) mfcr r23 @@ -399,6 +423,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ +#ifdef CONFIG_PPC_BOOK3S BEGIN_FTR_SECTION BEGIN_FTR_SECTION_NESTED(95) clrrdi r6,r8,28 /* get its ESID */ @@ -445,8 +470,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) slbie r6 /* Workaround POWER5 < DD2.1 issue */ slbmte r7,r0 isync - 2: +#endif /* !CONFIG_PPC_BOOK3S */ + clrrdi r7,r8,THREAD_SHIFT /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE because we don't need to leave the 288-byte ABI gap at the @@ -490,10 +516,14 @@ _GLOBAL(ret_from_except_lite) * can't change between when we test it and when we return * from the interrupt. */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else mfmsr r10 /* Get current interrupt state */ rldicl r9,r10,48,1 /* clear MSR_EE */ rotldi r9,r9,16 mtmsrd r9,1 /* Update machine state */ +#endif /* CONFIG_PPC_BOOK3E */ #ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ @@ -540,6 +570,9 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ stb r4,PACAHARDIRQEN(r13) +#ifdef CONFIG_PPC_BOOK3E + b .exception_return_book3e +#else ld r4,_CTR(r1) ld r0,_LINK(r1) mtctr r4 @@ -588,6 +621,8 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) rfid b . /* prevent speculative execution */ +#endif /* CONFIG_PPC_BOOK3E */ + iseries_check_pending_irqs: #ifdef CONFIG_PPC_ISERIES ld r5,SOFTE(r1) @@ -638,6 +673,11 @@ do_work: li r0,1 stb r0,PACASOFTIRQEN(r13) stb r0,PACAHARDIRQEN(r13) +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 + bl .preempt_schedule + wrteei 0 +#else ori r10,r10,MSR_EE mtmsrd r10,1 /* reenable interrupts */ bl .preempt_schedule @@ -646,6 +686,7 @@ do_work: rldicl r10,r10,48,1 /* disable interrupts again */ rotldi r10,r10,16 mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b @@ -654,8 +695,12 @@ do_work: user_work: #endif /* Enable interrupts */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else ori r10,r10,MSR_EE mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ andi. r0,r4,_TIF_NEED_RESCHED beq 1f @@ -837,6 +882,10 @@ _GLOBAL(enter_prom) /* Switch MSR to 32 bits mode */ +#ifdef CONFIG_PPC_BOOK3E + rlwinm r11,r11,0,1,31 + mtmsr r11 +#else /* CONFIG_PPC_BOOK3E */ mfmsr r11 li r12,1 rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) @@ -845,6 +894,7 @@ _GLOBAL(enter_prom) rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) andc r11,r11,r12 mtmsrd r11 +#endif /* CONFIG_PPC_BOOK3E */ isync /* Enter PROM here... */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S new file mode 100644 index 00000000000..695d4847d22 --- /dev/null +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -0,0 +1,784 @@ +/* + * Boot code and exception vectors for Book3E processors + * + * Copyright (C) 2007 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX This will ultimately add space for a special exception save + * structure used to save things like SRR0/SRR1, SPRGs, MAS, etc... + * when taking special interrupts. For now we don't support that, + * special interrupts from within a non-standard level will probably + * blow you up + */ +#define SPECIAL_EXC_FRAME_SIZE INT_FRAME_SIZE + +/* Exception prolog code for all exceptions */ +#define EXCEPTION_PROLOG(n, type, addition) \ + mtspr SPRN_SPRG_##type##_SCRATCH,r13; /* get spare registers */ \ + mfspr r13,SPRN_SPRG_PACA; /* get PACA */ \ + std r10,PACA_EX##type+EX_R10(r13); \ + std r11,PACA_EX##type+EX_R11(r13); \ + mfcr r10; /* save CR */ \ + addition; /* additional code for that exc. */ \ + std r1,PACA_EX##type+EX_R1(r13); /* save old r1 in the PACA */ \ + stw r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \ + mfspr r11,SPRN_##type##_SRR1;/* what are we coming from */ \ + type##_SET_KSTACK; /* get special stack if necessary */\ + andi. r10,r11,MSR_PR; /* save stack pointer */ \ + beq 1f; /* branch around if supervisor */ \ + ld r1,PACAKSAVE(r13); /* get kernel stack coming from usr */\ +1: cmpdi cr1,r1,0; /* check if SP makes sense */ \ + bge- cr1,exc_##n##_bad_stack;/* bad stack (TODO: out of line) */ \ + mfspr r10,SPRN_##type##_SRR0; /* read SRR0 before touching stack */ + +/* Exception type-specific macros */ +#define GEN_SET_KSTACK \ + subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ +#define SPRN_GEN_SRR0 SPRN_SRR0 +#define SPRN_GEN_SRR1 SPRN_SRR1 + +#define CRIT_SET_KSTACK \ + ld r1,PACA_CRIT_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE; +#define SPRN_CRIT_SRR0 SPRN_CSRR0 +#define SPRN_CRIT_SRR1 SPRN_CSRR1 + +#define DBG_SET_KSTACK \ + ld r1,PACA_DBG_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE; +#define SPRN_DBG_SRR0 SPRN_DSRR0 +#define SPRN_DBG_SRR1 SPRN_DSRR1 + +#define MC_SET_KSTACK \ + ld r1,PACA_MC_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE; +#define SPRN_MC_SRR0 SPRN_MCSRR0 +#define SPRN_MC_SRR1 SPRN_MCSRR1 + +#define NORMAL_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, GEN, addition##_GEN) + +#define CRIT_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, CRIT, addition##_CRIT) + +#define DBG_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, DBG, addition##_DBG) + +#define MC_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, MC, addition##_MC) + + +/* Variants of the "addition" argument for the prolog + */ +#define PROLOG_ADDITION_NONE_GEN +#define PROLOG_ADDITION_NONE_CRIT +#define PROLOG_ADDITION_NONE_DBG +#define PROLOG_ADDITION_NONE_MC + +#define PROLOG_ADDITION_MASKABLE_GEN \ + lbz r11,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */ \ + cmpwi cr0,r11,0; /* yes -> go out of line */ \ + beq masked_interrupt_book3e; + +#define PROLOG_ADDITION_2REGS_GEN \ + std r14,PACA_EXGEN+EX_R14(r13); \ + std r15,PACA_EXGEN+EX_R15(r13) + +#define PROLOG_ADDITION_1REG_GEN \ + std r14,PACA_EXGEN+EX_R14(r13); + +#define PROLOG_ADDITION_2REGS_CRIT \ + std r14,PACA_EXCRIT+EX_R14(r13); \ + std r15,PACA_EXCRIT+EX_R15(r13) + +#define PROLOG_ADDITION_2REGS_DBG \ + std r14,PACA_EXDBG+EX_R14(r13); \ + std r15,PACA_EXDBG+EX_R15(r13) + +#define PROLOG_ADDITION_2REGS_MC \ + std r14,PACA_EXMC+EX_R14(r13); \ + std r15,PACA_EXMC+EX_R15(r13) + +/* Core exception code for all exceptions except TLB misses. + * XXX: Needs to make SPRN_SPRG_GEN depend on exception type + */ +#define EXCEPTION_COMMON(n, excf, ints) \ + std r0,GPR0(r1); /* save r0 in stackframe */ \ + std r2,GPR2(r1); /* save r2 in stackframe */ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ + std r9,GPR9(r1); /* save r9 in stackframe */ \ + std r10,_NIP(r1); /* save SRR0 to stackframe */ \ + std r11,_MSR(r1); /* save SRR1 to stackframe */ \ + ACCOUNT_CPU_USER_ENTRY(r10,r11);/* accounting (uses cr0+eq) */ \ + ld r3,excf+EX_R10(r13); /* get back r10 */ \ + ld r4,excf+EX_R11(r13); /* get back r11 */ \ + mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 */ \ + std r12,GPR12(r1); /* save r12 in stackframe */ \ + ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ + mflr r6; /* save LR in stackframe */ \ + mfctr r7; /* save CTR in stackframe */ \ + mfspr r8,SPRN_XER; /* save XER in stackframe */ \ + ld r9,excf+EX_R1(r13); /* load orig r1 back from PACA */ \ + lwz r10,excf+EX_CR(r13); /* load orig CR back from PACA */ \ + lbz r11,PACASOFTIRQEN(r13); /* get current IRQ softe */ \ + ld r12,exception_marker@toc(r2); \ + li r0,0; \ + std r3,GPR10(r1); /* save r10 to stackframe */ \ + std r4,GPR11(r1); /* save r11 to stackframe */ \ + std r5,GPR13(r1); /* save it to stackframe */ \ + std r6,_LINK(r1); \ + std r7,_CTR(r1); \ + std r8,_XER(r1); \ + li r3,(n)+1; /* indicate partial regs in trap */ \ + std r9,0(r1); /* store stack frame back link */ \ + std r10,_CCR(r1); /* store orig CR in stackframe */ \ + std r9,GPR1(r1); /* store stack frame back link */ \ + std r11,SOFTE(r1); /* and save it to stackframe */ \ + std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ + std r3,_TRAP(r1); /* set trap number */ \ + std r0,RESULT(r1); /* clear regs->result */ \ + ints; + +/* Variants for the "ints" argument */ +#define INTS_KEEP +#define INTS_DISABLE_SOFT \ + stb r0,PACASOFTIRQEN(r13); /* mark interrupts soft-disabled */ \ + TRACE_DISABLE_INTS; +#define INTS_DISABLE_HARD \ + stb r0,PACAHARDIRQEN(r13); /* and hard disabled */ +#define INTS_DISABLE_ALL \ + INTS_DISABLE_SOFT \ + INTS_DISABLE_HARD + +/* This is called by exceptions that used INTS_KEEP (that is did not clear + * neither soft nor hard IRQ indicators in the PACA. This will restore MSR:EE + * to it's previous value + * + * XXX In the long run, we may want to open-code it in order to separate the + * load from the wrtee, thus limiting the latency caused by the dependency + * but at this point, I'll favor code clarity until we have a near to final + * implementation + */ +#define INTS_RESTORE_HARD \ + ld r11,_MSR(r1); \ + wrtee r11; + +/* XXX FIXME: Restore r14/r15 when necessary */ +#define BAD_STACK_TRAMPOLINE(n) \ +exc_##n##_bad_stack: \ + li r1,(n); /* get exception number */ \ + sth r1,PACA_TRAP_SAVE(r13); /* store trap */ \ + b bad_stack_book3e; /* bad stack error */ + +#define EXCEPTION_STUB(loc, label) \ + . = interrupt_base_book3e + loc; \ + nop; /* To make debug interrupts happy */ \ + b exc_##label##_book3e; + +#define ACK_NONE(r) +#define ACK_DEC(r) \ + lis r,TSR_DIS@h; \ + mtspr SPRN_TSR,r +#define ACK_FIT(r) \ + lis r,TSR_FIS@h; \ + mtspr SPRN_TSR,r + +#define MASKABLE_EXCEPTION(trapnum, label, hdlr, ack) \ + START_EXCEPTION(label); \ + NORMAL_EXCEPTION_PROLOG(trapnum, PROLOG_ADDITION_MASKABLE) \ + EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE_ALL) \ + ack(r8); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + bl hdlr; \ + b .ret_from_except_lite; + +/* This value is used to mark exception frames on the stack. */ + .section ".toc","aw" +exception_marker: + .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER + + +/* + * And here we have the exception vectors ! + */ + + .text + .balign 0x1000 + .globl interrupt_base_book3e +interrupt_base_book3e: /* fake trap */ + /* Note: If real debug exceptions are supported by the HW, the vector + * below will have to be patched up to point to an appropriate handler + */ + EXCEPTION_STUB(0x000, machine_check) /* 0x0200 */ + EXCEPTION_STUB(0x020, critical_input) /* 0x0580 */ + EXCEPTION_STUB(0x040, debug_crit) /* 0x0d00 */ + EXCEPTION_STUB(0x060, data_storage) /* 0x0300 */ + EXCEPTION_STUB(0x080, instruction_storage) /* 0x0400 */ + EXCEPTION_STUB(0x0a0, external_input) /* 0x0500 */ + EXCEPTION_STUB(0x0c0, alignment) /* 0x0600 */ + EXCEPTION_STUB(0x0e0, program) /* 0x0700 */ + EXCEPTION_STUB(0x100, fp_unavailable) /* 0x0800 */ + EXCEPTION_STUB(0x120, system_call) /* 0x0c00 */ + EXCEPTION_STUB(0x140, ap_unavailable) /* 0x0f20 */ + EXCEPTION_STUB(0x160, decrementer) /* 0x0900 */ + EXCEPTION_STUB(0x180, fixed_interval) /* 0x0980 */ + EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */ + EXCEPTION_STUB(0x1c0, data_tlb_miss) + EXCEPTION_STUB(0x1e0, instruction_tlb_miss) + +#if 0 + EXCEPTION_STUB(0x280, processor_doorbell) + EXCEPTION_STUB(0x220, processor_doorbell_crit) +#endif + .globl interrupt_end_book3e +interrupt_end_book3e: + +/* Critical Input Interrupt */ + START_EXCEPTION(critical_input); + CRIT_EXCEPTION_PROLOG(0x100, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE_ALL) +// bl special_reg_save_crit +// addi r3,r1,STACK_FRAME_OVERHEAD +// bl .critical_exception +// b ret_from_crit_except + b . + +/* Machine Check Interrupt */ + START_EXCEPTION(machine_check); + CRIT_EXCEPTION_PROLOG(0x200, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE_ALL) +// bl special_reg_save_mc +// addi r3,r1,STACK_FRAME_OVERHEAD +// bl .machine_check_exception +// b ret_from_mc_except + b . + +/* Data Storage Interrupt */ + START_EXCEPTION(data_storage) + NORMAL_EXCEPTION_PROLOG(0x300, PROLOG_ADDITION_2REGS) + mfspr r14,SPRN_DEAR + mfspr r15,SPRN_ESR + EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_KEEP) + b storage_fault_common + +/* Instruction Storage Interrupt */ + START_EXCEPTION(instruction_storage); + NORMAL_EXCEPTION_PROLOG(0x400, PROLOG_ADDITION_2REGS) + li r15,0 + mr r14,r10 + EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_KEEP) + b storage_fault_common + +/* External Input Interrupt */ + MASKABLE_EXCEPTION(0x500, external_input, .do_IRQ, ACK_NONE) + +/* Alignment */ + START_EXCEPTION(alignment); + NORMAL_EXCEPTION_PROLOG(0x600, PROLOG_ADDITION_2REGS) + mfspr r14,SPRN_DEAR + mfspr r15,SPRN_ESR + EXCEPTION_COMMON(0x600, PACA_EXGEN, INTS_KEEP) + b alignment_more /* no room, go out of line */ + +/* Program Interrupt */ + START_EXCEPTION(program); + NORMAL_EXCEPTION_PROLOG(0x700, PROLOG_ADDITION_1REG) + mfspr r14,SPRN_ESR + EXCEPTION_COMMON(0x700, PACA_EXGEN, INTS_DISABLE_SOFT) + std r14,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + ld r14,PACA_EXGEN+EX_R14(r13) + bl .save_nvgprs + INTS_RESTORE_HARD + bl .program_check_exception + b .ret_from_except + +/* Floating Point Unavailable Interrupt */ + START_EXCEPTION(fp_unavailable); + NORMAL_EXCEPTION_PROLOG(0x800, PROLOG_ADDITION_NONE) + /* we can probably do a shorter exception entry for that one... */ + EXCEPTION_COMMON(0x800, PACA_EXGEN, INTS_KEEP) + bne 1f /* if from user, just load it up */ + bl .save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + INTS_RESTORE_HARD + bl .kernel_fp_unavailable_exception + BUG_OPCODE +1: ld r12,_MSR(r1) + bl .load_up_fpu + b fast_exception_return + +/* Decrementer Interrupt */ + MASKABLE_EXCEPTION(0x900, decrementer, .timer_interrupt, ACK_DEC) + +/* Fixed Interval Timer Interrupt */ + MASKABLE_EXCEPTION(0x980, fixed_interval, .unknown_exception, ACK_FIT) + +/* Watchdog Timer Interrupt */ + START_EXCEPTION(watchdog); + CRIT_EXCEPTION_PROLOG(0x9f0, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE_ALL) +// bl special_reg_save_crit +// addi r3,r1,STACK_FRAME_OVERHEAD +// bl .unknown_exception +// b ret_from_crit_except + b . + +/* System Call Interrupt */ + START_EXCEPTION(system_call) + mr r9,r13 /* keep a copy of userland r13 */ + mfspr r11,SPRN_SRR0 /* get return address */ + mfspr r12,SPRN_SRR1 /* get previous MSR */ + mfspr r13,SPRN_SPRG_PACA /* get our PACA */ + b system_call_common + +/* Auxillary Processor Unavailable Interrupt */ + START_EXCEPTION(ap_unavailable); + NORMAL_EXCEPTION_PROLOG(0xf20, PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_KEEP) + addi r3,r1,STACK_FRAME_OVERHEAD + bl .save_nvgprs + INTS_RESTORE_HARD + bl .unknown_exception + b .ret_from_except + +/* Debug exception as a critical interrupt*/ + START_EXCEPTION(debug_crit); + CRIT_EXCEPTION_PROLOG(0xd00, PROLOG_ADDITION_2REGS) + + /* + * If there is a single step or branch-taken exception in an + * exception entry sequence, it was probably meant to apply to + * the code where the exception occurred (since exception entry + * doesn't turn off DE automatically). We simulate the effect + * of turning off DE on entry to an exception handler by turning + * off DE in the CSRR1 value and clearing the debug status. + */ + + mfspr r14,SPRN_DBSR /* check single-step/branch taken */ + andis. r15,r14,DBSR_IC@h + beq+ 1f + + LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) + LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e) + cmpld cr0,r10,r14 + cmpld cr1,r10,r15 + blt+ cr0,1f + bge+ cr1,1f + + /* here it looks like we got an inappropriate debug exception. */ + lis r14,DBSR_IC@h /* clear the IC event */ + rlwinm r11,r11,0,~MSR_DE /* clear DE in the CSRR1 value */ + mtspr SPRN_DBSR,r14 + mtspr SPRN_CSRR1,r11 + lwz r10,PACA_EXCRIT+EX_CR(r13) /* restore registers */ + ld r1,PACA_EXCRIT+EX_R1(r13) + ld r14,PACA_EXCRIT+EX_R14(r13) + ld r15,PACA_EXCRIT+EX_R15(r13) + mtcr r10 + ld r10,PACA_EXCRIT+EX_R10(r13) /* restore registers */ + ld r11,PACA_EXCRIT+EX_R11(r13) + mfspr r13,SPRN_SPRG_CRIT_SCRATCH + rfci + + /* Normal debug exception */ + /* XXX We only handle coming from userspace for now since we can't + * quite save properly an interrupted kernel state yet + */ +1: andi. r14,r11,MSR_PR; /* check for userspace again */ + beq kernel_dbg_exc; /* if from kernel mode */ + + /* Now we mash up things to make it look like we are coming on a + * normal exception + */ + mfspr r15,SPRN_SPRG_CRIT_SCRATCH + mtspr SPRN_SPRG_GEN_SCRATCH,r15 + mfspr r14,SPRN_DBSR + EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE_ALL) + std r14,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r14 + ld r14,PACA_EXCRIT+EX_R14(r13) + ld r15,PACA_EXCRIT+EX_R15(r13) + bl .save_nvgprs + bl .DebugException + b .ret_from_except + +kernel_dbg_exc: + b . /* NYI */ + + +/* + * An interrupt came in while soft-disabled; clear EE in SRR1, + * clear paca->hard_enabled and return. + */ +masked_interrupt_book3e: + mtcr r10 + stb r11,PACAHARDIRQEN(r13) + mfspr r10,SPRN_SRR1 + rldicl r11,r10,48,1 /* clear MSR_EE */ + rotldi r10,r11,16 + mtspr SPRN_SRR1,r10 + ld r10,PACA_EXGEN+EX_R10(r13); /* restore registers */ + ld r11,PACA_EXGEN+EX_R11(r13); + mfspr r13,SPRN_SPRG_GEN_SCRATCH; + rfi + b . + +/* + * This is called from 0x300 and 0x400 handlers after the prologs with + * r14 and r15 containing the fault address and error code, with the + * original values stashed away in the PACA + */ +storage_fault_common: + std r14,_DAR(r1) + std r15,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r14 + mr r5,r15 + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) + INTS_RESTORE_HARD + bl .do_page_fault + cmpdi r3,0 + bne- 1f + b .ret_from_except_lite +1: bl .save_nvgprs + mr r5,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + ld r4,_DAR(r1) + bl .bad_page_fault + b .ret_from_except + +/* + * Alignment exception doesn't fit entirely in the 0x100 bytes so it + * continues here. + */ +alignment_more: + std r14,_DAR(r1) + std r15,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) + bl .save_nvgprs + INTS_RESTORE_HARD + bl .alignment_exception + b .ret_from_except + +/* + * We branch here from entry_64.S for the last stage of the exception + * return code path. MSR:EE is expected to be off at that point + */ +_GLOBAL(exception_return_book3e) + b 1f + +/* This is the return from load_up_fpu fast path which could do with + * less GPR restores in fact, but for now we have a single return path + */ + .globl fast_exception_return +fast_exception_return: + wrteei 0 +1: mr r0,r13 + ld r10,_MSR(r1) + REST_4GPRS(2, r1) + andi. r6,r10,MSR_PR + REST_2GPRS(6, r1) + beq 1f + ACCOUNT_CPU_USER_EXIT(r10, r11) + ld r0,GPR13(r1) + +1: stdcx. r0,0,r1 /* to clear the reservation */ + + ld r8,_CCR(r1) + ld r9,_LINK(r1) + ld r10,_CTR(r1) + ld r11,_XER(r1) + mtcr r8 + mtlr r9 + mtctr r10 + mtxer r11 + REST_2GPRS(8, r1) + ld r10,GPR10(r1) + ld r11,GPR11(r1) + ld r12,GPR12(r1) + mtspr SPRN_SPRG_GEN_SCRATCH,r0 + + std r10,PACA_EXGEN+EX_R10(r13); + std r11,PACA_EXGEN+EX_R11(r13); + ld r10,_NIP(r1) + ld r11,_MSR(r1) + ld r0,GPR0(r1) + ld r1,GPR1(r1) + mtspr SPRN_SRR0,r10 + mtspr SPRN_SRR1,r11 + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + mfspr r13,SPRN_SPRG_GEN_SCRATCH + rfi + +/* + * Trampolines used when spotting a bad kernel stack pointer in + * the exception entry code. + * + * TODO: move some bits like SRR0 read to trampoline, pass PACA + * index around, etc... to handle crit & mcheck + */ +BAD_STACK_TRAMPOLINE(0x000) +BAD_STACK_TRAMPOLINE(0x100) +BAD_STACK_TRAMPOLINE(0x200) +BAD_STACK_TRAMPOLINE(0x300) +BAD_STACK_TRAMPOLINE(0x400) +BAD_STACK_TRAMPOLINE(0x500) +BAD_STACK_TRAMPOLINE(0x600) +BAD_STACK_TRAMPOLINE(0x700) +BAD_STACK_TRAMPOLINE(0x800) +BAD_STACK_TRAMPOLINE(0x900) +BAD_STACK_TRAMPOLINE(0x980) +BAD_STACK_TRAMPOLINE(0x9f0) +BAD_STACK_TRAMPOLINE(0xa00) +BAD_STACK_TRAMPOLINE(0xb00) +BAD_STACK_TRAMPOLINE(0xc00) +BAD_STACK_TRAMPOLINE(0xd00) +BAD_STACK_TRAMPOLINE(0xe00) +BAD_STACK_TRAMPOLINE(0xf00) +BAD_STACK_TRAMPOLINE(0xf20) + + .globl bad_stack_book3e +bad_stack_book3e: + /* XXX: Needs to make SPRN_SPRG_GEN depend on exception type */ + mfspr r10,SPRN_SRR0; /* read SRR0 before touching stack */ + ld r1,PACAEMERGSP(r13) + subi r1,r1,64+INT_FRAME_SIZE + std r10,_NIP(r1) + std r11,_MSR(r1) + ld r10,PACA_EXGEN+EX_R1(r13) /* FIXME for crit & mcheck */ + lwz r11,PACA_EXGEN+EX_CR(r13) /* FIXME for crit & mcheck */ + std r10,GPR1(r1) + std r11,_CCR(r1) + mfspr r10,SPRN_DEAR + mfspr r11,SPRN_ESR + std r10,_DAR(r1) + std r11,_DSISR(r1) + std r0,GPR0(r1); /* save r0 in stackframe */ \ + std r2,GPR2(r1); /* save r2 in stackframe */ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ + std r9,GPR9(r1); /* save r9 in stackframe */ \ + ld r3,PACA_EXGEN+EX_R10(r13);/* get back r10 */ \ + ld r4,PACA_EXGEN+EX_R11(r13);/* get back r11 */ \ + mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 XXX can be wrong */ \ + std r3,GPR10(r1); /* save r10 to stackframe */ \ + std r4,GPR11(r1); /* save r11 to stackframe */ \ + std r12,GPR12(r1); /* save r12 in stackframe */ \ + std r5,GPR13(r1); /* save it to stackframe */ \ + mflr r10 + mfctr r11 + mfxer r12 + std r10,_LINK(r1) + std r11,_CTR(r1) + std r12,_XER(r1) + SAVE_10GPRS(14,r1) + SAVE_8GPRS(24,r1) + lhz r12,PACA_TRAP_SAVE(r13) + std r12,_TRAP(r1) + addi r11,r1,INT_FRAME_SIZE + std r11,0(r1) + li r12,0 + std r12,0(r11) + ld r2,PACATOC(r13) +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl .kernel_bad_stack + b 1b + +/* + * Setup the initial TLB for a core. This current implementation + * assume that whatever we are running off will not conflict with + * the new mapping at PAGE_OFFSET. + * We also make various assumptions about the processor we run on, + * this might have to be made more flexible based on the content + * of MMUCFG and friends. + */ +_GLOBAL(initial_tlb_book3e) + + /* Setup MAS 0,1,2,3 and 7 for tlbwe of a 1G entry that maps the + * kernel linear mapping. We also set MAS8 once for all here though + * that will have to be made dependent on whether we are running under + * a hypervisor I suppose. + */ + li r3,MAS0_HES | MAS0_WQ_ALLWAYS + mtspr SPRN_MAS0,r3 + lis r3,(MAS1_VALID | MAS1_IPROT)@h + ori r3,r3,BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT + mtspr SPRN_MAS1,r3 + LOAD_REG_IMMEDIATE(r3, PAGE_OFFSET | MAS2_M) + mtspr SPRN_MAS2,r3 + li r3,MAS3_SR | MAS3_SW | MAS3_SX + mtspr SPRN_MAS7_MAS3,r3 + li r3,0 + mtspr SPRN_MAS8,r3 + + /* Write the TLB entry */ + tlbwe + + /* Now we branch the new virtual address mapped by this entry */ + LOAD_REG_IMMEDIATE(r3,1f) + mtctr r3 + bctr + +1: /* We are now running at PAGE_OFFSET, clean the TLB of everything + * else (XXX we should scan for bolted crap from the firmware too) + */ + PPC_TLBILX(0,0,0) + sync + isync + + /* We translate LR and return */ + mflr r3 + tovirt(r3,r3) + mtlr r3 + blr + +/* + * Main entry (boot CPU, thread 0) + * + * We enter here from head_64.S, possibly after the prom_init trampoline + * with r3 and r4 already saved to r31 and 30 respectively and in 64 bits + * mode. Anything else is as it was left by the bootloader + * + * Initial requirements of this port: + * + * - Kernel loaded at 0 physical + * - A good lump of memory mapped 0:0 by UTLB entry 0 + * - MSR:IS & MSR:DS set to 0 + * + * Note that some of the above requirements will be relaxed in the future + * as the kernel becomes smarter at dealing with different initial conditions + * but for now you have to be careful + */ +_GLOBAL(start_initialization_book3e) + mflr r28 + + /* First, we need to setup some initial TLBs to map the kernel + * text, data and bss at PAGE_OFFSET. We don't have a real mode + * and always use AS 0, so we just set it up to match our link + * address and never use 0 based addresses. + */ + bl .initial_tlb_book3e + + /* Init global core bits */ + bl .init_core_book3e + + /* Init per-thread bits */ + bl .init_thread_book3e + + /* Return to common init code */ + tovirt(r28,r28) + mtlr r28 + blr + + +/* + * Secondary core/processor entry + * + * This is entered for thread 0 of a secondary core, all other threads + * are expected to be stopped. It's similar to start_initialization_book3e + * except that it's generally entered from the holding loop in head_64.S + * after CPUs have been gathered by Open Firmware. + * + * We assume we are in 32 bits mode running with whatever TLB entry was + * set for us by the firmware or POR engine. + */ +_GLOBAL(book3e_secondary_core_init_tlb_set) + li r4,1 + b .generic_secondary_smp_init + +_GLOBAL(book3e_secondary_core_init) + mflr r28 + + /* Do we need to setup initial TLB entry ? */ + cmplwi r4,0 + bne 2f + + /* Setup TLB for this core */ + bl .initial_tlb_book3e + + /* We can return from the above running at a different + * address, so recalculate r2 (TOC) + */ + bl .relative_toc + + /* Init global core bits */ +2: bl .init_core_book3e + + /* Init per-thread bits */ +3: bl .init_thread_book3e + + /* Return to common init code at proper virtual address. + * + * Due to various previous assumptions, we know we entered this + * function at either the final PAGE_OFFSET mapping or using a + * 1:1 mapping at 0, so we don't bother doing a complicated check + * here, we just ensure the return address has the right top bits. + * + * Note that if we ever want to be smarter about where we can be + * started from, we have to be careful that by the time we reach + * the code below we may already be running at a different location + * than the one we were called from since initial_tlb_book3e can + * have moved us already. + */ + cmpdi cr0,r28,0 + blt 1f + lis r3,PAGE_OFFSET@highest + sldi r3,r3,32 + or r28,r28,r3 +1: mtlr r28 + blr + +_GLOBAL(book3e_secondary_thread_init) + mflr r28 + b 3b + +_STATIC(init_core_book3e) + /* Establish the interrupt vector base */ + LOAD_REG_IMMEDIATE(r3, interrupt_base_book3e) + mtspr SPRN_IVPR,r3 + sync + blr + +_STATIC(init_thread_book3e) + lis r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h + mtspr SPRN_EPCR,r3 + + /* Make sure interrupts are off */ + wrteei 0 + + /* disable watchdog and FIT and enable DEC interrupts */ + lis r3,TCR_DIE@h + mtspr SPRN_TCR,r3 + + blr + + + diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 0552f01041a..c38afdb45d7 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -121,10 +121,11 @@ __run_at_load: */ .globl __secondary_hold __secondary_hold: +#ifndef CONFIG_PPC_BOOK3E mfmsr r24 ori r24,r24,MSR_RI mtmsrd r24 /* RI on */ - +#endif /* Grab our physical cpu number */ mr r24,r3 @@ -143,6 +144,7 @@ __secondary_hold: ld r4,0(r4) /* deref function descriptor */ mtctr r4 mr r3,r24 + li r4,0 bctr #else BUG_OPCODE @@ -163,21 +165,49 @@ exception_marker: #include "exceptions-64s.S" #endif +_GLOBAL(generic_secondary_thread_init) + mr r24,r3 + + /* turn on 64-bit mode */ + bl .enable_64b_mode + + /* get a valid TOC pointer, wherever we're mapped at */ + bl .relative_toc + +#ifdef CONFIG_PPC_BOOK3E + /* Book3E initialization */ + mr r3,r24 + bl .book3e_secondary_thread_init +#endif + b generic_secondary_common_init /* * On pSeries and most other platforms, secondary processors spin * in the following code. * At entry, r3 = this processor's number (physical cpu id) + * + * On Book3E, r4 = 1 to indicate that the initial TLB entry for + * this core already exists (setup via some other mechanism such + * as SCOM before entry). */ _GLOBAL(generic_secondary_smp_init) mr r24,r3 - + mr r25,r4 + /* turn on 64-bit mode */ bl .enable_64b_mode - /* get the TOC pointer (real address) */ + /* get a valid TOC pointer, wherever we're mapped at */ bl .relative_toc +#ifdef CONFIG_PPC_BOOK3E + /* Book3E initialization */ + mr r3,r24 + mr r4,r25 + bl .book3e_secondary_core_init +#endif + +generic_secondary_common_init: /* Set up a paca value for this processor. Since we have the * physical cpu id in r24, we need to search the pacas to find * which logical id maps to our physical one. @@ -196,6 +226,11 @@ _GLOBAL(generic_secondary_smp_init) b .kexec_wait /* next kernel might do better */ 2: mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG */ +#ifdef CONFIG_PPC_BOOK3E + addi r12,r13,PACA_EXTLB /* and TLB exc frame in another */ + mtspr SPRN_SPRG_TLB_EXFRAME,r12 +#endif + /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 3: HMT_LOW @@ -231,6 +266,7 @@ _GLOBAL(generic_secondary_smp_init) * Turn the MMU off. * Assumes we're mapped EA == RA if the MMU is on. */ +#ifdef CONFIG_PPC_BOOK3S _STATIC(__mmu_off) mfmsr r3 andi. r0,r3,MSR_IR|MSR_DR @@ -242,6 +278,7 @@ _STATIC(__mmu_off) sync rfid b . /* prevent speculative execution */ +#endif /* @@ -279,6 +316,10 @@ _GLOBAL(__start_initialization_multiplatform) mr r31,r3 mr r30,r4 +#ifdef CONFIG_PPC_BOOK3E + bl .start_initialization_book3e + b .__after_prom_start +#else /* Setup some critical 970 SPRs before switching MMU off */ mfspr r0,SPRN_PVR srwi r0,r0,16 @@ -296,6 +337,7 @@ _GLOBAL(__start_initialization_multiplatform) /* Switch off MMU if not already off */ bl .__mmu_off b .__after_prom_start +#endif /* CONFIG_PPC_BOOK3E */ _INIT_STATIC(__boot_from_prom) #ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE @@ -358,10 +400,16 @@ _STATIC(__after_prom_start) * Note: This process overwrites the OF exception vectors. */ li r3,0 /* target addr */ +#ifdef CONFIG_PPC_BOOK3E + tovirt(r3,r3) /* on booke, we already run at PAGE_OFFSET */ +#endif mr. r4,r26 /* In some cases the loader may */ beq 9f /* have already put us at zero */ li r6,0x100 /* Start offset, the first 0x100 */ /* bytes were copied earlier. */ +#ifdef CONFIG_PPC_BOOK3E + tovirt(r6,r6) /* on booke, we already run at PAGE_OFFSET */ +#endif #ifdef CONFIG_CRASH_DUMP /* @@ -507,6 +555,9 @@ _GLOBAL(pmac_secondary_start) * r13 = paca virtual address * SPRG_PACA = paca virtual address */ + .section ".text"; + .align 2 ; + .globl __secondary_start __secondary_start: /* Set thread priority to MEDIUM */ @@ -543,7 +594,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI b . /* prevent speculative execution */ /* @@ -564,11 +615,16 @@ _GLOBAL(start_secondary_prolog) */ _GLOBAL(enable_64b_mode) mfmsr r11 /* grab the current MSR */ +#ifdef CONFIG_PPC_BOOK3E + oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */ + mtmsr r11 +#else /* CONFIG_PPC_BOOK3E */ li r12,(MSR_SF | MSR_ISF)@highest sldi r12,r12,48 or r11,r11,r12 mtmsrd r11 isync +#endif blr /* @@ -612,9 +668,11 @@ _INIT_STATIC(start_here_multiplatform) bdnz 3b 4: +#ifndef CONFIG_PPC_BOOK3E mfmsr r6 ori r6,r6,MSR_RI mtmsrd r6 /* RI on */ +#endif #ifdef CONFIG_RELOCATABLE /* Save the physical address we're running at in kernstart_addr */ @@ -647,7 +705,7 @@ _INIT_STATIC(start_here_multiplatform) ld r4,PACAKMSR(r13) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI b . /* prevent speculative execution */ /* This is where all platforms converge execution */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 65aced7b833..87df5172064 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -454,6 +454,24 @@ static void __init irqstack_early_init(void) #define irqstack_early_init() #endif +#ifdef CONFIG_PPC_BOOK3E +static void __init exc_lvl_early_init(void) +{ + unsigned int i; + + for_each_possible_cpu(i) { + critirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc(THREAD_SIZE, THREAD_SIZE)); + dbgirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc(THREAD_SIZE, THREAD_SIZE)); + mcheckirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc(THREAD_SIZE, THREAD_SIZE)); + } +} +#else +#define exc_lvl_early_init() +#endif + /* * Stack space used when we detect a bad kernel stack pointer, and * early in SMP boots before relocation is enabled. @@ -513,6 +531,7 @@ void __init setup_arch(char **cmdline_p) init_mm.brk = klimit; irqstack_early_init(); + exc_lvl_early_init(); emergency_stack_init(); #ifdef CONFIG_PPC_STD_MMU_64 diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3e68363405b..6fb8fc8d2fe 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,6 +13,7 @@ obj-y := fault.o mem.o pgtable.o gup.o \ pgtable_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o +obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o obj-$(CONFIG_PPC64) += mmap_64.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 61187bec750..9efc8bda01b 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -57,15 +57,35 @@ config E200 endchoice -config PPC_BOOK3S_64 - def_bool y +choice + prompt "Processor Type" depends on PPC64 + help + There are two families of 64 bit PowerPC chips supported. + The most common ones are the desktop and server CPUs + (POWER3, RS64, POWER4, POWER5, POWER5+, POWER6, ...) + + The other are the "embedded" processors compliant with the + "Book 3E" variant of the architecture + +config PPC_BOOK3S_64 + bool "Server processors" select PPC_FPU +config PPC_BOOK3E_64 + bool "Embedded processors" + select PPC_FPU # Make it a choice ? + +endchoice + config PPC_BOOK3S def_bool y depends on PPC_BOOK3S_32 || PPC_BOOK3S_64 +config PPC_BOOK3E + def_bool y + depends on PPC_BOOK3E_64 + config POWER4_ONLY bool "Optimize for POWER4" depends on PPC64 && PPC_BOOK3S @@ -125,7 +145,7 @@ config 4xx config BOOKE bool - depends on E200 || E500 || 44x + depends on E200 || E500 || 44x || PPC_BOOK3E default y config FSL_BOOKE @@ -223,9 +243,17 @@ config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU +config PPC_MMU_NOHASH_32 + def_bool y + depends on PPC_MMU_NOHASH && PPC32 + +config PPC_MMU_NOHASH_64 + def_bool y + depends on PPC_MMU_NOHASH && PPC64 + config PPC_BOOK3E_MMU def_bool y - depends on FSL_BOOKE + depends on FSL_BOOKE || PPC_BOOK3E config PPC_MM_SLICES bool @@ -257,7 +285,7 @@ config PPC_PERF_CTRS This enables the powerpc-specific perf_counter back-end. config SMP - depends on PPC_STD_MMU || FSL_BOOKE + depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE bool "Symmetric multi-processing support" ---help--- This enables support for systems with more than one CPU. If you have diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e1f33a81e5e..0e09a45ac79 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2570,7 +2570,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 static void dump_slb(void) { int i; -- cgit v1.2.3-70-g09d2 From af984b816530b4725b92e01ecfba7c5e3eab910d Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 6 Aug 2009 13:50:58 +1000 Subject: powerpc/mm: Fix encoding of page table cache numbers The mask used to encode the page table cache number in the batch when freeing page tables was too small for the new possible values of MMU page sizes. This increases it along with a comment explaining the constraints. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgalloc.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 34b080671f0..f2e812de7c3 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -28,7 +28,12 @@ typedef struct pgtable_free { unsigned long val; } pgtable_free_t; -#define PGF_CACHENUM_MASK 0x7 +/* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored + * and small enough to fit in the low bits of any naturally aligned page + * table cache entry. Arbitrarily set to 0x1f, that should give us some + * room to grow + */ +#define PGF_CACHENUM_MASK 0x1f static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, unsigned long mask) -- cgit v1.2.3-70-g09d2 From 9413c8836a16e9d034928a7f9d3ad81bebd71ce9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 Jul 2009 02:06:42 +0000 Subject: powerpc/cell: Move CBE_IOPTE_* to As doesn't contain any other hardware specific definitions but only interfaces. Reported-by: Arnd Bergmann Signed-off-by: Geert Uytterhoeven Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/cell-regs.h | 11 +++++++++++ arch/powerpc/include/asm/iommu.h | 10 ---------- arch/powerpc/platforms/ps3/mm.c | 2 +- arch/powerpc/platforms/ps3/system-bus.c | 2 +- drivers/block/ps3vram.c | 2 +- drivers/video/ps3fb.c | 2 +- 6 files changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/cell-regs.h b/arch/powerpc/include/asm/cell-regs.h index fd6fd00434e..fdf64fd2595 100644 --- a/arch/powerpc/include/asm/cell-regs.h +++ b/arch/powerpc/include/asm/cell-regs.h @@ -303,6 +303,17 @@ struct cbe_mic_tm_regs { extern struct cbe_mic_tm_regs __iomem *cbe_get_mic_tm_regs(struct device_node *np); extern struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu); + +/* Cell page table entries */ +#define CBE_IOPTE_PP_W 0x8000000000000000ul /* protection: write */ +#define CBE_IOPTE_PP_R 0x4000000000000000ul /* protection: read */ +#define CBE_IOPTE_M 0x2000000000000000ul /* coherency required */ +#define CBE_IOPTE_SO_R 0x1000000000000000ul /* ordering: writes */ +#define CBE_IOPTE_SO_RW 0x1800000000000000ul /* ordering: r & w */ +#define CBE_IOPTE_RPN_Mask 0x07fffffffffff000ul /* RPN */ +#define CBE_IOPTE_H 0x0000000000000800ul /* cache hint */ +#define CBE_IOPTE_IOID_Mask 0x00000000000007fful /* ioid */ + /* some utility functions to deal with SMT */ extern u32 cbe_get_hw_thread_id(int cpu); extern u32 cbe_cpu_to_node(int cpu); diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 7ead7c16fb7..7464c0daddd 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -35,16 +35,6 @@ #define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1)) #define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE) -/* Cell page table entries */ -#define CBE_IOPTE_PP_W 0x8000000000000000ul /* protection: write */ -#define CBE_IOPTE_PP_R 0x4000000000000000ul /* protection: read */ -#define CBE_IOPTE_M 0x2000000000000000ul /* coherency required */ -#define CBE_IOPTE_SO_R 0x1000000000000000ul /* ordering: writes */ -#define CBE_IOPTE_SO_RW 0x1800000000000000ul /* ordering: r & w */ -#define CBE_IOPTE_RPN_Mask 0x07fffffffffff000ul /* RPN */ -#define CBE_IOPTE_H 0x0000000000000800ul /* cache hint */ -#define CBE_IOPTE_IOID_Mask 0x00000000000007fful /* ioid */ - /* Boot time flags */ extern int iommu_is_off; extern int iommu_force_on; diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index 846eb8b57fd..189a25b8073 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -23,8 +23,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 3f763c5284a..676f989ed4e 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include "platform.h" diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 095f97e6066..c8753a9ed29 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -13,8 +13,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/drivers/video/ps3fb.c b/drivers/video/ps3fb.c index c0af638fe70..9c0144ee7ae 100644 --- a/drivers/video/ps3fb.c +++ b/drivers/video/ps3fb.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 6826a57d1abc8ac9f59b24f1a008554c6560a995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Aug 2009 12:24:45 +0000 Subject: powerpc: Switch to asm-generic/hardirq.h hardirq.h on powerpc defines a __last_jiffy_stamp field, but it's not actually used anywhere. Signed-off-by: Christoph Hellwig Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/hardirq.h | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h index 288e14d53b7..fb3c05a0cbb 100644 --- a/arch/powerpc/include/asm/hardirq.h +++ b/arch/powerpc/include/asm/hardirq.h @@ -1,29 +1 @@ -#ifndef _ASM_POWERPC_HARDIRQ_H -#define _ASM_POWERPC_HARDIRQ_H -#ifdef __KERNEL__ - -#include -#include - -/* The __last_jiffy_stamp field is needed to ensure that no decrementer - * interrupt is lost on SMP machines. Since on most CPUs it is in the same - * cache line as local_irq_count, it is cheap to access and is also used on UP - * for uniformity. - */ -typedef struct { - unsigned int __softirq_pending; /* set_bit is used on this */ - unsigned int __last_jiffy_stamp; -} ____cacheline_aligned irq_cpustat_t; - -#include /* Standard mappings for irq_cpustat_t above */ - -#define last_jiffy_stamp(cpu) __IRQ_STAT((cpu), __last_jiffy_stamp) - -static inline void ack_bad_irq(int irq) -{ - printk(KERN_CRIT "illegal vector %d received!\n", irq); - BUG(); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_HARDIRQ_H */ +#include -- cgit v1.2.3-70-g09d2 From fc4bdb35fba1c8f464fd85b94a5059e752fc85d4 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 14 Aug 2009 09:38:34 -0500 Subject: powerpc/booke: Move MMUCSR definition into mmu-book3e.h The MMUCSR is now defined as part of the Book-3E architecture so we can move it into mmu-book3e.h and add some of the additional bits defined by the architecture specs. Signed-off-by: Kumar Gala --- arch/powerpc/include/asm/mmu-book3e.h | 12 ++++++++++++ arch/powerpc/include/asm/reg_booke.h | 6 ------ arch/powerpc/mm/tlb_nohash_low.S | 2 -- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index d7458046936..74695816205 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -114,6 +114,18 @@ #define MAS7_RPN 0xFFFFFFFF +/* Bit definitions for MMUCSR0 */ +#define MMUCSR0_TLB1FI 0x00000002 /* TLB1 Flash invalidate */ +#define MMUCSR0_TLB0FI 0x00000004 /* TLB0 Flash invalidate */ +#define MMUCSR0_TLB2FI 0x00000040 /* TLB2 Flash invalidate */ +#define MMUCSR0_TLB3FI 0x00000020 /* TLB3 Flash invalidate */ +#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \ + MMUCSR0_TLB2FI | MMUCSR0_TLB3FI) +#define MMUCSR0_TLB0PS 0x00000780 /* TLB0 Page Size */ +#define MMUCSR0_TLB1PS 0x00007800 /* TLB1 Page Size */ +#define MMUCSR0_TLB2PS 0x00078000 /* TLB2 Page Size */ +#define MMUCSR0_TLB3PS 0x00780000 /* TLB3 Page Size */ + /* TLBnCFG encoding */ #define TLBnCFG_N_ENTRY 0x00000fff /* number of entries */ #define TLBnCFG_HES 0x00002000 /* HW select supported */ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 2c9c706e644..9bb81d99b76 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -430,12 +430,6 @@ #define L2CSR0_L2LOA 0x00000080 /* L2 Cache Lock Overflow Allocate */ #define L2CSR0_L2LO 0x00000020 /* L2 Cache Lock Overflow */ -/* Bit definitions for MMUCSR0 */ -#define MMUCSR0_TLB1FI 0x00000002 /* TLB1 Flash invalidate */ -#define MMUCSR0_TLB0FI 0x00000004 /* TLB0 Flash invalidate */ -#define MMUCSR0_TLB2FI 0x00000040 /* TLB2 Flash invalidate */ -#define MMUCSR0_TLB3FI 0x00000020 /* TLB3 Flash invalidate */ - /* Bit definitions for SGR. */ #define SGR_NORMAL 0 /* Speculative fetching allowed. */ #define SGR_GUARDED 1 /* Speculative fetching disallowed. */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 7bcd9fbf6cc..bbdc5b577b8 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -124,8 +124,6 @@ _GLOBAL(_tlbil_pid) * to have the larger code path before the _SECTION_ELSE */ -#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \ - MMUCSR0_TLB2FI | MMUCSR0_TLB3FI) /* * Flush MMU TLB on the local processor */ -- cgit v1.2.3-70-g09d2 From ea3cc330ac0cd521ff07c7cd432a1848c19a7e92 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 18 Aug 2009 19:00:34 +0000 Subject: powerpc/mm: Cleanup handling of execute permission This is an attempt at cleaning up a bit the way we handle execute permission on powerpc. _PAGE_HWEXEC is gone, _PAGE_EXEC is now only defined by CPUs that can do something with it, and the myriad of #ifdef's in the I$/D$ coherency code is reduced to 2 cases that hopefully should cover everything. The logic on BookE is a little bit different than what it was though not by much. Since now, _PAGE_EXEC will be set by the generic code for executable pages, we need to filter out if they are unclean and recover it. However, I don't expect the code to be more bloated than it already was in that area due to that change. I could boast that this brings proper enforcing of per-page execute permissions to all BookE and 40x but in fact, we've had that now for some time as a side effect of my previous rework in that area (and I didn't even know it :-) We would only enable execute permission if the page was cache clean and we would only cache clean it if we took and exec fault. Since we now enforce that the later only work if VM_EXEC is part of the VMA flags, we de-fact already enforce per-page execute permissions... Unless I missed something Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc32.h | 7 +- arch/powerpc/include/asm/pgtable-ppc64.h | 3 +- arch/powerpc/include/asm/pte-40x.h | 2 +- arch/powerpc/include/asm/pte-44x.h | 2 +- arch/powerpc/include/asm/pte-8xx.h | 1 - arch/powerpc/include/asm/pte-book3e.h | 13 +-- arch/powerpc/include/asm/pte-common.h | 22 ++-- arch/powerpc/include/asm/pte-fsl-booke.h | 2 +- arch/powerpc/include/asm/pte-hash32.h | 1 - arch/powerpc/kernel/head_44x.S | 2 +- arch/powerpc/kernel/head_fsl_booke.S | 4 +- arch/powerpc/mm/40x_mmu.c | 4 +- arch/powerpc/mm/pgtable.c | 167 +++++++++++++++++++++---------- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/mm/tlb_low_64e.S | 4 +- 15 files changed, 149 insertions(+), 87 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index c9ff9d75990..f2c52e25395 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -186,7 +186,7 @@ static inline unsigned long pte_update(pte_t *p, #endif /* !PTE_ATOMIC_UPDATES */ #ifdef CONFIG_44x - if ((old & _PAGE_USER) && (old & _PAGE_HWEXEC)) + if ((old & _PAGE_USER) && (old & _PAGE_EXEC)) icache_44x_need_flush = 1; #endif return old; @@ -217,7 +217,7 @@ static inline unsigned long long pte_update(pte_t *p, #endif /* !PTE_ATOMIC_UPDATES */ #ifdef CONFIG_44x - if ((old & _PAGE_USER) && (old & _PAGE_HWEXEC)) + if ((old & _PAGE_USER) && (old & _PAGE_EXEC)) icache_44x_need_flush = 1; #endif return old; @@ -267,8 +267,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) { unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | - _PAGE_HWEXEC | _PAGE_EXEC); + (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); pte_update(ptep, 0, bits); } diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 200ec2dfa03..806abe7a3fa 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -313,8 +313,7 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) { unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | - _PAGE_EXEC | _PAGE_HWEXEC); + (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); #ifdef PTE_ATOMIC_UPDATES unsigned long old, tmp; diff --git a/arch/powerpc/include/asm/pte-40x.h b/arch/powerpc/include/asm/pte-40x.h index 07630faae02..6c3e1f4378d 100644 --- a/arch/powerpc/include/asm/pte-40x.h +++ b/arch/powerpc/include/asm/pte-40x.h @@ -46,7 +46,7 @@ #define _PAGE_RW 0x040 /* software: Writes permitted */ #define _PAGE_DIRTY 0x080 /* software: dirty page */ #define _PAGE_HWWRITE 0x100 /* hardware: Dirty & RW, set in exception */ -#define _PAGE_HWEXEC 0x200 /* hardware: EX permission */ +#define _PAGE_EXEC 0x200 /* hardware: EX permission */ #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ #define _PMD_PRESENT 0x400 /* PMD points to page of PTEs */ diff --git a/arch/powerpc/include/asm/pte-44x.h b/arch/powerpc/include/asm/pte-44x.h index 37e98bcf83e..4192b9bad90 100644 --- a/arch/powerpc/include/asm/pte-44x.h +++ b/arch/powerpc/include/asm/pte-44x.h @@ -78,7 +78,7 @@ #define _PAGE_PRESENT 0x00000001 /* S: PTE valid */ #define _PAGE_RW 0x00000002 /* S: Write permission */ #define _PAGE_FILE 0x00000004 /* S: nonlinear file mapping */ -#define _PAGE_HWEXEC 0x00000004 /* H: Execute permission */ +#define _PAGE_EXEC 0x00000004 /* H: Execute permission */ #define _PAGE_ACCESSED 0x00000008 /* S: Page referenced */ #define _PAGE_DIRTY 0x00000010 /* S: Page dirty */ #define _PAGE_SPECIAL 0x00000020 /* S: Special page */ diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h index 8c6e3125103..94e979718dc 100644 --- a/arch/powerpc/include/asm/pte-8xx.h +++ b/arch/powerpc/include/asm/pte-8xx.h @@ -36,7 +36,6 @@ /* These five software bits must be masked out when the entry is loaded * into the TLB. */ -#define _PAGE_EXEC 0x0008 /* software: i-cache coherency required */ #define _PAGE_GUARDED 0x0010 /* software: guarded access */ #define _PAGE_DIRTY 0x0020 /* software: page changed */ #define _PAGE_RW 0x0040 /* software: user write access allowed */ diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 1d27c77d770..9800565aebb 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -37,12 +37,13 @@ #define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */ /* "Higher level" linux bit combinations */ -#define _PAGE_EXEC _PAGE_BAP_SX /* Can be executed from potentially */ -#define _PAGE_HWEXEC _PAGE_BAP_UX /* .. and was cache cleaned */ -#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ -#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) -#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) -#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ +#define _PAGE_EXEC _PAGE_BAP_UX /* .. and was cache cleaned */ +#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ +#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) +#define _PAGE_KERNEL_RWX (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY | _PAGE_BAP_SX) +#define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) +#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ #define _PAGE_HASHPTE 0 #define _PAGE_BUSY 0 diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 8bb6464ba61..c3b65076a26 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -13,9 +13,6 @@ #ifndef _PAGE_HWWRITE #define _PAGE_HWWRITE 0 #endif -#ifndef _PAGE_HWEXEC -#define _PAGE_HWEXEC 0 -#endif #ifndef _PAGE_EXEC #define _PAGE_EXEC 0 #endif @@ -48,10 +45,16 @@ #define PMD_PAGE_SIZE(pmd) bad_call_to_PMD_PAGE_SIZE() #endif #ifndef _PAGE_KERNEL_RO -#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_RO 0 +#endif +#ifndef _PAGE_KERNEL_ROX +#define _PAGE_KERNEL_ROX (_PAGE_EXEC) #endif #ifndef _PAGE_KERNEL_RW -#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE) +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE) +#endif +#ifndef _PAGE_KERNEL_RWX +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE | _PAGE_EXEC) #endif #ifndef _PAGE_HPTEFLAGS #define _PAGE_HPTEFLAGS _PAGE_HASHPTE @@ -96,8 +99,7 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ _PAGE_WRITETHRU | _PAGE_ENDIAN | _PAGE_4K_PFN | \ _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | \ - _PAGE_EXEC | _PAGE_HWEXEC) + _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) /* * We define 2 sets of base prot bits, one for basic pages (ie, @@ -154,11 +156,9 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); _PAGE_NO_CACHE) #define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ _PAGE_NO_CACHE | _PAGE_GUARDED) -#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW | _PAGE_EXEC | \ - _PAGE_HWEXEC) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO | _PAGE_EXEC | \ - _PAGE_HWEXEC) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) /* Protection used for kernel text. We want the debuggers to be able to * set breakpoints anywhere, so don't write protect the kernel text diff --git a/arch/powerpc/include/asm/pte-fsl-booke.h b/arch/powerpc/include/asm/pte-fsl-booke.h index 10820f58acf..ce8a9e94ce7 100644 --- a/arch/powerpc/include/asm/pte-fsl-booke.h +++ b/arch/powerpc/include/asm/pte-fsl-booke.h @@ -23,7 +23,7 @@ #define _PAGE_FILE 0x00002 /* S: when !present: nonlinear file mapping */ #define _PAGE_RW 0x00004 /* S: Write permission (SW) */ #define _PAGE_DIRTY 0x00008 /* S: Page dirty */ -#define _PAGE_HWEXEC 0x00010 /* H: SX permission */ +#define _PAGE_EXEC 0x00010 /* H: SX permission */ #define _PAGE_ACCESSED 0x00020 /* S: Page referenced */ #define _PAGE_ENDIAN 0x00040 /* H: E bit */ diff --git a/arch/powerpc/include/asm/pte-hash32.h b/arch/powerpc/include/asm/pte-hash32.h index 16e571c7f9e..4aad4132d0a 100644 --- a/arch/powerpc/include/asm/pte-hash32.h +++ b/arch/powerpc/include/asm/pte-hash32.h @@ -26,7 +26,6 @@ #define _PAGE_WRITETHRU 0x040 /* W: cache write-through */ #define _PAGE_DIRTY 0x080 /* C: page changed */ #define _PAGE_ACCESSED 0x100 /* R: page referenced */ -#define _PAGE_EXEC 0x200 /* software: i-cache coherency required */ #define _PAGE_RW 0x400 /* software: user write access allowed */ #define _PAGE_SPECIAL 0x800 /* software: Special page */ diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 656cfb2d666..711368b993f 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -497,7 +497,7 @@ tlb_44x_patch_hwater_D: mtspr SPRN_MMUCR,r12 /* Make up the required permissions */ - li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_HWEXEC + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC /* Compute pgdir/pmd offset */ rlwinm r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index eca80482ae7..2c5af525647 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -643,7 +643,7 @@ interrupt_base: 4: /* Make up the required permissions */ - li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_HWEXEC + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC FIND_PTE andc. r13,r13,r11 /* Check permission */ @@ -742,7 +742,7 @@ finish_tlb_load: #endif mtspr SPRN_MAS2, r12 - li r10, (_PAGE_HWEXEC | _PAGE_PRESENT) + li r10, (_PAGE_EXEC | _PAGE_PRESENT) rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ and r12, r11, r10 andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 29954dc2894..f5e7b9ce63d 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -105,7 +105,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_16M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp++) = val; @@ -120,7 +120,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_4M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp) = val; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index b6b32487e74..83f1551ec2c 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -128,28 +128,6 @@ void pte_free_finish(void) #endif /* CONFIG_SMP */ -/* - * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags() - */ -static pte_t do_dcache_icache_coherency(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - struct page *page; - - if (unlikely(!pfn_valid(pfn))) - return pte; - page = pfn_to_page(pfn); - - if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) { - pr_devel("do_dcache_icache_coherency... flushing\n"); - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } - else - pr_devel("do_dcache_icache_coherency... already clean\n"); - return __pte(pte_val(pte) | _PAGE_HWEXEC); -} - static inline int is_exec_fault(void) { return current->thread.regs && TRAP(current->thread.regs) == 0x400; @@ -157,49 +135,139 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE + * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) == - (_PAGE_PRESENT); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); } -#if defined(CONFIG_PPC_STD_MMU) +struct page * maybe_pte_to_page(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + struct page *page; + + if (unlikely(!pfn_valid(pfn))) + return NULL; + page = pfn_to_page(pfn); + if (PageReserved(page)) + return NULL; + return page; +} + +#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 + /* Server-style MMU handles coherency when hashing if HW exec permission - * is supposed per page (currently 64-bit only). Else, we always flush - * valid PTEs in set_pte. + * is supposed per page (currently 64-bit only). If not, then, we always + * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec + * support falls into the same category. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_pte_filter(pte_t pte) { - return set_pte && pte_looks_normal(pte) && - !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE)); + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct page *pg = maybe_pte_to_page(pte); + if (!pg) + return pte; + if (!test_bit(PG_arch_1, &pg->flags)) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + } + } + return pte; } -#elif _PAGE_HWEXEC == 0 -/* Embedded type MMU without HW exec support (8xx only so far), we flush - * the cache for any present PTE - */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) { - return set_pte && pte_looks_normal(pte); + return pte; } -#else -/* Other embedded CPUs with HW exec support per-page, we flush on exec - * fault if HWEXEC is not set + +#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ + +/* Embedded type MMU with HW exec support. This is a bit more complicated + * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so + * instead we "filter out" the exec permission for non clean pages. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) +static pte_t set_pte_filter(pte_t pte) { - return pte_looks_normal(pte) && is_exec_fault() && - !(pte_val(pte) & _PAGE_HWEXEC); + struct page *pg; + + /* No exec permission in the first place, move on */ + if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + return pte; + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + return pte; + + /* If the page clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + return pte; + + /* If it's an exec fault, we flush the cache and make it clean */ + if (is_exec_fault()) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + return pte; + } + + /* Else, we filter out _PAGE_EXEC */ + return __pte(pte_val(pte) & ~_PAGE_EXEC); } -#endif + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + struct page *pg; + + /* So here, we only care about exec faults, as we use them + * to recover lost _PAGE_EXEC and perform I$/D$ coherency + * if necessary. Also if _PAGE_EXEC is already set, same deal, + * we just bail out + */ + if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + return pte; + +#ifdef CONFIG_DEBUG_VM + /* So this is an exec fault, _PAGE_EXEC is not set. If it was + * an error we would have bailed out earlier in do_page_fault() + * but let's make sure of it + */ + if (WARN_ON(!(vma->vm_flags & VM_EXEC))) + return pte; +#endif /* CONFIG_DEBUG_VM */ + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + goto bail; + + /* If the page is already clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + goto bail; + + /* Clean the page and set PG_arch_1 */ + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + + bail: + return __pte(pte_val(pte) | _PAGE_EXEC); +} + +#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte) { #ifdef CONFIG_DEBUG_VM WARN_ON(pte_present(*ptep)); @@ -208,9 +276,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte * this context might not have been activated yet when this * is called. */ - pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_need_exec_flush(pte, 1)) - pte = do_dcache_icache_coherency(pte); + pte = set_pte_filter(pte); /* Perform the setting of the PTE */ __set_pte_at(mm, addr, ptep, pte, 0); @@ -227,8 +293,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed; - if (!dirty && pte_need_exec_flush(entry, 0)) - entry = do_dcache_icache_coherency(entry); + entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { if (!(vma->vm_flags & VM_HUGETLB)) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5422169626b..cb96cb2e17c 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -142,7 +142,7 @@ ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags) flags |= _PAGE_DIRTY | _PAGE_HWWRITE; /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC); + flags &= ~(_PAGE_USER | _PAGE_EXEC); return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 10d524ded7b..cd92f62f9cf 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -133,7 +133,7 @@ /* We do the user/kernel test for the PID here along with the RW test */ - li r11,_PAGE_PRESENT|_PAGE_HWEXEC /* Base perm */ + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h cmpldi cr0,r15,0 /* Check for user region */ @@ -256,7 +256,7 @@ normal_tlb_miss_done: normal_tlb_miss_access_fault: /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_HWEXEC + andi. r10,r11,_PAGE_EXEC bne 1f ld r14,EX_TLB_DEAR(r12) ld r15,EX_TLB_ESR(r12) -- cgit v1.2.3-70-g09d2 From 762afb7317b1987fa0851135fe4f2947f68c3c2a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:22 +0000 Subject: powerpc: Remove addr_needs_map in struct dma_mapping_ops This patch adds max_direct_dma_addr to struct dev_archdata to remove addr_needs_map in struct dma_mapping_ops. It also converts dma_capable() to use max_direct_dma_addr. max_direct_dma_addr is initialized in pci_dma_dev_setup_swiotlb(), called via ppc_md.pci_dma_dev_setup hook. For further information: http://marc.info/?t=124719060200001&r=1&w=2 Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/device.h | 3 +++ arch/powerpc/include/asm/dma-mapping.h | 8 +++---- arch/powerpc/include/asm/swiotlb.h | 5 ++--- arch/powerpc/kernel/dma-swiotlb.c | 36 +++++++++++++----------------- arch/powerpc/platforms/85xx/mpc8536_ds.c | 1 + arch/powerpc/platforms/85xx/mpc85xx_ds.c | 1 + arch/powerpc/platforms/85xx/mpc85xx_mds.c | 1 + arch/powerpc/platforms/86xx/mpc86xx_hpcn.c | 1 + 8 files changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 7d2277cef09..0086f8d46f1 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -16,6 +16,9 @@ struct dev_archdata { /* DMA operations on that device */ struct dma_mapping_ops *dma_ops; void *dma_data; +#ifdef CONFIG_SWIOTLB + dma_addr_t max_direct_dma_addr; +#endif }; static inline void dev_archdata_set_node(struct dev_archdata *ad, diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 0c34371ec49..1765c379138 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -87,8 +87,6 @@ struct dma_mapping_ops { dma_addr_t dma_address, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs); - int (*addr_needs_map)(struct device *dev, dma_addr_t addr, - size_t size); #ifdef CONFIG_PPC_NEED_DMA_SYNC_OPS void (*sync_single_range_for_cpu)(struct device *hwdev, dma_addr_t dma_handle, unsigned long offset, @@ -426,10 +424,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { - struct dma_mapping_ops *ops = get_dma_ops(dev); +#ifdef CONFIG_SWIOTLB + struct dev_archdata *sd = &dev->archdata; - if (ops->addr_needs_map && ops->addr_needs_map(dev, addr, size)) + if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr) return 0; +#endif if (!dev->dma_mask) return 0; diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 30891d6e2bc..31e0e43c880 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -16,12 +16,11 @@ extern struct dma_mapping_ops swiotlb_dma_ops; extern struct dma_mapping_ops swiotlb_pci_dma_ops; -int swiotlb_arch_address_needs_mapping(struct device *, dma_addr_t, - size_t size); - static inline void dma_mark_clean(void *addr, size_t size) {} extern unsigned int ppc_swiotlb_enable; int __init swiotlb_setup_bus_notifier(void); +extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev); + #endif /* __ASM_SWIOTLB_H */ diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index e8a57de85bc..c9f6a302e87 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -24,26 +24,6 @@ int swiotlb __read_mostly; unsigned int ppc_swiotlb_enable; -/* - * Determine if an address is reachable by a pci device, or if we must bounce. - */ -static int -swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size) -{ - dma_addr_t max; - struct pci_controller *hose; - struct pci_dev *pdev = to_pci_dev(hwdev); - - hose = pci_bus_to_host(pdev->bus); - max = hose->dma_window_base_cur + hose->dma_window_size; - - /* check that we're within mapped pci window space */ - if ((addr + size > max) | (addr < hose->dma_window_base_cur)) - return 1; - - return 0; -} - /* * At the moment, all platforms that use this code only require * swiotlb to be used if we're operating on HIGHMEM. Since @@ -73,22 +53,36 @@ struct dma_mapping_ops swiotlb_pci_dma_ops = { .dma_supported = swiotlb_dma_supported, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, - .addr_needs_map = swiotlb_pci_addr_needs_map, .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device }; +void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) +{ + struct pci_controller *hose; + struct dev_archdata *sd; + + hose = pci_bus_to_host(pdev->bus); + sd = &pdev->dev.archdata; + sd->max_direct_dma_addr = + hose->dma_window_base_cur + hose->dma_window_size; +} + static int ppc_swiotlb_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; + struct dev_archdata *sd; /* We are only intereted in device addition */ if (action != BUS_NOTIFY_ADD_DEVICE) return 0; + sd = &dev->archdata; + sd->max_direct_dma_addr = 0; + /* May need to bounce if the device can't address all of DRAM */ if (dma_get_mask(dev) < lmb_end_of_DRAM()) set_dma_ops(dev, &swiotlb_dma_ops); diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c index 055ff417bae..bf052c05610 100644 --- a/arch/powerpc/platforms/85xx/mpc8536_ds.c +++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -97,6 +97,7 @@ static void __init mpc8536_ds_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index 849c0ac0025..c6f92ccd963 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -193,6 +193,7 @@ static void __init mpc85xx_ds_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 20a61d0af33..25998b661f2 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -256,6 +256,7 @@ static void __init mpc85xx_mds_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif } diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c index 66327024a6a..80323015687 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c @@ -106,6 +106,7 @@ mpc86xx_hpcn_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif } -- cgit v1.2.3-70-g09d2 From 3702977fa7d1a1a95caa387121fa7c9f4cae35f3 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:23 +0000 Subject: powerpc: Remove swiotlb_pci_dma_ops Now swiotlb_pci_dma_ops is identical to swiotlb_dma_ops; we can use swiotlb_dma_ops with any devices. This removes swiotlb_pci_dma_ops. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/swiotlb.h | 1 - arch/powerpc/kernel/dma-swiotlb.c | 14 -------------- arch/powerpc/platforms/85xx/mpc8536_ds.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_ds.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_mds.c | 2 +- arch/powerpc/platforms/86xx/mpc86xx_hpcn.c | 2 +- 6 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 31e0e43c880..21ce0a3b494 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -14,7 +14,6 @@ #include extern struct dma_mapping_ops swiotlb_dma_ops; -extern struct dma_mapping_ops swiotlb_pci_dma_ops; static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index c9f6a302e87..ca141e108ae 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -45,20 +45,6 @@ struct dma_mapping_ops swiotlb_dma_ops = { .sync_sg_for_device = swiotlb_sync_sg_for_device }; -struct dma_mapping_ops swiotlb_pci_dma_ops = { - .alloc_coherent = dma_direct_alloc_coherent, - .free_coherent = dma_direct_free_coherent, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .dma_supported = swiotlb_dma_supported, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, - .sync_single_range_for_device = swiotlb_sync_single_range_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device -}; - void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) { struct pci_controller *hose; diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c index bf052c05610..004b7d36cdb 100644 --- a/arch/powerpc/platforms/85xx/mpc8536_ds.c +++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -96,7 +96,7 @@ static void __init mpc8536_ds_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index c6f92ccd963..544011a562f 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -192,7 +192,7 @@ static void __init mpc85xx_ds_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 25998b661f2..3909d57b86e 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -255,7 +255,7 @@ static void __init mpc85xx_mds_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c index 80323015687..2aa69a69bcc 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c @@ -105,7 +105,7 @@ mpc86xx_hpcn_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif -- cgit v1.2.3-70-g09d2 From 45223c549273bbb2c6e1bc6e3629174e8765ad01 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:25 +0000 Subject: powerpc: use dma_map_ops struct This converts uses dma_map_ops struct (in include/linux/dma-mapping.h) instead of POWERPC homegrown dma_mapping_ops. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/device.h | 4 +- arch/powerpc/include/asm/dma-mapping.h | 84 +++++++++------------------------ arch/powerpc/include/asm/pci.h | 4 +- arch/powerpc/include/asm/swiotlb.h | 2 +- arch/powerpc/kernel/dma-iommu.c | 2 +- arch/powerpc/kernel/dma-swiotlb.c | 2 +- arch/powerpc/kernel/dma.c | 2 +- arch/powerpc/kernel/ibmebus.c | 2 +- arch/powerpc/kernel/pci-common.c | 6 +-- arch/powerpc/kernel/vio.c | 2 +- arch/powerpc/platforms/cell/iommu.c | 2 +- arch/powerpc/platforms/ps3/system-bus.c | 4 +- 12 files changed, 37 insertions(+), 79 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 0086f8d46f1..67fcd7f89d9 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -6,7 +6,7 @@ #ifndef _ASM_POWERPC_DEVICE_H #define _ASM_POWERPC_DEVICE_H -struct dma_mapping_ops; +struct dma_map_ops; struct device_node; struct dev_archdata { @@ -14,7 +14,7 @@ struct dev_archdata { struct device_node *of_node; /* DMA operations on that device */ - struct dma_mapping_ops *dma_ops; + struct dma_map_ops *dma_ops; void *dma_data; #ifdef CONFIG_SWIOTLB dma_addr_t max_direct_dma_addr; diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 1765c379138..8ca2b5183c5 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -63,57 +63,15 @@ static inline unsigned long device_to_mask(struct device *dev) return 0xfffffffful; } -/* - * DMA operations are abstracted for G5 vs. i/pSeries, PCI vs. VIO - */ -struct dma_mapping_ops { - void * (*alloc_coherent)(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag); - void (*free_coherent)(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - int (*map_sg)(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*unmap_sg)(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs); - int (*dma_supported)(struct device *dev, u64 mask); - int (*set_dma_mask)(struct device *dev, u64 dma_mask); - dma_addr_t (*map_page)(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*unmap_page)(struct device *dev, - dma_addr_t dma_address, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs); -#ifdef CONFIG_PPC_NEED_DMA_SYNC_OPS - void (*sync_single_range_for_cpu)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, - enum dma_data_direction direction); - void (*sync_single_range_for_device)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, - enum dma_data_direction direction); - void (*sync_sg_for_cpu)(struct device *hwdev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction); - void (*sync_sg_for_device)(struct device *hwdev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction); -#endif -}; - /* * Available generic sets of operations */ #ifdef CONFIG_PPC64 -extern struct dma_mapping_ops dma_iommu_ops; +extern struct dma_map_ops dma_iommu_ops; #endif -extern struct dma_mapping_ops dma_direct_ops; +extern struct dma_map_ops dma_direct_ops; -static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { /* We don't handle the NULL dev case for ISA for now. We could * do it via an out of line call but it is not needed for now. The @@ -126,14 +84,14 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) return dev->archdata.dma_ops; } -static inline void set_dma_ops(struct device *dev, struct dma_mapping_ops *ops) +static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) { dev->archdata.dma_ops = ops; } static inline int dma_supported(struct device *dev, u64 mask) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); if (unlikely(dma_ops == NULL)) return 0; @@ -147,7 +105,7 @@ static inline int dma_supported(struct device *dev, u64 mask) static inline int dma_set_mask(struct device *dev, u64 dma_mask) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); if (unlikely(dma_ops == NULL)) return -EIO; @@ -161,7 +119,7 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask) /* * map_/unmap_single actually call through to map/unmap_page now that all the - * dma_mapping_ops have been converted over. We just have to get the page and + * dma_map_ops have been converted over. We just have to get the page and * offset to pass through to map_page */ static inline dma_addr_t dma_map_single_attrs(struct device *dev, @@ -170,7 +128,7 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -185,7 +143,7 @@ static inline void dma_unmap_single_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -198,7 +156,7 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -211,7 +169,7 @@ static inline void dma_unmap_page_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -222,7 +180,7 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); return dma_ops->map_sg(dev, sg, nents, direction, attrs); @@ -234,7 +192,7 @@ static inline void dma_unmap_sg_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); dma_ops->unmap_sg(dev, sg, nhwentries, direction, attrs); @@ -243,7 +201,7 @@ static inline void dma_unmap_sg_attrs(struct device *dev, static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); return dma_ops->alloc_coherent(dev, size, dma_handle, flag); @@ -252,7 +210,7 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size, static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); @@ -304,7 +262,7 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -317,7 +275,7 @@ static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -330,7 +288,7 @@ static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -342,7 +300,7 @@ static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -354,7 +312,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -367,7 +325,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index d9483c504d2..7ae46d7e270 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -61,8 +61,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) } #ifdef CONFIG_PCI -extern void set_pci_dma_ops(struct dma_mapping_ops *dma_ops); -extern struct dma_mapping_ops *get_pci_dma_ops(void); +extern void set_pci_dma_ops(struct dma_map_ops *dma_ops); +extern struct dma_map_ops *get_pci_dma_ops(void); #else /* CONFIG_PCI */ #define set_pci_dma_ops(d) #define get_pci_dma_ops() NULL diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 21ce0a3b494..8979d4cd3d7 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -13,7 +13,7 @@ #include -extern struct dma_mapping_ops swiotlb_dma_ops; +extern struct dma_map_ops swiotlb_dma_ops; static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 2983adac8cc..87ddb3fb948 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -89,7 +89,7 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 1; } -struct dma_mapping_ops dma_iommu_ops = { +struct dma_map_ops dma_iommu_ops = { .alloc_coherent = dma_iommu_alloc_coherent, .free_coherent = dma_iommu_free_coherent, .map_sg = dma_iommu_map_sg, diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index ca141e108ae..d1143a68d82 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -31,7 +31,7 @@ unsigned int ppc_swiotlb_enable; * map_page, and unmap_page on highmem, use normal dma_ops * for everything else. */ -struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_map_ops swiotlb_dma_ops = { .alloc_coherent = dma_direct_alloc_coherent, .free_coherent = dma_direct_free_coherent, .map_sg = swiotlb_map_sg_attrs, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index ccf129d47d8..c61f70e145a 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -140,7 +140,7 @@ static inline void dma_direct_sync_single_range(struct device *dev, } #endif -struct dma_mapping_ops dma_direct_ops = { +struct dma_map_ops dma_direct_ops = { .alloc_coherent = dma_direct_alloc_coherent, .free_coherent = dma_direct_free_coherent, .map_sg = dma_direct_map_sg, diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index 6e3f6249365..a4c8b38b0ba 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -127,7 +127,7 @@ static int ibmebus_dma_supported(struct device *dev, u64 mask) return 1; } -static struct dma_mapping_ops ibmebus_dma_ops = { +static struct dma_map_ops ibmebus_dma_ops = { .alloc_coherent = ibmebus_alloc_coherent, .free_coherent = ibmebus_free_coherent, .map_sg = ibmebus_map_sg, diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 5a56e97c5ac..7585f1fc26d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -50,14 +50,14 @@ resource_size_t isa_mem_base; unsigned int ppc_pci_flags = 0; -static struct dma_mapping_ops *pci_dma_ops = &dma_direct_ops; +static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; -void set_pci_dma_ops(struct dma_mapping_ops *dma_ops) +void set_pci_dma_ops(struct dma_map_ops *dma_ops) { pci_dma_ops = dma_ops; } -struct dma_mapping_ops *get_pci_dma_ops(void) +struct dma_map_ops *get_pci_dma_ops(void) { return pci_dma_ops; } diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 819e59f6f7c..bc7b41edbdf 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -601,7 +601,7 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, vio_cmo_dealloc(viodev, alloc_size); } -struct dma_mapping_ops vio_dma_mapping_ops = { +struct dma_map_ops vio_dma_mapping_ops = { .alloc_coherent = vio_dma_iommu_alloc_coherent, .free_coherent = vio_dma_iommu_free_coherent, .map_sg = vio_dma_iommu_map_sg, diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 5b34fc211f3..416db17eb18 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -642,7 +642,7 @@ static int dma_fixed_dma_supported(struct device *dev, u64 mask) static int dma_set_mask_and_switch(struct device *dev, u64 dma_mask); -struct dma_mapping_ops dma_iommu_fixed_ops = { +struct dma_map_ops dma_iommu_fixed_ops = { .alloc_coherent = dma_fixed_alloc_coherent, .free_coherent = dma_fixed_free_coherent, .map_sg = dma_fixed_map_sg, diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 676f989ed4e..e34b305a7a5 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -694,7 +694,7 @@ static int ps3_dma_supported(struct device *_dev, u64 mask) return mask >= DMA_BIT_MASK(32); } -static struct dma_mapping_ops ps3_sb_dma_ops = { +static struct dma_map_ops ps3_sb_dma_ops = { .alloc_coherent = ps3_alloc_coherent, .free_coherent = ps3_free_coherent, .map_sg = ps3_sb_map_sg, @@ -704,7 +704,7 @@ static struct dma_mapping_ops ps3_sb_dma_ops = { .unmap_page = ps3_unmap_page, }; -static struct dma_mapping_ops ps3_ioc0_dma_ops = { +static struct dma_map_ops ps3_ioc0_dma_ops = { .alloc_coherent = ps3_alloc_coherent, .free_coherent = ps3_free_coherent, .map_sg = ps3_ioc0_map_sg, -- cgit v1.2.3-70-g09d2 From 46bab4e4b45ec522ecd5fa4a0e2b4a6e6d1f153a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:26 +0000 Subject: powerpc: Use asm-generic/dma-mapping-common.h Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 6 +- arch/powerpc/include/asm/dma-mapping.h | 242 +-------------------------------- 2 files changed, 7 insertions(+), 241 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4c0747e8ed7..6078253c6d7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -120,7 +120,7 @@ config PPC select HAVE_KRETPROBES select HAVE_ARCH_TRACEHOOK select HAVE_LMB - select HAVE_DMA_ATTRS if PPC64 + select HAVE_DMA_ATTRS select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 @@ -307,10 +307,6 @@ config SWIOTLB platforms where the size of a physical address is larger than the bus address. Not all platforms support this. -config PPC_NEED_DMA_SYNC_OPS - def_bool y - depends on (NOT_COHERENT_CACHE || SWIOTLB) - config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 8ca2b5183c5..91217e4a0bf 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -89,6 +90,11 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) dev->archdata.dma_ops = ops; } +/* this will be removed soon */ +#define flush_write_buffers() + +#include + static inline int dma_supported(struct device *dev, u64 mask) { struct dma_map_ops *dma_ops = get_dma_ops(dev); @@ -117,87 +123,6 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask) return 0; } -/* - * map_/unmap_single actually call through to map/unmap_page now that all the - * dma_map_ops have been converted over. We just have to get the page and - * offset to pass through to map_page - */ -static inline dma_addr_t dma_map_single_attrs(struct device *dev, - void *cpu_addr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - return dma_ops->map_page(dev, virt_to_page(cpu_addr), - (unsigned long)cpu_addr % PAGE_SIZE, size, - direction, attrs); -} - -static inline void dma_unmap_single_attrs(struct device *dev, - dma_addr_t dma_addr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - dma_ops->unmap_page(dev, dma_addr, size, direction, attrs); -} - -static inline dma_addr_t dma_map_page_attrs(struct device *dev, - struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - return dma_ops->map_page(dev, page, offset, size, direction, attrs); -} - -static inline void dma_unmap_page_attrs(struct device *dev, - dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - dma_ops->unmap_page(dev, dma_address, size, direction, attrs); -} - -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - return dma_ops->map_sg(dev, sg, nents, direction, attrs); -} - -static inline void dma_unmap_sg_attrs(struct device *dev, - struct scatterlist *sg, - int nhwentries, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - dma_ops->unmap_sg(dev, sg, nhwentries, direction, attrs); -} - static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { @@ -216,161 +141,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size, dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); } -static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, - size_t size, - enum dma_data_direction direction) -{ - return dma_map_single_attrs(dev, cpu_addr, size, direction, NULL); -} - -static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, - enum dma_data_direction direction) -{ - dma_unmap_single_attrs(dev, dma_addr, size, direction, NULL); -} - -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - return dma_map_page_attrs(dev, page, offset, size, direction, NULL); -} - -static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction) -{ - dma_unmap_page_attrs(dev, dma_address, size, direction, NULL); -} - -static inline int dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction) -{ - return dma_map_sg_attrs(dev, sg, nents, direction, NULL); -} - -static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nhwentries, - enum dma_data_direction direction) -{ - dma_unmap_sg_attrs(dev, sg, nhwentries, direction, NULL); -} - -#ifdef CONFIG_PPC_NEED_DMA_SYNC_OPS -static inline void dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_cpu) - dma_ops->sync_single_range_for_cpu(dev, dma_handle, 0, - size, direction); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_device) - dma_ops->sync_single_range_for_device(dev, dma_handle, - 0, size, direction); -} - -static inline void dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_sg_for_cpu) - dma_ops->sync_sg_for_cpu(dev, sgl, nents, direction); -} - -static inline void dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_sg_for_device) - dma_ops->sync_sg_for_device(dev, sgl, nents, direction); -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_cpu) - dma_ops->sync_single_range_for_cpu(dev, dma_handle, - offset, size, direction); -} - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_device) - dma_ops->sync_single_range_for_device(dev, dma_handle, offset, - size, direction); -} -#else /* CONFIG_PPC_NEED_DMA_SYNC_OPS */ -static inline void dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} -#endif - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { #ifdef CONFIG_PPC64 -- cgit v1.2.3-70-g09d2 From 4a9a6bfe707cfe5bcb0a20eabe240293a095cd10 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:27 +0000 Subject: powerpc: Handle SWIOTLB mapping error properly Signed-off-by: FUJITA Tomonori Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/dma-mapping.h | 5 +++++ arch/powerpc/kernel/dma-swiotlb.c | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 91217e4a0bf..4bd41b4051e 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -143,6 +143,11 @@ static inline void dma_free_coherent(struct device *dev, size_t size, static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops->mapping_error) + return dma_ops->mapping_error(dev, dma_addr); + #ifdef CONFIG_PPC64 return (dma_addr == DMA_ERROR_CODE); #else diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index d1143a68d82..e96cbbd9b44 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -42,7 +42,8 @@ struct dma_map_ops swiotlb_dma_ops = { .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .mapping_error = swiotlb_dma_mapping_error, }; void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) -- cgit v1.2.3-70-g09d2 From 80d3e8abb73dad3983fef2597b52cab8fbcd876b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:28 +0000 Subject: powerpc: Add CONFIG_DMA_API_DEBUG support Signed-off-by: FUJITA Tomonori Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/dma-mapping.h | 11 ++++++++++- arch/powerpc/kernel/dma.c | 11 +++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6078253c6d7..9e03991dc87 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -121,6 +121,7 @@ config PPC select HAVE_ARCH_TRACEHOOK select HAVE_LMB select HAVE_DMA_ATTRS + select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 4bd41b4051e..cb2ca41dd52 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -127,9 +127,15 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { struct dma_map_ops *dma_ops = get_dma_ops(dev); + void *cpu_addr; BUG_ON(!dma_ops); - return dma_ops->alloc_coherent(dev, size, dma_handle, flag); + + cpu_addr = dma_ops->alloc_coherent(dev, size, dma_handle, flag); + + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + + return cpu_addr; } static inline void dma_free_coherent(struct device *dev, size_t size, @@ -138,6 +144,9 @@ static inline void dma_free_coherent(struct device *dev, size_t size, struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); + + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); + dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); } diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index c61f70e145a..21b784d7e7d 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -156,3 +157,13 @@ struct dma_map_ops dma_direct_ops = { #endif }; EXPORT_SYMBOL(dma_direct_ops); + +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) + +static int __init dma_init(void) +{ + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + + return 0; +} +fs_initcall(dma_init); -- cgit v1.2.3-70-g09d2 From e3e1d15855206c85f4c9ed82746e81acfe13e5aa Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Mon, 24 Aug 2009 06:15:36 +0000 Subject: powerpc: Name xpn & x fields in HW Hash PTE format Previously, the 36-bit code was using these bits, but they had never been named in the pte format definition. This patch just gives those fields their proper names and adds a comment that they are only present on some processors. There is no functional code change. Signed-off-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-hash32.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu-hash32.h b/arch/powerpc/include/asm/mmu-hash32.h index 382fc689f20..16f513e5cbd 100644 --- a/arch/powerpc/include/asm/mmu-hash32.h +++ b/arch/powerpc/include/asm/mmu-hash32.h @@ -55,21 +55,25 @@ struct ppc_bat { #ifndef __ASSEMBLY__ -/* Hardware Page Table Entry */ +/* + * Hardware Page Table Entry + * Note that the xpn and x bitfields are used only by processors that + * support extended addressing; otherwise, those bits are reserved. + */ struct hash_pte { unsigned long v:1; /* Entry is valid */ unsigned long vsid:24; /* Virtual segment identifier */ unsigned long h:1; /* Hash algorithm indicator */ unsigned long api:6; /* Abbreviated page index */ unsigned long rpn:20; /* Real (physical) page number */ - unsigned long :3; /* Unused */ + unsigned long xpn:3; /* Real page number bits 0-2, optional */ unsigned long r:1; /* Referenced */ unsigned long c:1; /* Changed */ unsigned long w:1; /* Write-thru cache mode */ unsigned long i:1; /* Cache inhibited */ unsigned long m:1; /* Memory coherence */ unsigned long g:1; /* Guarded */ - unsigned long :1; /* Unused */ + unsigned long x:1; /* Real page number bit 3, optional */ unsigned long pp:2; /* Page protection */ }; -- cgit v1.2.3-70-g09d2 From 23e55f92d4fd733365dd572ea6e9e211387123c2 Mon Sep 17 00:00:00 2001 From: Michael Wolf Date: Thu, 20 Aug 2009 13:21:45 +0000 Subject: powerpc: Adjust base and index registers in Altivec macros On POWER6 systems RA needs to be the base and RB the index. If they are reversed you take a misdirect hit. Signed-off-by: Mike Wolf ---- Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/ppc_asm.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index dfae6e916df..498fe09263d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -98,13 +98,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ #define REST_16FPRS(n, base) REST_8FPRS(n, base); REST_8FPRS(n+8, base) #define REST_32FPRS(n, base) REST_16FPRS(n, base); REST_16FPRS(n+16, base) -#define SAVE_VR(n,b,base) li b,THREAD_VR0+(16*(n)); stvx n,b,base +#define SAVE_VR(n,b,base) li b,THREAD_VR0+(16*(n)); stvx n,base,b #define SAVE_2VRS(n,b,base) SAVE_VR(n,b,base); SAVE_VR(n+1,b,base) #define SAVE_4VRS(n,b,base) SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base) #define SAVE_8VRS(n,b,base) SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base) #define SAVE_16VRS(n,b,base) SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base) #define SAVE_32VRS(n,b,base) SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base) -#define REST_VR(n,b,base) li b,THREAD_VR0+(16*(n)); lvx n,b,base +#define REST_VR(n,b,base) li b,THREAD_VR0+(16*(n)); lvx n,base,b #define REST_2VRS(n,b,base) REST_VR(n,b,base); REST_VR(n+1,b,base) #define REST_4VRS(n,b,base) REST_2VRS(n,b,base); REST_2VRS(n+2,b,base) #define REST_8VRS(n,b,base) REST_4VRS(n,b,base); REST_4VRS(n+4,b,base) @@ -112,26 +112,26 @@ END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ #define REST_32VRS(n,b,base) REST_16VRS(n,b,base); REST_16VRS(n+16,b,base) /* Save the lower 32 VSRs in the thread VSR region */ -#define SAVE_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); STXVD2X(n,b,base) +#define SAVE_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); STXVD2X(n,base,b) #define SAVE_2VSRS(n,b,base) SAVE_VSR(n,b,base); SAVE_VSR(n+1,b,base) #define SAVE_4VSRS(n,b,base) SAVE_2VSRS(n,b,base); SAVE_2VSRS(n+2,b,base) #define SAVE_8VSRS(n,b,base) SAVE_4VSRS(n,b,base); SAVE_4VSRS(n+4,b,base) #define SAVE_16VSRS(n,b,base) SAVE_8VSRS(n,b,base); SAVE_8VSRS(n+8,b,base) #define SAVE_32VSRS(n,b,base) SAVE_16VSRS(n,b,base); SAVE_16VSRS(n+16,b,base) -#define REST_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); LXVD2X(n,b,base) +#define REST_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); LXVD2X(n,base,b) #define REST_2VSRS(n,b,base) REST_VSR(n,b,base); REST_VSR(n+1,b,base) #define REST_4VSRS(n,b,base) REST_2VSRS(n,b,base); REST_2VSRS(n+2,b,base) #define REST_8VSRS(n,b,base) REST_4VSRS(n,b,base); REST_4VSRS(n+4,b,base) #define REST_16VSRS(n,b,base) REST_8VSRS(n,b,base); REST_8VSRS(n+8,b,base) #define REST_32VSRS(n,b,base) REST_16VSRS(n,b,base); REST_16VSRS(n+16,b,base) /* Save the upper 32 VSRs (32-63) in the thread VSX region (0-31) */ -#define SAVE_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); STXVD2X(n+32,b,base) +#define SAVE_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); STXVD2X(n+32,base,b) #define SAVE_2VSRSU(n,b,base) SAVE_VSRU(n,b,base); SAVE_VSRU(n+1,b,base) #define SAVE_4VSRSU(n,b,base) SAVE_2VSRSU(n,b,base); SAVE_2VSRSU(n+2,b,base) #define SAVE_8VSRSU(n,b,base) SAVE_4VSRSU(n,b,base); SAVE_4VSRSU(n+4,b,base) #define SAVE_16VSRSU(n,b,base) SAVE_8VSRSU(n,b,base); SAVE_8VSRSU(n+8,b,base) #define SAVE_32VSRSU(n,b,base) SAVE_16VSRSU(n,b,base); SAVE_16VSRSU(n+16,b,base) -#define REST_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); LXVD2X(n+32,b,base) +#define REST_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); LXVD2X(n+32,base,b) #define REST_2VSRSU(n,b,base) REST_VSRU(n,b,base); REST_VSRU(n+1,b,base) #define REST_4VSRSU(n,b,base) REST_2VSRSU(n,b,base); REST_2VSRSU(n+2,b,base) #define REST_8VSRSU(n,b,base) REST_4VSRSU(n,b,base); REST_4VSRSU(n+4,b,base) -- cgit v1.2.3-70-g09d2 From df5d6ecf8157245ef733db87597adb2c6e2510da Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 24 Aug 2009 15:52:48 +0000 Subject: powerpc/mm: Add MMU features for TLB reservation & Paired MAS registers Support for TLB reservation (or TLB Write Conditional) and Paired MAS registers are optional for a processor implementation so we handle them via MMU feature sections. We currently only used paired MAS registers to access the full RPN + perm bits that are kept in MAS7||MAS3. We assume that if an implementation has hardware page table at this time it also implements in TLB reservations. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu.h | 9 +++++++++ arch/powerpc/mm/tlb_low_64e.S | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 2fcfefc6089..7ffbb65ff7a 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -58,6 +58,15 @@ */ #define MMU_FTR_TLBIE_206 ASM_CONST(0x00400000) +/* Enable use of TLB reservation. Processor should support tlbsrx. + * instruction and MAS0[WQ]. + */ +#define MMU_FTR_USE_TLBRSRV ASM_CONST(0x00800000) + +/* Use paired MAS registers (MAS7||MAS3, etc.) + */ +#define MMU_FTR_USE_PAIRED_MAS ASM_CONST(0x01000000) + #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index cd92f62f9cf..ef1cccf7117 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -189,12 +189,16 @@ normal_tlb_miss: clrrdi r14,r14,3 or r10,r15,r14 +BEGIN_MMU_FTR_SECTION /* Set the TLB reservation and seach for existing entry. Then load * the entry. */ PPC_TLBSRX_DOT(0,r16) ld r14,0(r10) beq normal_tlb_miss_done +MMU_FTR_SECTION_ELSE + ld r14,0(r10) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) finish_normal_tlb_miss: /* Check if required permissions are met */ @@ -241,7 +245,14 @@ finish_normal_tlb_miss: bne 1f li r11,MAS3_SW|MAS3_UW andc r15,r15,r11 -1: mtspr SPRN_MAS7_MAS3,r15 +1: +BEGIN_MMU_FTR_SECTION + srdi r16,r15,32 + mtspr SPRN_MAS3,r15 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r15 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -311,11 +322,13 @@ virt_page_table_tlb_miss: rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 1: +BEGIN_MMU_FTR_SECTION /* Search if we already have a TLB entry for that virtual address, and * if we do, bail out. */ PPC_TLBSRX_DOT(0,r16) beq virt_page_table_tlb_miss_done +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Now, we need to walk the page tables. First check if we are in * range. @@ -367,10 +380,18 @@ virt_page_table_tlb_miss: */ clrldi r11,r15,4 /* remove region ID from RPN */ ori r10,r11,1 /* Or-in SR */ + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe +BEGIN_MMU_FTR_SECTION virt_page_table_tlb_miss_done: /* We have overriden MAS2:EPN but currently our primary TLB miss @@ -394,6 +415,7 @@ virt_page_table_tlb_miss_done: addi r10,r11,-4 std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) 1: +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Return to caller, normal case */ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); TLB_MISS_EPILOG_SUCCESS @@ -618,7 +640,14 @@ htw_tlb_miss: #else ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) #endif + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -700,7 +729,14 @@ tlb_load_linear: clrrdi r10,r16,30 /* 1G page index */ clrldi r10,r10,4 /* clear region bits */ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe -- cgit v1.2.3-70-g09d2 From 4b98d9e713a03bd79ced8800e24a56359f9effbf Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 19:08:32 +0000 Subject: powerpc/book3e-64: Add helper function to setup IVORs Not all 64-bit Book-3E parts will have fixed IVORs so add a function that cpusetup code can call to setup the base IVORs (0..15) to match the fixed offsets. We need to 'or' part of interrupt_base_book3e into the IVORs since on parts that have them the IVPR doesn't extend as far down. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64e.h | 4 ++++ arch/powerpc/kernel/exceptions-64e.S | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index 94cb3d79d12..6d53f311d94 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -196,6 +196,10 @@ exc_##label##_book3e: #define TLB_MISS_STATS_SAVE_INFO #endif +#define SET_IVOR(vector_number, vector_offset) \ + li r3,vector_offset@l; \ + ori r3,r3,interrupt_base_book3e@l; \ + mtspr SPRN_IVOR##vector_number,r3; #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 3611b0e7d46..662236c7224 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -782,5 +782,24 @@ _STATIC(init_thread_book3e) blr +_GLOBAL(__setup_base_ivors) + SET_IVOR(0, 0x020) /* Critical Input */ + SET_IVOR(1, 0x000) /* Machine Check */ + SET_IVOR(2, 0x060) /* Data Storage */ + SET_IVOR(3, 0x080) /* Instruction Storage */ + SET_IVOR(4, 0x0a0) /* External Input */ + SET_IVOR(5, 0x0c0) /* Alignment */ + SET_IVOR(6, 0x0e0) /* Program */ + SET_IVOR(7, 0x100) /* FP Unavailable */ + SET_IVOR(8, 0x120) /* System Call */ + SET_IVOR(9, 0x140) /* Auxiliary Processor Unavailable */ + SET_IVOR(10, 0x160) /* Decrementer */ + SET_IVOR(11, 0x180) /* Fixed Interval Timer */ + SET_IVOR(12, 0x1a0) /* Watchdog Timer */ + SET_IVOR(13, 0x1c0) /* Data TLB Error */ + SET_IVOR(14, 0x1e0) /* Instruction TLB Error */ + SET_IVOR(15, 0x040) /* Debug */ + sync + blr -- cgit v1.2.3-70-g09d2 From bb1af71ecbfdbecbe9f7e43f703da5840b76c2e4 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 19:08:33 +0000 Subject: powerpc/book3e-64: Add support to initial_tlb_book3e for non-HES TLB We now search through TLBnCFG looking for the first array that has IPROT support (we assume that there is only one). If that TLB has hardware entry select (HES) support we use the existing code and with the proper TLB select (the HES code still needs to clean up bolted entries from firmware). The non-HES code is pretty similiar to the 32-bit FSL Book-E code but does make some new assumtions (like that we have tlbilx) and simplifies things down a bit. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/reg_booke.h | 2 + arch/powerpc/kernel/exceptions-64e.S | 204 ++++++++++++++++++++++++++++++++++- 2 files changed, 202 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 9bb81d99b76..3bf78350552 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -108,6 +108,8 @@ #define SPRN_PID2 0x27A /* Process ID Register 2 */ #define SPRN_TLB0CFG 0x2B0 /* TLB 0 Config Register */ #define SPRN_TLB1CFG 0x2B1 /* TLB 1 Config Register */ +#define SPRN_TLB2CFG 0x2B2 /* TLB 2 Config Register */ +#define SPRN_TLB3CFG 0x2B3 /* TLB 3 Config Register */ #define SPRN_EPR 0x2BE /* External Proxy Register */ #define SPRN_CCR1 0x378 /* Core Configuration Register 1 */ #define SPRN_ZPR 0x3B0 /* Zone Protection Register (40x) */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 662236c7224..9048f96237f 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -616,18 +616,214 @@ bad_stack_book3e: * Setup the initial TLB for a core. This current implementation * assume that whatever we are running off will not conflict with * the new mapping at PAGE_OFFSET. - * We also make various assumptions about the processor we run on, - * this might have to be made more flexible based on the content - * of MMUCFG and friends. */ _GLOBAL(initial_tlb_book3e) + /* Look for the first TLB with IPROT set */ + mfspr r4,SPRN_TLB0CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(0)@h + bne found_iprot + + mfspr r4,SPRN_TLB1CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(1)@h + bne found_iprot + + mfspr r4,SPRN_TLB2CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(2)@h + bne found_iprot + + lis r3,MAS0_TLBSEL(3)@h + mfspr r4,SPRN_TLB3CFG + /* fall through */ + +found_iprot: + andi. r5,r4,TLBnCFG_HES + bne have_hes + + mflr r8 /* save LR */ +/* 1. Find the index of the entry we're executing in + * + * r3 = MAS0_TLBSEL (for the iprot array) + * r4 = SPRN_TLBnCFG + */ + bl invstr /* Find our address */ +invstr: mflr r6 /* Make it accessible */ + mfmsr r7 + rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ + mfspr r7,SPRN_PID + slwi r7,r7,16 + or r7,r7,r5 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* search MSR[IS], SPID=PID */ + + mfspr r3,SPRN_MAS0 + rlwinm r5,r3,16,20,31 /* Extract MAS0(Entry) */ + + mfspr r7,SPRN_MAS1 /* Insure IPROT set */ + oris r7,r7,MAS1_IPROT@h + mtspr SPRN_MAS1,r7 + tlbwe + +/* 2. Invalidate all entries except the entry we're executing in + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in + * r4 = SPRN_TLBnCFG + * r5 = ESEL of entry we are running in + */ + andi. r4,r4,TLBnCFG_N_ENTRY /* Extract # entries */ + li r6,0 /* Set Entry counter to 0 */ +1: mr r7,r3 /* Set MAS0(TLBSEL) */ + rlwimi r7,r6,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r7,SPRN_MAS1 + rlwinm r7,r7,0,2,31 /* Clear MAS1 Valid and IPROT */ + cmpw r5,r6 + beq skpinv /* Dont update the current execution TLB */ + mtspr SPRN_MAS1,r7 + tlbwe + isync +skpinv: addi r6,r6,1 /* Increment */ + cmpw r6,r4 /* Are we done? */ + bne 1b /* If not, repeat */ + + /* Invalidate all TLBs */ + PPC_TLBILX_ALL(0,0) + sync + isync + +/* 3. Setup a temp mapping and jump to it + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in + * r5 = ESEL of entry we are running in + */ + andi. r7,r5,0x1 /* Find an entry not used and is non-zero */ + addi r7,r7,0x1 + mr r4,r3 /* Set MAS0(TLBSEL) = 1 */ + mtspr SPRN_MAS0,r4 + tlbre + + rlwimi r4,r7,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r7) */ + mtspr SPRN_MAS0,r4 + + mfspr r7,SPRN_MAS1 + xori r6,r7,MAS1_TS /* Setup TMP mapping in the other Address space */ + mtspr SPRN_MAS1,r6 + + tlbwe + + mfmsr r6 + xori r6,r6,MSR_IS + mtspr SPRN_SRR1,r6 + bl 1f /* Find our address */ +1: mflr r6 + addi r6,r6,(2f - 1b) + mtspr SPRN_SRR0,r6 + rfi +2: + +/* 4. Clear out PIDs & Search info + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + li r6,0 + mtspr SPRN_MAS6,r6 + mtspr SPRN_PID,r6 + +/* 5. Invalidate mapping we started in + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + mtspr SPRN_MAS0,r3 + tlbre + mfspr r6,SPRN_MAS1 + rlwinm r6,r6,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r6 + tlbwe + + /* Invalidate TLB1 */ + PPC_TLBILX_ALL(0,0) + sync + isync + +/* The mapping only needs to be cache-coherent on SMP */ +#ifdef CONFIG_SMP +#define M_IF_SMP MAS2_M +#else +#define M_IF_SMP 0 +#endif + +/* 6. Setup KERNELBASE mapping in TLB[0] + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + rlwinm r3,r3,0,16,3 /* clear ESEL */ + mtspr SPRN_MAS0,r3 + lis r6,(MAS1_VALID|MAS1_IPROT)@h + ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l + mtspr SPRN_MAS1,r6 + + LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET | M_IF_SMP) + mtspr SPRN_MAS2,r6 + + rlwinm r5,r5,0,0,25 + ori r5,r5,MAS3_SR | MAS3_SW | MAS3_SX + mtspr SPRN_MAS3,r5 + li r5,-1 + rlwinm r5,r5,0,0,25 + + tlbwe + +/* 7. Jump to KERNELBASE mapping + * + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + */ + /* Now we branch the new virtual address mapped by this entry */ + LOAD_REG_IMMEDIATE(r6,2f) + lis r7,MSR_KERNEL@h + ori r7,r7,MSR_KERNEL@l + mtspr SPRN_SRR0,r6 + mtspr SPRN_SRR1,r7 + rfi /* start execution out of TLB1[0] entry */ +2: + +/* 8. Clear out the temp mapping + * + * r4 = MAS0 w/TLBSEL & ESEL for the entry we are running in + */ + mtspr SPRN_MAS0,r4 + tlbre + mfspr r5,SPRN_MAS1 + rlwinm r5,r5,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r5 + tlbwe + + /* Invalidate TLB1 */ + PPC_TLBILX_ALL(0,0) + sync + isync + + /* We translate LR and return */ + tovirt(r8,r8) + mtlr r8 + blr + +have_hes: /* Setup MAS 0,1,2,3 and 7 for tlbwe of a 1G entry that maps the * kernel linear mapping. We also set MAS8 once for all here though * that will have to be made dependent on whether we are running under * a hypervisor I suppose. */ - li r3,MAS0_HES | MAS0_WQ_ALLWAYS + ori r3,r3,MAS0_HES | MAS0_WQ_ALLWAYS mtspr SPRN_MAS0,r3 lis r3,(MAS1_VALID | MAS1_IPROT)@h ori r3,r3,BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT -- cgit v1.2.3-70-g09d2 From fbe65447197789a3ccccc27755956f6a4c445089 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 25 Aug 2009 20:07:11 +0000 Subject: powerpc/pci: move pci_64.c device tree scanning code into pci-common.c The PCI device tree scanning code in pci_64.c is some useful functionality. It allows PCI devices to be described in the device tree instead of being probed for, which in turn allows pci devices to use all of the device tree facilities to describe complex PCI bus architectures like GPIO and IRQ routing (perhaps not a common situation for desktop or server systems, but useful for embedded systems with on-board PCI devices). This patch moves the device tree scanning into pci-common.c so it is available for 32-bit powerpc machines too. Signed-off-by: Grant Likely Acked-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pci-bridge.h | 5 - arch/powerpc/include/asm/pci.h | 5 + arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/pci-common.c | 1 - arch/powerpc/kernel/pci_64.c | 289 --------------------------- arch/powerpc/kernel/pci_of_scan.c | 358 ++++++++++++++++++++++++++++++++++ 6 files changed, 364 insertions(+), 296 deletions(-) create mode 100644 arch/powerpc/kernel/pci_of_scan.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 4c61fa0b8d7..3faf575f6b0 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -284,11 +284,6 @@ static inline int isa_vaddr_is_ioport(void __iomem *address) extern int pcibios_unmap_io_space(struct pci_bus *bus); extern int pcibios_map_io_space(struct pci_bus *bus); -/* Return values for ppc_md.pci_probe_mode function */ -#define PCI_PROBE_NONE -1 /* Don't look at this bus at all */ -#define PCI_PROBE_NORMAL 0 /* Do normal PCI probing */ -#define PCI_PROBE_DEVTREE 1 /* Instantiate from device tree */ - #ifdef CONFIG_NUMA #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE)) #else diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 7ae46d7e270..b856a837b4a 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -22,6 +22,11 @@ #include +/* Return values for ppc_md.pci_probe_mode function */ +#define PCI_PROBE_NONE -1 /* Don't look at this bus at all */ +#define PCI_PROBE_NORMAL 0 /* Do normal PCI probing */ +#define PCI_PROBE_DEVTREE 1 /* Instantiate from device tree */ + #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM 0x10000000 diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 7c83edbc215..569f79ccd31 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -88,7 +88,7 @@ obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \ - pci-common.o + pci-common.o pci_of_scan.o obj-$(CONFIG_PCI_MSI) += msi.o obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o \ machine_kexec_$(CONFIG_WORD_SIZE).o diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 158a78ae634..725ea9144e3 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1617,4 +1617,3 @@ void __devinit pcibios_setup_phb_resources(struct pci_controller *hose) (unsigned long)hose->io_base_virt - _IO_BASE); } - diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 9e8902fa14c..4d5b4ced7e4 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -43,295 +43,6 @@ unsigned long pci_probe_only = 1; unsigned long pci_io_base = ISA_IO_BASE; EXPORT_SYMBOL(pci_io_base); -static u32 get_int_prop(struct device_node *np, const char *name, u32 def) -{ - const u32 *prop; - int len; - - prop = of_get_property(np, name, &len); - if (prop && len >= 4) - return *prop; - return def; -} - -static unsigned int pci_parse_of_flags(u32 addr0, int bridge) -{ - unsigned int flags = 0; - - if (addr0 & 0x02000000) { - flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; - flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; - flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; - if (addr0 & 0x40000000) - flags |= IORESOURCE_PREFETCH - | PCI_BASE_ADDRESS_MEM_PREFETCH; - /* Note: We don't know whether the ROM has been left enabled - * by the firmware or not. We mark it as disabled (ie, we do - * not set the IORESOURCE_ROM_ENABLE flag) for now rather than - * do a config space read, it will be force-enabled if needed - */ - if (!bridge && (addr0 & 0xff) == 0x30) - flags |= IORESOURCE_READONLY; - } else if (addr0 & 0x01000000) - flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; - if (flags) - flags |= IORESOURCE_SIZEALIGN; - return flags; -} - - -static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev) -{ - u64 base, size; - unsigned int flags; - struct resource *res; - const u32 *addrs; - u32 i; - int proplen; - - addrs = of_get_property(node, "assigned-addresses", &proplen); - if (!addrs) - return; - pr_debug(" parse addresses (%d bytes) @ %p\n", proplen, addrs); - for (; proplen >= 20; proplen -= 20, addrs += 5) { - flags = pci_parse_of_flags(addrs[0], 0); - if (!flags) - continue; - base = of_read_number(&addrs[1], 2); - size = of_read_number(&addrs[3], 2); - if (!size) - continue; - i = addrs[0] & 0xff; - pr_debug(" base: %llx, size: %llx, i: %x\n", - (unsigned long long)base, - (unsigned long long)size, i); - - if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { - res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; - } else if (i == dev->rom_base_reg) { - res = &dev->resource[PCI_ROM_RESOURCE]; - flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; - } else { - printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); - continue; - } - res->start = base; - res->end = base + size - 1; - res->flags = flags; - res->name = pci_name(dev); - } -} - -struct pci_dev *of_create_pci_dev(struct device_node *node, - struct pci_bus *bus, int devfn) -{ - struct pci_dev *dev; - const char *type; - - dev = alloc_pci_dev(); - if (!dev) - return NULL; - type = of_get_property(node, "device_type", NULL); - if (type == NULL) - type = ""; - - pr_debug(" create device, devfn: %x, type: %s\n", devfn, type); - - dev->bus = bus; - dev->sysdata = node; - dev->dev.parent = bus->bridge; - dev->dev.bus = &pci_bus_type; - dev->devfn = devfn; - dev->multifunction = 0; /* maybe a lie? */ - - dev->vendor = get_int_prop(node, "vendor-id", 0xffff); - dev->device = get_int_prop(node, "device-id", 0xffff); - dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); - dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); - - dev->cfg_size = pci_cfg_space_size(dev); - - dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(bus), - dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); - dev->class = get_int_prop(node, "class-code", 0); - dev->revision = get_int_prop(node, "revision-id", 0); - - pr_debug(" class: 0x%x\n", dev->class); - pr_debug(" revision: 0x%x\n", dev->revision); - - dev->current_state = 4; /* unknown power state */ - dev->error_state = pci_channel_io_normal; - dev->dma_mask = 0xffffffff; - - if (!strcmp(type, "pci") || !strcmp(type, "pciex")) { - /* a PCI-PCI bridge */ - dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; - dev->rom_base_reg = PCI_ROM_ADDRESS1; - } else if (!strcmp(type, "cardbus")) { - dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; - } else { - dev->hdr_type = PCI_HEADER_TYPE_NORMAL; - dev->rom_base_reg = PCI_ROM_ADDRESS; - /* Maybe do a default OF mapping here */ - dev->irq = NO_IRQ; - } - - pci_parse_of_addrs(node, dev); - - pr_debug(" adding to system ...\n"); - - pci_device_add(dev, bus); - - return dev; -} -EXPORT_SYMBOL(of_create_pci_dev); - -static void __devinit __of_scan_bus(struct device_node *node, - struct pci_bus *bus, int rescan_existing) -{ - struct device_node *child; - const u32 *reg; - int reglen, devfn; - struct pci_dev *dev; - - pr_debug("of_scan_bus(%s) bus no %d... \n", - node->full_name, bus->number); - - /* Scan direct children */ - for_each_child_of_node(node, child) { - pr_debug(" * %s\n", child->full_name); - reg = of_get_property(child, "reg", ®len); - if (reg == NULL || reglen < 20) - continue; - devfn = (reg[0] >> 8) & 0xff; - - /* create a new pci_dev for this device */ - dev = of_create_pci_dev(child, bus, devfn); - if (!dev) - continue; - pr_debug(" dev header type: %x\n", dev->hdr_type); - } - - /* Apply all fixups necessary. We don't fixup the bus "self" - * for an existing bridge that is being rescanned - */ - if (!rescan_existing) - pcibios_setup_bus_self(bus); - pcibios_setup_bus_devices(bus); - - /* Now scan child busses */ - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || - dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { - struct device_node *child = pci_device_to_OF_node(dev); - if (dev) - of_scan_pci_bridge(child, dev); - } - } -} - -void __devinit of_scan_bus(struct device_node *node, - struct pci_bus *bus) -{ - __of_scan_bus(node, bus, 0); -} -EXPORT_SYMBOL_GPL(of_scan_bus); - -void __devinit of_rescan_bus(struct device_node *node, - struct pci_bus *bus) -{ - __of_scan_bus(node, bus, 1); -} -EXPORT_SYMBOL_GPL(of_rescan_bus); - -void __devinit of_scan_pci_bridge(struct device_node *node, - struct pci_dev *dev) -{ - struct pci_bus *bus; - const u32 *busrange, *ranges; - int len, i, mode; - struct resource *res; - unsigned int flags; - u64 size; - - pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); - - /* parse bus-range property */ - busrange = of_get_property(node, "bus-range", &len); - if (busrange == NULL || len != 8) { - printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", - node->full_name); - return; - } - ranges = of_get_property(node, "ranges", &len); - if (ranges == NULL) { - printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", - node->full_name); - return; - } - - bus = pci_add_new_bus(dev->bus, dev, busrange[0]); - if (!bus) { - printk(KERN_ERR "Failed to create pci bus for %s\n", - node->full_name); - return; - } - - bus->primary = dev->bus->number; - bus->subordinate = busrange[1]; - bus->bridge_ctl = 0; - bus->sysdata = node; - - /* parse ranges property */ - /* PCI #address-cells == 3 and #size-cells == 2 always */ - res = &dev->resource[PCI_BRIDGE_RESOURCES]; - for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { - res->flags = 0; - bus->resource[i] = res; - ++res; - } - i = 1; - for (; len >= 32; len -= 32, ranges += 8) { - flags = pci_parse_of_flags(ranges[0], 1); - size = of_read_number(&ranges[6], 2); - if (flags == 0 || size == 0) - continue; - if (flags & IORESOURCE_IO) { - res = bus->resource[0]; - if (res->flags) { - printk(KERN_ERR "PCI: ignoring extra I/O range" - " for bridge %s\n", node->full_name); - continue; - } - } else { - if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { - printk(KERN_ERR "PCI: too many memory ranges" - " for bridge %s\n", node->full_name); - continue; - } - res = bus->resource[i]; - ++i; - } - res->start = of_read_number(&ranges[1], 2); - res->end = res->start + size - 1; - res->flags = flags; - } - sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), - bus->number); - pr_debug(" bus name: %s\n", bus->name); - - mode = PCI_PROBE_NORMAL; - if (ppc_md.pci_probe_mode) - mode = ppc_md.pci_probe_mode(bus); - pr_debug(" probe mode: %d\n", mode); - - if (mode == PCI_PROBE_DEVTREE) - of_scan_bus(node, bus); - else if (mode == PCI_PROBE_NORMAL) - pci_scan_child_bus(bus); -} -EXPORT_SYMBOL(of_scan_pci_bridge); - void __devinit scan_phb(struct pci_controller *hose) { struct pci_bus *bus; diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c new file mode 100644 index 00000000000..72c31bcb7aa --- /dev/null +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -0,0 +1,358 @@ +/* + * Helper routines to scan the device tree for PCI devices and busses + * + * Migrated out of PowerPC architecture pci_64.c file by Grant Likely + * so that these routines are available for + * 32 bit also. + * + * Copyright (C) 2003 Anton Blanchard , IBM + * Rework, based on alpha PCI code. + * Copyright (c) 2009 Secret Lab Technologies Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include +#include +#include + +/** + * get_int_prop - Decode a u32 from a device tree property + */ +static u32 get_int_prop(struct device_node *np, const char *name, u32 def) +{ + const u32 *prop; + int len; + + prop = of_get_property(np, name, &len); + if (prop && len >= 4) + return *prop; + return def; +} + +/** + * pci_parse_of_flags - Parse the flags cell of a device tree PCI address + * @addr0: value of 1st cell of a device tree PCI address. + * @bridge: Set this flag if the address is from a bridge 'ranges' property + */ +unsigned int pci_parse_of_flags(u32 addr0, int bridge) +{ + unsigned int flags = 0; + + if (addr0 & 0x02000000) { + flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; + flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; + flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x40000000) + flags |= IORESOURCE_PREFETCH + | PCI_BASE_ADDRESS_MEM_PREFETCH; + /* Note: We don't know whether the ROM has been left enabled + * by the firmware or not. We mark it as disabled (ie, we do + * not set the IORESOURCE_ROM_ENABLE flag) for now rather than + * do a config space read, it will be force-enabled if needed + */ + if (!bridge && (addr0 & 0xff) == 0x30) + flags |= IORESOURCE_READONLY; + } else if (addr0 & 0x01000000) + flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + if (flags) + flags |= IORESOURCE_SIZEALIGN; + return flags; +} + +/** + * of_pci_parse_addrs - Parse PCI addresses assigned in the device tree node + * @node: device tree node for the PCI device + * @dev: pci_dev structure for the device + * + * This function parses the 'assigned-addresses' property of a PCI devices' + * device tree node and writes them into the associated pci_dev structure. + */ +static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) +{ + u64 base, size; + unsigned int flags; + struct resource *res; + const u32 *addrs; + u32 i; + int proplen; + + addrs = of_get_property(node, "assigned-addresses", &proplen); + if (!addrs) + return; + pr_debug(" parse addresses (%d bytes) @ %p\n", proplen, addrs); + for (; proplen >= 20; proplen -= 20, addrs += 5) { + flags = pci_parse_of_flags(addrs[0], 0); + if (!flags) + continue; + base = of_read_number(&addrs[1], 2); + size = of_read_number(&addrs[3], 2); + if (!size) + continue; + i = addrs[0] & 0xff; + pr_debug(" base: %llx, size: %llx, i: %x\n", + (unsigned long long)base, + (unsigned long long)size, i); + + if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { + res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; + } else if (i == dev->rom_base_reg) { + res = &dev->resource[PCI_ROM_RESOURCE]; + flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + } else { + printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); + continue; + } + res->start = base; + res->end = base + size - 1; + res->flags = flags; + res->name = pci_name(dev); + } +} + +/** + * of_create_pci_dev - Given a device tree node on a pci bus, create a pci_dev + * @node: device tree node pointer + * @bus: bus the device is sitting on + * @devfn: PCI function number, extracted from device tree by caller. + */ +struct pci_dev *of_create_pci_dev(struct device_node *node, + struct pci_bus *bus, int devfn) +{ + struct pci_dev *dev; + const char *type; + + dev = alloc_pci_dev(); + if (!dev) + return NULL; + type = of_get_property(node, "device_type", NULL); + if (type == NULL) + type = ""; + + pr_debug(" create device, devfn: %x, type: %s\n", devfn, type); + + dev->bus = bus; + dev->sysdata = node; + dev->dev.parent = bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->devfn = devfn; + dev->multifunction = 0; /* maybe a lie? */ + + dev->vendor = get_int_prop(node, "vendor-id", 0xffff); + dev->device = get_int_prop(node, "device-id", 0xffff); + dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); + dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); + + dev->cfg_size = pci_cfg_space_size(dev); + + dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(bus), + dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); + dev->class = get_int_prop(node, "class-code", 0); + dev->revision = get_int_prop(node, "revision-id", 0); + + pr_debug(" class: 0x%x\n", dev->class); + pr_debug(" revision: 0x%x\n", dev->revision); + + dev->current_state = 4; /* unknown power state */ + dev->error_state = pci_channel_io_normal; + dev->dma_mask = 0xffffffff; + + if (!strcmp(type, "pci") || !strcmp(type, "pciex")) { + /* a PCI-PCI bridge */ + dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; + dev->rom_base_reg = PCI_ROM_ADDRESS1; + } else if (!strcmp(type, "cardbus")) { + dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; + } else { + dev->hdr_type = PCI_HEADER_TYPE_NORMAL; + dev->rom_base_reg = PCI_ROM_ADDRESS; + /* Maybe do a default OF mapping here */ + dev->irq = NO_IRQ; + } + + of_pci_parse_addrs(node, dev); + + pr_debug(" adding to system ...\n"); + + pci_device_add(dev, bus); + + return dev; +} +EXPORT_SYMBOL(of_create_pci_dev); + +/** + * of_scan_pci_bridge - Set up a PCI bridge and scan for child nodes + * @node: device tree node of bridge + * @dev: pci_dev structure for the bridge + * + * of_scan_bus() calls this routine for each PCI bridge that it finds, and + * this routine in turn call of_scan_bus() recusively to scan for more child + * devices. + */ +void __devinit of_scan_pci_bridge(struct device_node *node, + struct pci_dev *dev) +{ + struct pci_bus *bus; + const u32 *busrange, *ranges; + int len, i, mode; + struct resource *res; + unsigned int flags; + u64 size; + + pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); + + /* parse bus-range property */ + busrange = of_get_property(node, "bus-range", &len); + if (busrange == NULL || len != 8) { + printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", + node->full_name); + return; + } + ranges = of_get_property(node, "ranges", &len); + if (ranges == NULL) { + printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", + node->full_name); + return; + } + + bus = pci_add_new_bus(dev->bus, dev, busrange[0]); + if (!bus) { + printk(KERN_ERR "Failed to create pci bus for %s\n", + node->full_name); + return; + } + + bus->primary = dev->bus->number; + bus->subordinate = busrange[1]; + bus->bridge_ctl = 0; + bus->sysdata = node; + + /* parse ranges property */ + /* PCI #address-cells == 3 and #size-cells == 2 always */ + res = &dev->resource[PCI_BRIDGE_RESOURCES]; + for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { + res->flags = 0; + bus->resource[i] = res; + ++res; + } + i = 1; + for (; len >= 32; len -= 32, ranges += 8) { + flags = pci_parse_of_flags(ranges[0], 1); + size = of_read_number(&ranges[6], 2); + if (flags == 0 || size == 0) + continue; + if (flags & IORESOURCE_IO) { + res = bus->resource[0]; + if (res->flags) { + printk(KERN_ERR "PCI: ignoring extra I/O range" + " for bridge %s\n", node->full_name); + continue; + } + } else { + if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { + printk(KERN_ERR "PCI: too many memory ranges" + " for bridge %s\n", node->full_name); + continue; + } + res = bus->resource[i]; + ++i; + } + res->start = of_read_number(&ranges[1], 2); + res->end = res->start + size - 1; + res->flags = flags; + } + sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), + bus->number); + pr_debug(" bus name: %s\n", bus->name); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + pr_debug(" probe mode: %d\n", mode); + + if (mode == PCI_PROBE_DEVTREE) + of_scan_bus(node, bus); + else if (mode == PCI_PROBE_NORMAL) + pci_scan_child_bus(bus); +} +EXPORT_SYMBOL(of_scan_pci_bridge); + +/** + * __of_scan_bus - given a PCI bus node, setup bus and scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + * @rescan_existing: Flag indicating bus has already been set up + */ +static void __devinit __of_scan_bus(struct device_node *node, + struct pci_bus *bus, int rescan_existing) +{ + struct device_node *child; + const u32 *reg; + int reglen, devfn; + struct pci_dev *dev; + + pr_debug("of_scan_bus(%s) bus no %d... \n", + node->full_name, bus->number); + + /* Scan direct children */ + for_each_child_of_node(node, child) { + pr_debug(" * %s\n", child->full_name); + reg = of_get_property(child, "reg", ®len); + if (reg == NULL || reglen < 20) + continue; + devfn = (reg[0] >> 8) & 0xff; + + /* create a new pci_dev for this device */ + dev = of_create_pci_dev(child, bus, devfn); + if (!dev) + continue; + pr_debug(" dev header type: %x\n", dev->hdr_type); + } + + /* Apply all fixups necessary. We don't fixup the bus "self" + * for an existing bridge that is being rescanned + */ + if (!rescan_existing) + pcibios_setup_bus_self(bus); + pcibios_setup_bus_devices(bus); + + /* Now scan child busses */ + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { + struct device_node *child = pci_device_to_OF_node(dev); + if (dev) + of_scan_pci_bridge(child, dev); + } + } +} + +/** + * of_scan_bus - given a PCI bus node, setup bus and scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + */ +void __devinit of_scan_bus(struct device_node *node, + struct pci_bus *bus) +{ + __of_scan_bus(node, bus, 0); +} +EXPORT_SYMBOL_GPL(of_scan_bus); + +/** + * of_rescan_bus - given a PCI bus node, scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + * + * Same as of_scan_bus, but for a pci_bus structure that has already been + * setup. + */ +void __devinit of_rescan_bus(struct device_node *node, + struct pci_bus *bus) +{ + __of_scan_bus(node, bus, 1); +} +EXPORT_SYMBOL_GPL(of_rescan_bus); + -- cgit v1.2.3-70-g09d2 From 89c2dd62a389c5fed07c4b13c906c43214fc7491 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 25 Aug 2009 16:20:45 +0000 Subject: powerpc/pci: Pull ppc32 PCI features into common Some of the PCI features we have in ppc32 we will need on ppc64 platforms in the future. These include support for: * ppc_md.pci_exclude_device * indirect config cycles * early config cycles We also simplified the logic in fake_pci_bus() to assume it will always get a valid pci_controller. Since all current callers seem to pass it one. Signed-off-by: Kumar Gala Acked-by: Grant Likely Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/machdep.h | 6 +-- arch/powerpc/include/asm/pci-bridge.h | 35 ++++++++--------- arch/powerpc/kernel/pci-common.c | 71 +++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/pci_32.c | 71 ----------------------------------- 4 files changed, 90 insertions(+), 93 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 11d1fc3a896..9efa2be7833 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -209,14 +209,14 @@ struct machdep_calls { /* * optional PCI "hooks" */ - /* Called in indirect_* to avoid touching devices */ - int (*pci_exclude_device)(struct pci_controller *, unsigned char, unsigned char); - /* Called at then very end of pcibios_init() */ void (*pcibios_after_init)(void); #endif /* CONFIG_PPC32 */ + /* Called in indirect_* to avoid touching devices */ + int (*pci_exclude_device)(struct pci_controller *, unsigned char, unsigned char); + /* Called after PPC generic resource fixup to perform machine specific fixups */ void (*pcibios_fixup_resources)(struct pci_dev *); diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 3faf575f6b0..76e1f313a58 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -77,9 +77,7 @@ struct pci_controller { int first_busno; int last_busno; -#ifndef CONFIG_PPC64 int self_busno; -#endif void __iomem *io_base_virt; #ifdef CONFIG_PPC64 @@ -104,7 +102,6 @@ struct pci_controller { unsigned int __iomem *cfg_addr; void __iomem *cfg_data; -#ifndef CONFIG_PPC64 /* * Used for variants of PCI indirect handling and possible quirks: * SET_CFG_TYPE - used on 4xx or any PHB that does explicit type0/1 @@ -128,7 +125,6 @@ struct pci_controller { #define PPC_INDIRECT_TYPE_BIG_ENDIAN 0x00000010 #define PPC_INDIRECT_TYPE_BROKEN_MRM 0x00000020 u32 indirect_type; -#endif /* !CONFIG_PPC64 */ /* Currently, we limit ourselves to 1 IO range and 3 mem * ranges since the common pci_bus structure can't handle more */ @@ -146,21 +142,6 @@ struct pci_controller { #endif /* CONFIG_PPC64 */ }; -#ifndef CONFIG_PPC64 - -static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) -{ - return bus->sysdata; -} - -static inline int isa_vaddr_is_ioport(void __iomem *address) -{ - /* No specific ISA handling on ppc32 at this stage, it - * all goes through PCI - */ - return 0; -} - /* These are used for config access before all the PCI probing has been done. */ extern int early_read_config_byte(struct pci_controller *hose, int bus, @@ -182,6 +163,22 @@ extern int early_find_capability(struct pci_controller *hose, int bus, extern void setup_indirect_pci(struct pci_controller* hose, resource_size_t cfg_addr, resource_size_t cfg_data, u32 flags); + +#ifndef CONFIG_PPC64 + +static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) +{ + return bus->sysdata; +} + +static inline int isa_vaddr_is_ioport(void __iomem *address) +{ + /* No specific ISA handling on ppc32 at this stage, it + * all goes through PCI + */ + return 0; +} + #else /* CONFIG_PPC64 */ /* diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 725ea9144e3..8f84a9a8428 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1617,3 +1617,74 @@ void __devinit pcibios_setup_phb_resources(struct pci_controller *hose) (unsigned long)hose->io_base_virt - _IO_BASE); } + +/* + * Null PCI config access functions, for the case when we can't + * find a hose. + */ +#define NULL_PCI_OP(rw, size, type) \ +static int \ +null_##rw##_config_##size(struct pci_dev *dev, int offset, type val) \ +{ \ + return PCIBIOS_DEVICE_NOT_FOUND; \ +} + +static int +null_read_config(struct pci_bus *bus, unsigned int devfn, int offset, + int len, u32 *val) +{ + return PCIBIOS_DEVICE_NOT_FOUND; +} + +static int +null_write_config(struct pci_bus *bus, unsigned int devfn, int offset, + int len, u32 val) +{ + return PCIBIOS_DEVICE_NOT_FOUND; +} + +static struct pci_ops null_pci_ops = +{ + .read = null_read_config, + .write = null_write_config, +}; + +/* + * These functions are used early on before PCI scanning is done + * and all of the pci_dev and pci_bus structures have been created. + */ +static struct pci_bus * +fake_pci_bus(struct pci_controller *hose, int busnr) +{ + static struct pci_bus bus; + + if (hose == 0) { + printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr); + } + bus.number = busnr; + bus.sysdata = hose; + bus.ops = hose? hose->ops: &null_pci_ops; + return &bus; +} + +#define EARLY_PCI_OP(rw, size, type) \ +int early_##rw##_config_##size(struct pci_controller *hose, int bus, \ + int devfn, int offset, type value) \ +{ \ + return pci_bus_##rw##_config_##size(fake_pci_bus(hose, bus), \ + devfn, offset, value); \ +} + +EARLY_PCI_OP(read, byte, u8 *) +EARLY_PCI_OP(read, word, u16 *) +EARLY_PCI_OP(read, dword, u32 *) +EARLY_PCI_OP(write, byte, u8) +EARLY_PCI_OP(write, word, u16) +EARLY_PCI_OP(write, dword, u32) + +extern int pci_bus_find_capability (struct pci_bus *bus, unsigned int devfn, int cap); +int early_find_capability(struct pci_controller *hose, int bus, int devfn, + int cap) +{ + return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); +} diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 1e807fe7ad2..8cf15d961c3 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -469,75 +469,4 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn) return result; } -/* - * Null PCI config access functions, for the case when we can't - * find a hose. - */ -#define NULL_PCI_OP(rw, size, type) \ -static int \ -null_##rw##_config_##size(struct pci_dev *dev, int offset, type val) \ -{ \ - return PCIBIOS_DEVICE_NOT_FOUND; \ -} - -static int -null_read_config(struct pci_bus *bus, unsigned int devfn, int offset, - int len, u32 *val) -{ - return PCIBIOS_DEVICE_NOT_FOUND; -} - -static int -null_write_config(struct pci_bus *bus, unsigned int devfn, int offset, - int len, u32 val) -{ - return PCIBIOS_DEVICE_NOT_FOUND; -} - -static struct pci_ops null_pci_ops = -{ - .read = null_read_config, - .write = null_write_config, -}; -/* - * These functions are used early on before PCI scanning is done - * and all of the pci_dev and pci_bus structures have been created. - */ -static struct pci_bus * -fake_pci_bus(struct pci_controller *hose, int busnr) -{ - static struct pci_bus bus; - - if (hose == 0) { - hose = pci_bus_to_hose(busnr); - if (hose == 0) - printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr); - } - bus.number = busnr; - bus.sysdata = hose; - bus.ops = hose? hose->ops: &null_pci_ops; - return &bus; -} - -#define EARLY_PCI_OP(rw, size, type) \ -int early_##rw##_config_##size(struct pci_controller *hose, int bus, \ - int devfn, int offset, type value) \ -{ \ - return pci_bus_##rw##_config_##size(fake_pci_bus(hose, bus), \ - devfn, offset, value); \ -} - -EARLY_PCI_OP(read, byte, u8 *) -EARLY_PCI_OP(read, word, u16 *) -EARLY_PCI_OP(read, dword, u32 *) -EARLY_PCI_OP(write, byte, u8) -EARLY_PCI_OP(write, word, u16) -EARLY_PCI_OP(write, dword, u32) - -extern int pci_bus_find_capability (struct pci_bus *bus, unsigned int devfn, int cap); -int early_find_capability(struct pci_controller *hose, int bus, int devfn, - int cap) -{ - return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); -} -- cgit v1.2.3-70-g09d2 From e5a6a1c9094839581242c678b11c93c294108696 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 13 Aug 2009 09:37:04 +0000 Subject: powerpc: derive COMMAND_LINE_SIZE from asm-generic The default COMMAND_LINE_SIZE in asm-generic is 512, so the net effect of this change is nil, aside from the cleanup factor. See also commit 2b74b8569. Signed-off-by: Paul Gortmaker Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 817fac0a071..dae19342f0b 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -1,6 +1,6 @@ #ifndef _ASM_POWERPC_SETUP_H #define _ASM_POWERPC_SETUP_H -#define COMMAND_LINE_SIZE 512 +#include #endif /* _ASM_POWERPC_SETUP_H */ -- cgit v1.2.3-70-g09d2 From 0ed2c722c650513ba4bce868c7a052e576c060e2 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 28 Aug 2009 08:58:16 +0000 Subject: powerpc/pci: Merge ppc32 and ppc64 versions of phb_scan() The two versions are doing almost exactly the same thing. No need to maintain them as separate files. This patch also has the side effect of making the PCI device tree scanning code available to 32 bit powerpc machines, but no board ports actually make use of this feature at this point. Signed-off-by: Grant Likely Acked-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pci.h | 2 ++ arch/powerpc/include/asm/ppc-pci.h | 1 - arch/powerpc/kernel/of_platform.c | 2 +- arch/powerpc/kernel/pci-common.c | 49 ++++++++++++++++++++++++++++++ arch/powerpc/kernel/pci_32.c | 25 ++------------- arch/powerpc/kernel/pci_64.c | 46 ++++------------------------ arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- 7 files changed, 61 insertions(+), 66 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index b856a837b4a..7aca4839387 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -233,6 +233,8 @@ extern void pci_resource_to_user(const struct pci_dev *dev, int bar, extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); +extern void pcibios_setup_phb_io_space(struct pci_controller *hose); +extern void pcibios_scan_phb(struct pci_controller *hose, void *sysdata); #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_PCI_H */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index 854ab713f56..2828f9d0f66 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -39,7 +39,6 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, extern void pci_devs_phb_init(void); extern void pci_devs_phb_init_dynamic(struct pci_controller *phb); -extern void scan_phb(struct pci_controller *hose); /* From rtas_pci.h */ extern void init_pci_config_tokens (void); diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 87df428e358..1a4fc0d11a0 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -276,7 +276,7 @@ static int __devinit of_pci_phb_probe(struct of_device *dev, #endif /* CONFIG_EEH */ /* Scan the bus */ - scan_phb(phb); + pcibios_scan_phb(phb, dev->node); if (phb->bus == NULL) return -ENXIO; diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 8f84a9a8428..e9f4840096b 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1688,3 +1688,52 @@ int early_find_capability(struct pci_controller *hose, int bus, int devfn, { return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); } + +/** + * pci_scan_phb - Given a pci_controller, setup and scan the PCI bus + * @hose: Pointer to the PCI host controller instance structure + * @sysdata: value to use for sysdata pointer. ppc32 and ppc64 differ here + * + * Note: the 'data' pointer is a temporary measure. As 32 and 64 bit + * pci code gets merged, this parameter should become unnecessary because + * both will use the same value. + */ +void __devinit pcibios_scan_phb(struct pci_controller *hose, void *sysdata) +{ + struct pci_bus *bus; + struct device_node *node = hose->dn; + int mode; + + pr_debug("PCI: Scanning PHB %s\n", + node ? node->full_name : ""); + + /* Create an empty bus for the toplevel */ + bus = pci_create_bus(hose->parent, hose->first_busno, hose->ops, + sysdata); + if (bus == NULL) { + pr_err("Failed to create bus for PCI domain %04x\n", + hose->global_number); + return; + } + bus->secondary = hose->first_busno; + hose->bus = bus; + + /* Get some IO space for the new PHB */ + pcibios_setup_phb_io_space(hose); + + /* Wire up PHB bus resources */ + pcibios_setup_phb_resources(hose); + + /* Get probe mode and perform scan */ + mode = PCI_PROBE_NORMAL; + if (node && ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + pr_debug(" probe mode: %d\n", mode); + if (mode == PCI_PROBE_DEVTREE) { + bus->subordinate = hose->last_busno; + of_scan_bus(node, bus); + } + + if (mode == PCI_PROBE_NORMAL) + hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); +} diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 8cf15d961c3..c13668cf36d 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -354,36 +354,15 @@ pci_create_OF_bus_map(void) } } -static void __devinit pcibios_scan_phb(struct pci_controller *hose) +void __devinit pcibios_setup_phb_io_space(struct pci_controller *hose) { - struct pci_bus *bus; - struct device_node *node = hose->dn; unsigned long io_offset; struct resource *res = &hose->io_resource; - pr_debug("PCI: Scanning PHB %s\n", - node ? node->full_name : ""); - - /* Create an empty bus for the toplevel */ - bus = pci_create_bus(hose->parent, hose->first_busno, hose->ops, hose); - if (bus == NULL) { - printk(KERN_ERR "Failed to create bus for PCI domain %04x\n", - hose->global_number); - return; - } - bus->secondary = hose->first_busno; - hose->bus = bus; - /* Fixup IO space offset */ io_offset = (unsigned long)hose->io_base_virt - isa_io_base; res->start = (res->start + io_offset) & 0xffffffffu; res->end = (res->end + io_offset) & 0xffffffffu; - - /* Wire up PHB bus resources */ - pcibios_setup_phb_resources(hose); - - /* Scan children */ - hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); } static int __init pcibios_init(void) @@ -401,7 +380,7 @@ static int __init pcibios_init(void) if (pci_assign_all_buses) hose->first_busno = next_busno; hose->last_busno = 0xff; - pcibios_scan_phb(hose); + pcibios_scan_phb(hose, hose); pci_bus_add_devices(hose->bus); if (pci_assign_all_buses || next_busno <= hose->last_busno) next_busno = hose->last_busno + pcibios_assign_bus_offset; diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 4d5b4ced7e4..ba949a2c93a 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -43,45 +43,6 @@ unsigned long pci_probe_only = 1; unsigned long pci_io_base = ISA_IO_BASE; EXPORT_SYMBOL(pci_io_base); -void __devinit scan_phb(struct pci_controller *hose) -{ - struct pci_bus *bus; - struct device_node *node = hose->dn; - int mode; - - pr_debug("PCI: Scanning PHB %s\n", - node ? node->full_name : ""); - - /* Create an empty bus for the toplevel */ - bus = pci_create_bus(hose->parent, hose->first_busno, hose->ops, node); - if (bus == NULL) { - printk(KERN_ERR "Failed to create bus for PCI domain %04x\n", - hose->global_number); - return; - } - bus->secondary = hose->first_busno; - hose->bus = bus; - - /* Get some IO space for the new PHB */ - pcibios_map_io_space(bus); - - /* Wire up PHB bus resources */ - pcibios_setup_phb_resources(hose); - - /* Get probe mode and perform scan */ - mode = PCI_PROBE_NORMAL; - if (node && ppc_md.pci_probe_mode) - mode = ppc_md.pci_probe_mode(bus); - pr_debug(" probe mode: %d\n", mode); - if (mode == PCI_PROBE_DEVTREE) { - bus->subordinate = hose->last_busno; - of_scan_bus(node, bus); - } - - if (mode == PCI_PROBE_NORMAL) - hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); -} - static int __init pcibios_init(void) { struct pci_controller *hose, *tmp; @@ -103,7 +64,7 @@ static int __init pcibios_init(void) /* Scan all of the recorded PCI controllers. */ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - scan_phb(hose); + pcibios_scan_phb(hose, hose->dn); pci_bus_add_devices(hose->bus); } @@ -237,6 +198,11 @@ int __devinit pcibios_map_io_space(struct pci_bus *bus) } EXPORT_SYMBOL_GPL(pcibios_map_io_space); +void __devinit pcibios_setup_phb_io_space(struct pci_controller *hose) +{ + pcibios_map_io_space(hose->bus); +} + #define IOBASE_BRIDGE_NUMBER 0 #define IOBASE_MEMORY 1 #define IOBASE_IO 2 diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index ad152a0e394..b6fa3e4b51b 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -151,7 +151,7 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) if (dn->child) eeh_add_device_tree_early(dn); - scan_phb(phb); + pcibios_scan_phb(phb, dn); pcibios_finish_adding_to_bus(phb->bus); return phb; -- cgit v1.2.3-70-g09d2 From 46db2f86a3b2a94e0b33e0b4548fb7b7b6bdff66 Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 28 Aug 2009 12:06:29 +0000 Subject: powerpc/pseries: Fix to handle slb resize across migration The SLB can change sizes across a live migration, which was not being handled, resulting in possible machine crashes during migration if migrating to a machine which has a smaller max SLB size than the source machine. Fix this by first reducing the SLB size to the minimum possible value, which is 32, prior to migration. Then during the device tree update which occurs after migration, we make the call to ensure the SLB gets updated. Also add the slb_size to the lparcfg output so that the migration tools can check to make sure the kernel has this capability before allowing migration in scenarios where the SLB size will change. BenH: Fixed #include -> to avoid breaking ppc32 build Signed-off-by: Brian King Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-hash64.h | 2 ++ arch/powerpc/kernel/lparcfg.c | 3 +++ arch/powerpc/kernel/rtas.c | 7 ++++++- arch/powerpc/mm/slb.c | 16 ++++++++++++---- arch/powerpc/platforms/pseries/reconfig.c | 9 ++++++++- 5 files changed, 31 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index b537903b9fc..bebe31c2e90 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -41,6 +41,7 @@ extern char initial_stab[]; #define SLB_NUM_BOLTED 3 #define SLB_CACHE_ENTRIES 8 +#define SLB_MIN_SIZE 32 /* Bits in the SLB ESID word */ #define SLB_ESID_V ASM_CONST(0x0000000008000000) /* valid */ @@ -276,6 +277,7 @@ extern void slb_flush_and_rebolt(void); extern void stab_initialize(unsigned long stab); extern void slb_vmalloc_update(void); +extern void slb_set_size(u16 size); #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index 2419cc706ff..ed0ac4e4b8d 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -35,6 +35,7 @@ #include #include #include +#include #define MODULE_VERS "1.8" #define MODULE_NAME "lparcfg" @@ -537,6 +538,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca[0].shared_proc); + seq_printf(m, "slb_size=%d\n", mmu_slb_size); + return 0; } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index c434823b8c8..bf90361bb70 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -39,6 +39,7 @@ #include #include #include +#include struct rtas_t rtas = { .lock = __RAW_SPIN_LOCK_UNLOCKED @@ -713,6 +714,7 @@ static void rtas_percpu_suspend_me(void *info) { long rc = H_SUCCESS; unsigned long msr_save; + u16 slb_size = mmu_slb_size; int cpu; struct rtas_suspend_me_data *data = (struct rtas_suspend_me_data *)info; @@ -735,13 +737,16 @@ static void rtas_percpu_suspend_me(void *info) /* All other cpus are in H_JOIN, this cpu does * the suspend. */ + slb_set_size(SLB_MIN_SIZE); printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", smp_processor_id()); data->error = rtas_call(data->token, 0, 1, NULL); - if (data->error) + if (data->error) { printk(KERN_DEBUG "ibm,suspend-me returned %d\n", data->error); + slb_set_size(slb_size); + } } else { printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n", smp_processor_id(), rc); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 07961c5c169..1d98ecc8eec 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -249,14 +249,22 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) static inline void patch_slb_encoding(unsigned int *insn_addr, unsigned int immed) { - /* Assume the instruction had a "0" immediate value, just - * "or" in the new value - */ - *insn_addr |= immed; + *insn_addr = (*insn_addr & 0xffff0000) | immed; flush_icache_range((unsigned long)insn_addr, 4+ (unsigned long)insn_addr); } +void slb_set_size(u16 size) +{ + extern unsigned int *slb_compare_rr_to_size; + + if (mmu_slb_size == size) + return; + + mmu_slb_size = size; + patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); +} + void slb_initialize(void) { unsigned long linear_llp, vmalloc_llp, io_llp; diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index b6f1b137d42..2e2bbe120b9 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -20,6 +20,7 @@ #include #include #include +#include @@ -439,9 +440,15 @@ static int do_update_property(char *buf, size_t bufsize) if (!newprop) return -ENOMEM; + if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size")) + slb_set_size(*(int *)value); + oldprop = of_find_property(np, name,NULL); - if (!oldprop) + if (!oldprop) { + if (strlen(name)) + return prom_add_property(np, newprop); return -ENODEV; + } rc = prom_update_property(np, newprop, oldprop); if (rc) -- cgit v1.2.3-70-g09d2 From 1d5d9527d8ed8d87beb22a4fd954366aeabd12c7 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 1 Sep 2009 15:43:54 +0000 Subject: powerpc/book3e: Add missing page sizes Add defines for the other page sizes. Even if HW doesn't support them we made them use them for hugetlbfs support. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pte-book3e.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 9800565aebb..b82b9dc91a7 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -20,9 +20,19 @@ #define _PAGE_BAP_UX 0x000080 #define _PAGE_PSIZE_MSK 0x000f00 #define _PAGE_PSIZE_4K 0x000200 +#define _PAGE_PSIZE_8K 0x000300 +#define _PAGE_PSIZE_16K 0x000400 +#define _PAGE_PSIZE_32K 0x000500 #define _PAGE_PSIZE_64K 0x000600 +#define _PAGE_PSIZE_128K 0x000700 +#define _PAGE_PSIZE_256K 0x000800 +#define _PAGE_PSIZE_512K 0x000900 #define _PAGE_PSIZE_1M 0x000a00 +#define _PAGE_PSIZE_2M 0x000b00 +#define _PAGE_PSIZE_4M 0x000c00 +#define _PAGE_PSIZE_8M 0x000d00 #define _PAGE_PSIZE_16M 0x000e00 +#define _PAGE_PSIZE_32M 0x000f00 #define _PAGE_DIRTY 0x001000 /* C: page changed */ #define _PAGE_SW0 0x002000 #define _PAGE_U3 0x004000 -- cgit v1.2.3-70-g09d2 From 76acc2c1a7a9a8c2cae7e9cf8d0a8b374a48aa94 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 1 Sep 2009 15:48:42 +0000 Subject: powerpc/fsl-booke: Use HW PTE format if CONFIG_PTE_64BIT Switch to using the Power ISA defined PTE format when we have a 64-bit PTE. This makes the code handling between fsl-booke and book3e-64 similiar for TLB faults. Additionally this lets use take advantage of the page size encodings and full permissions that the HW PTE defines. Also defined _PMD_PRESENT, _PMD_PRESENT_MASK, and _PMD_BAD since the 32-bit ppc arch code expects them. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc32.h | 2 ++ arch/powerpc/include/asm/pte-book3e.h | 3 +++ arch/powerpc/include/asm/pte-fsl-booke.h | 7 ------- arch/powerpc/kernel/head_fsl_booke.S | 36 ++++++++++++++++++++++---------- 4 files changed, 30 insertions(+), 18 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index f2c52e25395..55646adfa84 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -111,6 +111,8 @@ extern int icache_44x_need_flush; #include #elif defined(CONFIG_44x) #include +#elif defined(CONFIG_FSL_BOOKE) && defined(CONFIG_PTE_64BIT) +#include #elif defined(CONFIG_FSL_BOOKE) #include #elif defined(CONFIG_8xx) diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index b82b9dc91a7..082d515930a 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -75,6 +75,9 @@ /* On 32-bit, we never clear the top part of the PTE */ #ifdef CONFIG_PPC32 #define _PTE_NONE_MASK 0xffffffff00000000ULL +#define _PMD_PRESENT 0 +#define _PMD_PRESENT_MASK (PAGE_MASK) +#define _PMD_BAD (~PAGE_MASK) #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pte-fsl-booke.h b/arch/powerpc/include/asm/pte-fsl-booke.h index ce8a9e94ce7..2c12be5f677 100644 --- a/arch/powerpc/include/asm/pte-fsl-booke.h +++ b/arch/powerpc/include/asm/pte-fsl-booke.h @@ -33,13 +33,6 @@ #define _PAGE_WRITETHRU 0x00400 /* H: W bit */ #define _PAGE_SPECIAL 0x00800 /* S: Special page */ -#ifdef CONFIG_PTE_64BIT -/* ERPN in a PTE never gets cleared, ignore it */ -#define _PTE_NONE_MASK 0xffffffffffff0000ULL -/* We extend the size of the PTE flags area when using 64-bit PTEs */ -#define PTE_RPN_SHIFT (PAGE_SHIFT + 8) -#endif - #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 2c5af525647..975788ca05d 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -575,7 +575,12 @@ interrupt_base: * place or can we save a couple of instructions here ? */ mfspr r12,SPRN_ESR +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT + oris r13,r13,_PAGE_ACCESSED@h +#else li r13,_PAGE_PRESENT|_PAGE_ACCESSED +#endif rlwimi r13,r12,11,29,29 FIND_PTE @@ -643,7 +648,12 @@ interrupt_base: 4: /* Make up the required permissions */ +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT | _PAGE_EXEC + oris r13,r13,_PAGE_ACCESSED@h +#else li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#endif FIND_PTE andc. r13,r13,r11 /* Check permission */ @@ -733,7 +743,7 @@ finish_tlb_load: mfspr r12, SPRN_MAS2 #ifdef CONFIG_PTE_64BIT - rlwimi r12, r11, 26, 24, 31 /* extract ...WIMGE from pte */ + rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ #else rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ #endif @@ -742,6 +752,20 @@ finish_tlb_load: #endif mtspr SPRN_MAS2, r12 +#ifdef CONFIG_PTE_64BIT + rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */ + andi. r10, r11, _PAGE_DIRTY + bne 1f + li r10, MAS3_SW | MAS3_UW + andc r12, r12, r10 +1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */ + rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */ + mtspr SPRN_MAS3, r12 +BEGIN_MMU_FTR_SECTION + srwi r10, r13, 12 /* grab RPN[12:31] */ + mtspr SPRN_MAS7, r10 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) +#else li r10, (_PAGE_EXEC | _PAGE_PRESENT) rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ and r12, r11, r10 @@ -749,16 +773,6 @@ finish_tlb_load: slwi r10, r12, 1 or r10, r10, r12 iseleq r12, r12, r10 - -#ifdef CONFIG_PTE_64BIT - rlwimi r12, r13, 24, 0, 7 /* grab RPN[32:39] */ - rlwimi r12, r11, 24, 8, 19 /* grab RPN[40:51] */ - mtspr SPRN_MAS3, r12 -BEGIN_MMU_FTR_SECTION - srwi r10, r13, 8 /* grab RPN[8:31] */ - mtspr SPRN_MAS7, r10 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) -#else rlwimi r11, r12, 0, 20, 31 /* Extract RPN from PTE and merge with perms */ mtspr SPRN_MAS3, r11 #endif -- cgit v1.2.3-70-g09d2 From bbea0b6e0d214ef1511b9c6ccf3af26b38f0af7d Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Tue, 8 Sep 2009 17:53:04 -0700 Subject: fsldma: Add DMA_SLAVE support Use the DMA_SLAVE capability of the DMAEngine API to copy/from a scatterlist into an arbitrary list of hardware address/length pairs. This allows a single DMA transaction to copy data from several different devices into a scatterlist at the same time. This also adds support to enable some controller-specific features such as external start and external pause for a DMA transaction. [dan.j.williams@intel.com: rebased on tx_list movement] Signed-off-by: Ira W. Snyder Acked-by: Li Yang Acked-by: Kumar Gala Signed-off-by: Dan Williams --- arch/powerpc/include/asm/fsldma.h | 136 +++++++++++++++++++++++ drivers/dma/fsldma.c | 227 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 363 insertions(+) create mode 100644 arch/powerpc/include/asm/fsldma.h (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/fsldma.h b/arch/powerpc/include/asm/fsldma.h new file mode 100644 index 00000000000..a67aeed17d4 --- /dev/null +++ b/arch/powerpc/include/asm/fsldma.h @@ -0,0 +1,136 @@ +/* + * Freescale MPC83XX / MPC85XX DMA Controller + * + * Copyright (c) 2009 Ira W. Snyder + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#ifndef __ARCH_POWERPC_ASM_FSLDMA_H__ +#define __ARCH_POWERPC_ASM_FSLDMA_H__ + +#include + +/* + * Definitions for the Freescale DMA controller's DMA_SLAVE implemention + * + * The Freescale DMA_SLAVE implementation was designed to handle many-to-many + * transfers. An example usage would be an accelerated copy between two + * scatterlists. Another example use would be an accelerated copy from + * multiple non-contiguous device buffers into a single scatterlist. + * + * A DMA_SLAVE transaction is defined by a struct fsl_dma_slave. This + * structure contains a list of hardware addresses that should be copied + * to/from the scatterlist passed into device_prep_slave_sg(). The structure + * also has some fields to enable hardware-specific features. + */ + +/** + * struct fsl_dma_hw_addr + * @entry: linked list entry + * @address: the hardware address + * @length: length to transfer + * + * Holds a single physical hardware address / length pair for use + * with the DMAEngine DMA_SLAVE API. + */ +struct fsl_dma_hw_addr { + struct list_head entry; + + dma_addr_t address; + size_t length; +}; + +/** + * struct fsl_dma_slave + * @addresses: a linked list of struct fsl_dma_hw_addr structures + * @request_count: value for DMA request count + * @src_loop_size: setup and enable constant source-address DMA transfers + * @dst_loop_size: setup and enable constant destination address DMA transfers + * @external_start: enable externally started DMA transfers + * @external_pause: enable externally paused DMA transfers + * + * Holds a list of address / length pairs for use with the DMAEngine + * DMA_SLAVE API implementation for the Freescale DMA controller. + */ +struct fsl_dma_slave { + + /* List of hardware address/length pairs */ + struct list_head addresses; + + /* Support for extra controller features */ + unsigned int request_count; + unsigned int src_loop_size; + unsigned int dst_loop_size; + bool external_start; + bool external_pause; +}; + +/** + * fsl_dma_slave_append - add an address/length pair to a struct fsl_dma_slave + * @slave: the &struct fsl_dma_slave to add to + * @address: the hardware address to add + * @length: the length of bytes to transfer from @address + * + * Add a hardware address/length pair to a struct fsl_dma_slave. Returns 0 on + * success, -ERRNO otherwise. + */ +static inline int fsl_dma_slave_append(struct fsl_dma_slave *slave, + dma_addr_t address, size_t length) +{ + struct fsl_dma_hw_addr *addr; + + addr = kzalloc(sizeof(*addr), GFP_ATOMIC); + if (!addr) + return -ENOMEM; + + INIT_LIST_HEAD(&addr->entry); + addr->address = address; + addr->length = length; + + list_add_tail(&addr->entry, &slave->addresses); + return 0; +} + +/** + * fsl_dma_slave_free - free a struct fsl_dma_slave + * @slave: the struct fsl_dma_slave to free + * + * Free a struct fsl_dma_slave and all associated address/length pairs + */ +static inline void fsl_dma_slave_free(struct fsl_dma_slave *slave) +{ + struct fsl_dma_hw_addr *addr, *tmp; + + if (slave) { + list_for_each_entry_safe(addr, tmp, &slave->addresses, entry) { + list_del(&addr->entry); + kfree(addr); + } + + kfree(slave); + } +} + +/** + * fsl_dma_slave_alloc - allocate a struct fsl_dma_slave + * @gfp: the flags to pass to kmalloc when allocating this structure + * + * Allocate a struct fsl_dma_slave for use by the DMA_SLAVE API. Returns a new + * struct fsl_dma_slave on success, or NULL on failure. + */ +static inline struct fsl_dma_slave *fsl_dma_slave_alloc(gfp_t gfp) +{ + struct fsl_dma_slave *slave; + + slave = kzalloc(sizeof(*slave), gfp); + if (!slave) + return NULL; + + INIT_LIST_HEAD(&slave->addresses); + return slave; +} + +#endif /* __ARCH_POWERPC_ASM_FSLDMA_H__ */ diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 7a0cb6064f8..296f9e747fa 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -34,6 +34,7 @@ #include #include +#include #include "fsldma.h" static void dma_init(struct fsl_dma_chan *fsl_chan) @@ -551,6 +552,229 @@ fail: return NULL; } +/** + * fsl_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction + * @chan: DMA channel + * @sgl: scatterlist to transfer to/from + * @sg_len: number of entries in @scatterlist + * @direction: DMA direction + * @flags: DMAEngine flags + * + * Prepare a set of descriptors for a DMA_SLAVE transaction. Following the + * DMA_SLAVE API, this gets the device-specific information from the + * chan->private variable. + */ +static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg( + struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, + enum dma_data_direction direction, unsigned long flags) +{ + struct fsl_dma_chan *fsl_chan; + struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL; + struct fsl_dma_slave *slave; + struct list_head *tx_list; + size_t copy; + + int i; + struct scatterlist *sg; + size_t sg_used; + size_t hw_used; + struct fsl_dma_hw_addr *hw; + dma_addr_t dma_dst, dma_src; + + if (!chan) + return NULL; + + if (!chan->private) + return NULL; + + fsl_chan = to_fsl_chan(chan); + slave = chan->private; + + if (list_empty(&slave->addresses)) + return NULL; + + hw = list_first_entry(&slave->addresses, struct fsl_dma_hw_addr, entry); + hw_used = 0; + + /* + * Build the hardware transaction to copy from the scatterlist to + * the hardware, or from the hardware to the scatterlist + * + * If you are copying from the hardware to the scatterlist and it + * takes two hardware entries to fill an entire page, then both + * hardware entries will be coalesced into the same page + * + * If you are copying from the scatterlist to the hardware and a + * single page can fill two hardware entries, then the data will + * be read out of the page into the first hardware entry, and so on + */ + for_each_sg(sgl, sg, sg_len, i) { + sg_used = 0; + + /* Loop until the entire scatterlist entry is used */ + while (sg_used < sg_dma_len(sg)) { + + /* + * If we've used up the current hardware address/length + * pair, we need to load a new one + * + * This is done in a while loop so that descriptors with + * length == 0 will be skipped + */ + while (hw_used >= hw->length) { + + /* + * If the current hardware entry is the last + * entry in the list, we're finished + */ + if (list_is_last(&hw->entry, &slave->addresses)) + goto finished; + + /* Get the next hardware address/length pair */ + hw = list_entry(hw->entry.next, + struct fsl_dma_hw_addr, entry); + hw_used = 0; + } + + /* Allocate the link descriptor from DMA pool */ + new = fsl_dma_alloc_descriptor(fsl_chan); + if (!new) { + dev_err(fsl_chan->dev, "No free memory for " + "link descriptor\n"); + goto fail; + } +#ifdef FSL_DMA_LD_DEBUG + dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new); +#endif + + /* + * Calculate the maximum number of bytes to transfer, + * making sure it is less than the DMA controller limit + */ + copy = min_t(size_t, sg_dma_len(sg) - sg_used, + hw->length - hw_used); + copy = min_t(size_t, copy, FSL_DMA_BCR_MAX_CNT); + + /* + * DMA_FROM_DEVICE + * from the hardware to the scatterlist + * + * DMA_TO_DEVICE + * from the scatterlist to the hardware + */ + if (direction == DMA_FROM_DEVICE) { + dma_src = hw->address + hw_used; + dma_dst = sg_dma_address(sg) + sg_used; + } else { + dma_src = sg_dma_address(sg) + sg_used; + dma_dst = hw->address + hw_used; + } + + /* Fill in the descriptor */ + set_desc_cnt(fsl_chan, &new->hw, copy); + set_desc_src(fsl_chan, &new->hw, dma_src); + set_desc_dest(fsl_chan, &new->hw, dma_dst); + + /* + * If this is not the first descriptor, chain the + * current descriptor after the previous descriptor + */ + if (!first) { + first = new; + } else { + set_desc_next(fsl_chan, &prev->hw, + new->async_tx.phys); + } + + new->async_tx.cookie = 0; + async_tx_ack(&new->async_tx); + + prev = new; + sg_used += copy; + hw_used += copy; + + /* Insert the link descriptor into the LD ring */ + list_add_tail(&new->node, &first->tx_list); + } + } + +finished: + + /* All of the hardware address/length pairs had length == 0 */ + if (!first || !new) + return NULL; + + new->async_tx.flags = flags; + new->async_tx.cookie = -EBUSY; + + /* Set End-of-link to the last link descriptor of new list */ + set_ld_eol(fsl_chan, new); + + /* Enable extra controller features */ + if (fsl_chan->set_src_loop_size) + fsl_chan->set_src_loop_size(fsl_chan, slave->src_loop_size); + + if (fsl_chan->set_dest_loop_size) + fsl_chan->set_dest_loop_size(fsl_chan, slave->dst_loop_size); + + if (fsl_chan->toggle_ext_start) + fsl_chan->toggle_ext_start(fsl_chan, slave->external_start); + + if (fsl_chan->toggle_ext_pause) + fsl_chan->toggle_ext_pause(fsl_chan, slave->external_pause); + + if (fsl_chan->set_request_count) + fsl_chan->set_request_count(fsl_chan, slave->request_count); + + return &first->async_tx; + +fail: + /* If first was not set, then we failed to allocate the very first + * descriptor, and we're done */ + if (!first) + return NULL; + + /* + * First is set, so all of the descriptors we allocated have been added + * to first->tx_list, INCLUDING "first" itself. Therefore we + * must traverse the list backwards freeing each descriptor in turn + * + * We're re-using variables for the loop, oh well + */ + tx_list = &first->tx_list; + list_for_each_entry_safe_reverse(new, prev, tx_list, node) { + list_del_init(&new->node); + dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys); + } + + return NULL; +} + +static void fsl_dma_device_terminate_all(struct dma_chan *chan) +{ + struct fsl_dma_chan *fsl_chan; + struct fsl_desc_sw *desc, *tmp; + unsigned long flags; + + if (!chan) + return; + + fsl_chan = to_fsl_chan(chan); + + /* Halt the DMA engine */ + dma_halt(fsl_chan); + + spin_lock_irqsave(&fsl_chan->desc_lock, flags); + + /* Remove and free all of the descriptors in the LD queue */ + list_for_each_entry_safe(desc, tmp, &fsl_chan->ld_queue, node) { + list_del(&desc->node); + dma_pool_free(fsl_chan->desc_pool, desc, desc->async_tx.phys); + } + + spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); +} + /** * fsl_dma_update_completed_cookie - Update the completed cookie. * @fsl_chan : Freescale DMA channel @@ -977,12 +1201,15 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev, dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask); dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask); + dma_cap_set(DMA_SLAVE, fdev->common.cap_mask); fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources; fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources; fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt; fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy; fdev->common.device_is_tx_complete = fsl_dma_is_complete; fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending; + fdev->common.device_prep_slave_sg = fsl_dma_prep_slave_sg; + fdev->common.device_terminate_all = fsl_dma_device_terminate_all; fdev->common.dev = &dev->dev; fdev->irq = irq_of_parse_and_map(dev->node, 0); -- cgit v1.2.3-70-g09d2 From a7db50405216610c8a0d62b8b400180b6f366733 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Mon, 22 Jun 2009 08:08:07 -0600 Subject: PCI: remove pcibios_scan_all_fns() This was #define'd as 0 on all platforms, so let's get rid of it. This change makes pci_scan_slot() slightly easier to read. Cc: Yoshinori Sato Cc: Tony Luck Cc: David Howells Cc: "David S. Miller" Cc: Jeff Dike Cc: Ingo Molnar Cc: Ivan Kokshaysky Reviewed-by: Matthew Wilcox Acked-by: Russell King Acked-by: Ralf Baechle Acked-by: Kyle McMartin Acked-by: Benjamin Herrenschmidt Acked-by: Paul Mundt Acked-by: Arnd Bergmann Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- arch/alpha/include/asm/pci.h | 1 - arch/arm/include/asm/pci.h | 2 -- arch/h8300/include/asm/pci.h | 1 - arch/ia64/include/asm/pci.h | 14 ++++++++++++-- arch/mips/include/asm/pci.h | 2 -- arch/mn10300/include/asm/pci.h | 13 ++++++++++++- arch/parisc/include/asm/pci.h | 1 - arch/powerpc/include/asm/pci.h | 1 - arch/sh/include/asm/pci.h | 1 - arch/sparc/include/asm/pci_32.h | 1 - arch/sparc/include/asm/pci_64.h | 1 - arch/um/include/asm/pci.h | 1 - arch/x86/include/asm/pci.h | 1 - drivers/pci/probe.c | 3 +-- include/asm-generic/pci.h | 13 ++++++++++++- 15 files changed, 37 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h index d22ace99d13..dd8dcabf160 100644 --- a/arch/alpha/include/asm/pci.h +++ b/arch/alpha/include/asm/pci.h @@ -52,7 +52,6 @@ struct pci_controller { bus numbers. */ #define pcibios_assign_all_busses() 1 -#define pcibios_scan_all_fns(a, b) 0 #define PCIBIOS_MIN_IO alpha_mv.min_io_address #define PCIBIOS_MIN_MEM alpha_mv.min_mem_address diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h index 0abf386ba3d..226cddd2fb6 100644 --- a/arch/arm/include/asm/pci.h +++ b/arch/arm/include/asm/pci.h @@ -6,8 +6,6 @@ #include /* for PCIBIOS_MIN_* */ -#define pcibios_scan_all_fns(a, b) 0 - #ifdef CONFIG_PCI_HOST_ITE8152 /* ITE bridge requires setting latency timer to avoid early bus access termination by PIC bus mater devices diff --git a/arch/h8300/include/asm/pci.h b/arch/h8300/include/asm/pci.h index 97389b35aa3..cc9762091c0 100644 --- a/arch/h8300/include/asm/pci.h +++ b/arch/h8300/include/asm/pci.h @@ -8,7 +8,6 @@ */ #define pcibios_assign_all_busses() 0 -#define pcibios_scan_all_fns(a, b) 0 static inline void pcibios_set_master(struct pci_dev *dev) { diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index fcfca56bb85..55281aabe5f 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -17,7 +17,6 @@ * loader. */ #define pcibios_assign_all_busses() 0 -#define pcibios_scan_all_fns(a, b) 0 #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM 0x10000000 @@ -135,7 +134,18 @@ extern void pcibios_resource_to_bus(struct pci_dev *dev, extern void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, struct pci_bus_region *region); -#define pcibios_scan_all_fns(a, b) 0 +static inline struct resource * +pcibios_select_root(struct pci_dev *pdev, struct resource *res) +{ + struct resource *root = NULL; + + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + + return root; +} #define HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index a68d111e55e..5ebf82572ec 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -65,8 +65,6 @@ extern int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin); extern unsigned int pcibios_assign_all_busses(void); -#define pcibios_scan_all_fns(a, b) 0 - extern unsigned long PCIBIOS_MIN_IO; extern unsigned long PCIBIOS_MIN_MEM; diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h index 19aecc90f7a..6095a28561d 100644 --- a/arch/mn10300/include/asm/pci.h +++ b/arch/mn10300/include/asm/pci.h @@ -101,7 +101,18 @@ extern void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, struct pci_bus_region *region); -#define pcibios_scan_all_fns(a, b) 0 +static inline struct resource * +pcibios_select_root(struct pci_dev *pdev, struct resource *res) +{ + struct resource *root = NULL; + + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + + return root; +} static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) { diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h index 7d842d699df..64c7aa590ae 100644 --- a/arch/parisc/include/asm/pci.h +++ b/arch/parisc/include/asm/pci.h @@ -233,7 +233,6 @@ static inline void pcibios_register_hba(struct pci_hba_data *x) * rp7420/8420 boxes and then revisit this issue. */ #define pcibios_assign_all_busses() (1) -#define pcibios_scan_all_fns(a, b) (0) #define PCIBIOS_MIN_IO 0x10 #define PCIBIOS_MIN_MEM 0x1000 /* NBPG - but pci/setup-res.c dies */ diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index d9483c504d2..36057c821ff 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -40,7 +40,6 @@ struct pci_dev; */ #define pcibios_assign_all_busses() \ (ppc_pci_has_flag(PPC_PCI_REASSIGN_ALL_BUS)) -#define pcibios_scan_all_fns(a, b) 0 static inline void pcibios_set_master(struct pci_dev *dev) { diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index d3633f513eb..4163950cd1c 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -10,7 +10,6 @@ or architectures with incomplete PCI setup by the loader */ #define pcibios_assign_all_busses() 1 -#define pcibios_scan_all_fns(a, b) 0 /* * A board can define one or more PCI channels that represent built-in (or diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h index b41c4c19815..810d9248e23 100644 --- a/arch/sparc/include/asm/pci_32.h +++ b/arch/sparc/include/asm/pci_32.h @@ -10,7 +10,6 @@ * or architectures with incomplete PCI setup by the loader. */ #define pcibios_assign_all_busses() 0 -#define pcibios_scan_all_fns(a, b) 0 #define PCIBIOS_MIN_IO 0UL #define PCIBIOS_MIN_MEM 0UL diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 7a1e3566e59..a3297088828 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -10,7 +10,6 @@ * or architectures with incomplete PCI setup by the loader. */ #define pcibios_assign_all_busses() 0 -#define pcibios_scan_all_fns(a, b) 0 #define PCIBIOS_MIN_IO 0UL #define PCIBIOS_MIN_MEM 0UL diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h index 59923199cdc..b44cf59ede1 100644 --- a/arch/um/include/asm/pci.h +++ b/arch/um/include/asm/pci.h @@ -2,6 +2,5 @@ #define __UM_PCI_H #define PCI_DMA_BUS_IS_PHYS (1) -#define pcibios_scan_all_fns(a, b) 0 #endif diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1ff685ca221..f76a162c082 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void); #else #define pcibios_assign_all_busses() 0 #endif -#define pcibios_scan_all_fns(a, b) 0 extern unsigned long pci_mem_start; #define PCIBIOS_MIN_IO 0x1000 diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 40e75f6a505..b9d4e95aafb 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1061,8 +1061,7 @@ int pci_scan_slot(struct pci_bus *bus, int devfn) if (dev && !dev->is_added) /* new device? */ nr++; - if ((dev && dev->multifunction) || - (!dev && pcibios_scan_all_fns(bus, devfn))) { + if (dev && dev->multifunction) { for (fn = 1; fn < 8; fn++) { dev = pci_scan_single_device(bus, devfn + fn); if (dev) { diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h index b4326b5466e..26373cff454 100644 --- a/include/asm-generic/pci.h +++ b/include/asm-generic/pci.h @@ -30,7 +30,18 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, res->end = region->end; } -#define pcibios_scan_all_fns(a, b) 0 +static inline struct resource * +pcibios_select_root(struct pci_dev *pdev, struct resource *res) +{ + struct resource *root = NULL; + + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + + return root; +} #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) -- cgit v1.2.3-70-g09d2 From 5b7c1a2c17e77cd5416755bb9ac63278996f6c51 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Fri, 5 Jun 2009 14:54:30 +0800 Subject: KVM: ppc: e500: Directly pass pvr to guest Signed-off-by: Liu Yu Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/e500.c | 3 --- arch/powerpc/kvm/emulate.c | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fddc3ed715f..d4caa6127f5 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -153,7 +153,6 @@ struct kvm_vcpu_arch { u32 pid; u32 swap_pid; - u32 pvr; u32 ccr0; u32 ccr1; u32 dbcr0; diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 674e796f7aa..64949eef43f 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -60,9 +60,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_e500_tlb_setup(vcpu_e500); - /* Use the same core vertion as host's */ - vcpu->arch.pvr = mfspr(SPRN_PVR); - return 0; } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index a561d6e8da1..f8b8248cb9b 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -187,7 +187,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_SRR1: vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; case SPRN_PVR: - vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; + vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. -- cgit v1.2.3-70-g09d2 From ec04b2604c3707a46db1d26d98f82b11d0844669 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 19 Jun 2009 15:16:23 +0200 Subject: KVM: Prepare memslot data structures for multiple hugepage sizes [avi: fix build on non-x86] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 3 +- arch/powerpc/include/asm/kvm_host.h | 3 +- arch/s390/include/asm/kvm_host.h | 6 +++- arch/x86/include/asm/kvm_host.h | 12 ++++---- arch/x86/kvm/mmu.c | 30 ++++++++++---------- arch/x86/kvm/paging_tmpl.h | 3 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 56 ++++++++++++++++++++++++++----------- 8 files changed, 73 insertions(+), 42 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 9cf1c4b1f92..d9b6325a932 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -235,7 +235,8 @@ struct kvm_vm_data { #define KVM_REQ_PTC_G 32 #define KVM_REQ_RESUME 33 -#define KVM_PAGES_PER_HPAGE 1 +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) 1 struct kvm; struct kvm_vcpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d4caa6127f5..c9c930ed11d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -34,7 +34,8 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 /* We don't currently support large pages. */ -#define KVM_PAGES_PER_HPAGE (1UL << 31) +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) struct kvm; struct kvm_run; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 75535d4d7a0..78e07a622b4 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -40,7 +40,11 @@ struct sca_block { struct sca_entry cpu[64]; } __attribute__((packed)); -#define KVM_PAGES_PER_HPAGE 256 +#define KVM_NR_PAGE_SIZES 2 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define CPUSTAT_HOST 0x80000000 #define CPUSTAT_WAIT 0x10000000 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 19027ab2041..30b625d8e5f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -54,12 +54,12 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -/* shadow tables are PAE even on non-PAE hosts */ -#define KVM_HPAGE_SHIFT 21 -#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) -#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) - -#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) +/* KVM Hugepage definitions for x86 */ +#define KVM_NR_PAGE_SIZES 2 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define DE_VECTOR 0 #define DB_VECTOR 1 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 12974de88aa..b67585c1ef0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -394,9 +394,9 @@ static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); - return &slot->lpage_info[idx].write_count; + idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); + return &slot->lpage_info[0][idx].write_count; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) @@ -485,10 +485,10 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) if (!lpage) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); + idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); - return &slot->lpage_info[idx].rmap_pde; + return &slot->lpage_info[0][idx].rmap_pde; } /* @@ -731,11 +731,11 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + int idx = gfn_offset / + KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); retval |= handler(kvm, &memslot->rmap[gfn_offset]); retval |= handler(kvm, - &memslot->lpage_info[ - gfn_offset / - KVM_PAGES_PER_HPAGE].rmap_pde); + &memslot->lpage_info[0][idx].rmap_pde); } } @@ -1876,8 +1876,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + if (is_largepage_backed(vcpu, gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } @@ -2082,8 +2083,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + if (is_largepage_backed(vcpu, gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; @@ -2485,7 +2487,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); vcpu->arch.update_pte.largepage = 1; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 322e8113aee..53e129cec5f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -401,7 +401,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; - large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); + large_gfn = walker.gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); if (is_largepage_backed(vcpu, large_gfn)) { walker.gfn = large_gfn; largepage = 1; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6988858dc56..06af936a250 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -103,7 +103,7 @@ struct kvm_memory_slot { struct { unsigned long rmap_pde; int write_count; - } *lpage_info; + } *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned long userspace_addr; int user_alloc; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1da8072d61b..8361662e7e0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1001,19 +1001,25 @@ out: static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + int i; + if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) vfree(free->dirty_bitmap); - if (!dont || free->lpage_info != dont->lpage_info) - vfree(free->lpage_info); + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { + vfree(free->lpage_info[i]); + free->lpage_info[i] = NULL; + } + } free->npages = 0; free->dirty_bitmap = NULL; free->rmap = NULL; - free->lpage_info = NULL; } void kvm_free_physmem(struct kvm *kvm) @@ -1087,7 +1093,8 @@ int __kvm_set_memory_region(struct kvm *kvm, int r; gfn_t base_gfn; unsigned long npages, ugfn; - unsigned long largepages, i; + int lpages; + unsigned long i, j; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; @@ -1161,33 +1168,48 @@ int __kvm_set_memory_region(struct kvm *kvm, else new.userspace_addr = 0; } - if (npages && !new.lpage_info) { - largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; - largepages -= base_gfn / KVM_PAGES_PER_HPAGE; + if (!npages) + goto skip_lpage; - new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + int level = i + 2; - if (!new.lpage_info) + /* Avoid unused variable warning if no large pages */ + (void)level; + + if (new.lpage_info[i]) + continue; + + lpages = 1 + (base_gfn + npages - 1) / + KVM_PAGES_PER_HPAGE(level); + lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); + + new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); + + if (!new.lpage_info[i]) goto out_free; - memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); + memset(new.lpage_info[i], 0, + lpages * sizeof(*new.lpage_info[i])); - if (base_gfn % KVM_PAGES_PER_HPAGE) - new.lpage_info[0].write_count = 1; - if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) - new.lpage_info[largepages-1].write_count = 1; + if (base_gfn % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][0].write_count = 1; + if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][lpages - 1].write_count = 1; ugfn = new.userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each * other, or if explicitly asked to, disable large page * support for this slot */ - if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1) || + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || !largepages_enabled) - for (i = 0; i < largepages; ++i) - new.lpage_info[i].write_count = 1; + for (j = 0; j < lpages; ++j) + new.lpage_info[i][j].write_count = 1; } +skip_lpage: + /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; -- cgit v1.2.3-70-g09d2 From 8708d002c416b8bf87351bc626d15d7407896edb Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 10 Sep 2009 12:47:20 +0000 Subject: powerpc/irq: Improve nanodoc The OF helpers look like nanodoc but are missing the header. Fix this and a typo (s/nad/and/) while we are here. Signed-off-by: Wolfram Sang Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/irq.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 0a5137676e1..bbcd1aaf3df 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -302,7 +302,8 @@ extern void irq_free_virt(unsigned int virq, unsigned int count); /* -- OF helpers -- */ -/* irq_create_of_mapping - Map a hardware interrupt into linux virq space +/** + * irq_create_of_mapping - Map a hardware interrupt into linux virq space * @controller: Device node of the interrupt controller * @inspec: Interrupt specifier from the device-tree * @intsize: Size of the interrupt specifier from the device-tree @@ -314,8 +315,8 @@ extern void irq_free_virt(unsigned int virq, unsigned int count); extern unsigned int irq_create_of_mapping(struct device_node *controller, u32 *intspec, unsigned int intsize); - -/* irq_of_parse_and_map - Parse nad Map an interrupt into linux virq space +/** + * irq_of_parse_and_map - Parse and Map an interrupt into linux virq space * @device: Device node of the device whose interrupt is to be mapped * @index: Index of the interrupt to map * -- cgit v1.2.3-70-g09d2 From a6dbf93a2ad853585409e715eb96dca9177e3c39 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 9 Sep 2009 01:26:03 +0000 Subject: powerpc: Fix bug where perf_counters breaks oprofile Currently there is a bug where if you use oprofile on a pSeries machine, then use perf_counters, then use oprofile again, oprofile will not work correctly; it will lose the PMU configuration the next time the hypervisor does a partition context switch, and thereafter won't count anything. Maynard Johnson identified the sequence causing the problem: - oprofile setup calls ppc_enable_pmcs(), which calls pseries_lpar_enable_pmcs, which tells the hypervisor that we want to use the PMU, and sets the "PMU in use" flag in the lppaca. This flag tells the hypervisor whether it needs to save and restore the PMU config. - The perf_counter code sets and clears the "PMU in use" flag directly as it context-switches the PMU between tasks, and leaves it clear when it finishes. - oprofile setup, called for a new oprofile run, calls ppc_enable_pmcs, which does nothing because it has already been called. In particular it doesn't set the "PMU in use" flag. This fixes the problem by arranging for ppc_enable_pmcs to always set the "PMU in use" flag. It makes the perf_counter code call ppc_enable_pmcs also rather than calling the lower-level function directly, and removes the setting of the "PMU in use" flag from pseries_lpar_enable_pmcs, since that is now done in its caller. This also removes the declaration of pasemi_enable_pmcs because it isn't defined anywhere. Reported-by: Maynard Johnson Signed-off-by: Paul Mackerras Cc: --- arch/powerpc/include/asm/pmc.h | 16 ++++++++++++++-- arch/powerpc/kernel/perf_counter.c | 13 +++---------- arch/powerpc/kernel/sysfs.c | 3 +++ arch/powerpc/platforms/pseries/setup.c | 4 ---- 4 files changed, 20 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h index d6a616a1b3e..ccc68b50d05 100644 --- a/arch/powerpc/include/asm/pmc.h +++ b/arch/powerpc/include/asm/pmc.h @@ -27,10 +27,22 @@ extern perf_irq_t perf_irq; int reserve_pmc_hardware(perf_irq_t new_perf_irq); void release_pmc_hardware(void); +void ppc_enable_pmcs(void); #ifdef CONFIG_PPC64 -void power4_enable_pmcs(void); -void pasemi_enable_pmcs(void); +#include + +static inline void ppc_set_pmu_inuse(int inuse) +{ + get_lppaca()->pmcregs_in_use = inuse; +} + +extern void power4_enable_pmcs(void); + +#else /* CONFIG_PPC64 */ + +static inline void ppc_set_pmu_inuse(int inuse) { } + #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 70e1f57f7dd..ccd6b213564 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -62,7 +62,6 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs) { return 0; } -static inline void perf_set_pmu_inuse(int inuse) { } static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } static inline u32 perf_get_misc_flags(struct pt_regs *regs) { @@ -93,11 +92,6 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs) return 0; } -static inline void perf_set_pmu_inuse(int inuse) -{ - get_lppaca()->pmcregs_in_use = inuse; -} - /* * The user wants a data address recorded. * If we're not doing instruction sampling, give them the SDAR @@ -531,8 +525,7 @@ void hw_perf_disable(void) * Check if we ever enabled the PMU on this cpu. */ if (!cpuhw->pmcs_enabled) { - if (ppc_md.enable_pmcs) - ppc_md.enable_pmcs(); + ppc_enable_pmcs(); cpuhw->pmcs_enabled = 1; } @@ -594,7 +587,7 @@ void hw_perf_enable(void) mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); if (cpuhw->n_counters == 0) - perf_set_pmu_inuse(0); + ppc_set_pmu_inuse(0); goto out_enable; } @@ -627,7 +620,7 @@ void hw_perf_enable(void) * bit set and set the hardware counters to their initial values. * Then unfreeze the counters. */ - perf_set_pmu_inuse(1); + ppc_set_pmu_inuse(1); mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index f41aec85aa4..956ab33fd73 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "cacheinfo.h" @@ -123,6 +124,8 @@ static DEFINE_PER_CPU(char, pmcs_enabled); void ppc_enable_pmcs(void) { + ppc_set_pmu_inuse(1); + /* Only need to enable them once */ if (__get_cpu_var(pmcs_enabled)) return; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 8d75ea21296..ca5f2e10972 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -223,10 +223,6 @@ static void pseries_lpar_enable_pmcs(void) set = 1UL << 63; reset = 0; plpar_hcall_norets(H_PERFMON, set, reset); - - /* instruct hypervisor to maintain PMCs */ - if (firmware_has_feature(FW_FEATURE_SPLPAR)) - get_lppaca()->pmcregs_in_use = 1; } static void __init pseries_discover_pic(void) -- cgit v1.2.3-70-g09d2 From d331d8305cba713605854aab63a000fb892353a7 Mon Sep 17 00:00:00 2001 From: Martyn Welch Date: Thu, 13 Aug 2009 09:03:02 +0100 Subject: powerpc/nvram: Enable use Generic NVRAM driver for different size chips Remove the reliance on a staticly defined NVRAM size, allowing platforms to support NVRAMs with sizes differing from the standard. A fall back value is provided for platforms not supporting this extension. Signed-off-by: Martyn Welch Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/nvram.h | 3 +++ arch/powerpc/kernel/setup_32.c | 8 ++++++++ drivers/char/generic_nvram.c | 27 ++++++++++++++++++++------- 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index efde5ac82f7..6c587eddee5 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -107,6 +107,9 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern void nvram_sync(void); +/* Determine NVRAM size */ +extern ssize_t nvram_get_size(void); + /* Normal access to NVRAM */ extern unsigned char nvram_read_byte(int i); extern void nvram_write_byte(unsigned char c, int i); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index e1e3059cf34..53bcf3d792d 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -210,6 +210,14 @@ void nvram_write_byte(unsigned char val, int addr) } EXPORT_SYMBOL(nvram_write_byte); +ssize_t nvram_get_size(void) +{ + if (ppc_md.nvram_size) + return ppc_md.nvram_size(); + return -1; +} +EXPORT_SYMBOL(nvram_get_size); + void nvram_sync(void) { if (ppc_md.nvram_sync) diff --git a/drivers/char/generic_nvram.c b/drivers/char/generic_nvram.c index a00869c650d..ef31738c2cb 100644 --- a/drivers/char/generic_nvram.c +++ b/drivers/char/generic_nvram.c @@ -2,7 +2,7 @@ * Generic /dev/nvram driver for architectures providing some * "generic" hooks, that is : * - * nvram_read_byte, nvram_write_byte, nvram_sync + * nvram_read_byte, nvram_write_byte, nvram_sync, nvram_get_size * * Note that an additional hook is supported for PowerMac only * for getting the nvram "partition" informations @@ -28,6 +28,8 @@ #define NVRAM_SIZE 8192 +static ssize_t nvram_len; + static loff_t nvram_llseek(struct file *file, loff_t offset, int origin) { lock_kernel(); @@ -36,7 +38,7 @@ static loff_t nvram_llseek(struct file *file, loff_t offset, int origin) offset += file->f_pos; break; case 2: - offset += NVRAM_SIZE; + offset += nvram_len; break; } if (offset < 0) { @@ -56,9 +58,9 @@ static ssize_t read_nvram(struct file *file, char __user *buf, if (!access_ok(VERIFY_WRITE, buf, count)) return -EFAULT; - if (*ppos >= NVRAM_SIZE) + if (*ppos >= nvram_len) return 0; - for (i = *ppos; count > 0 && i < NVRAM_SIZE; ++i, ++p, --count) + for (i = *ppos; count > 0 && i < nvram_len; ++i, ++p, --count) if (__put_user(nvram_read_byte(i), p)) return -EFAULT; *ppos = i; @@ -74,9 +76,9 @@ static ssize_t write_nvram(struct file *file, const char __user *buf, if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - if (*ppos >= NVRAM_SIZE) + if (*ppos >= nvram_len) return 0; - for (i = *ppos; count > 0 && i < NVRAM_SIZE; ++i, ++p, --count) { + for (i = *ppos; count > 0 && i < nvram_len; ++i, ++p, --count) { if (__get_user(c, p)) return -EFAULT; nvram_write_byte(c, i); @@ -133,9 +135,20 @@ static struct miscdevice nvram_dev = { int __init nvram_init(void) { + int ret = 0; + printk(KERN_INFO "Generic non-volatile memory driver v%s\n", NVRAM_VERSION); - return misc_register(&nvram_dev); + ret = misc_register(&nvram_dev); + if (ret != 0) + goto out; + + nvram_len = nvram_get_size(); + if (nvram_len < 0) + nvram_len = NVRAM_SIZE; + +out: + return ret; } void __exit nvram_cleanup(void) -- cgit v1.2.3-70-g09d2 From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:50:02 +0200 Subject: sched: Merge select_task_rq_fair() and sched_balance_self() The problem with wake_idle() is that is doesn't respect things like cpu_power, which means it doesn't deal well with SMT nor the recent RT interaction. To cure this, it needs to do what sched_balance_self() does, which leads to the possibility of merging select_task_rq_fair() and sched_balance_self(). Modify sched_balance_self() to: - update_shares() when walking up the domain tree, (it only called it for the top domain, but it should have done this anyway), which allows us to remove this ugly bit from try_to_wake_up(). - do wake_affine() on the smallest domain that contains both this (the waking) and the prev (the wakee) cpu for WAKE invocations. Then use the top-down balance steps it had to replace wake_idle(). This leads to the dissapearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE. SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective. Touch all topology bits to replace the old with new SD flags -- platforms might need re-tuning, enabling SD_BALANCE_WAKE conditionally on a NUMA distance seems like a good additional feature, magny-core and small nehalem systems would want this enabled, systems with slow interconnects would not. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +- arch/mips/include/asm/mach-ip27/topology.h | 2 +- arch/powerpc/include/asm/topology.h | 5 +- arch/sh/include/asm/topology.h | 4 +- arch/sparc/include/asm/topology_64.h | 4 +- arch/x86/include/asm/topology.h | 4 +- include/linux/sched.h | 7 +- include/linux/topology.h | 16 +- kernel/sched.c | 41 +---- kernel/sched_fair.c | 233 ++++++++--------------------- 10 files changed, 84 insertions(+), 237 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 7b4c8c70b2d..cf6053b226c 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -67,6 +67,7 @@ void build_cpu_to_node_map(void); .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ .balance_interval = 1, \ @@ -91,8 +92,8 @@ void build_cpu_to_node_map(void); .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ .nr_balance_failed = 0, \ diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index 07547231e07..d8332398f5b 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -48,7 +48,7 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; .cache_nice_tries = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 054a16d6808..c6343313ff5 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -62,9 +62,8 @@ static inline int pcibus_to_node(struct pci_bus *bus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_NEWIDLE \ - | SD_WAKE_IDLE \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index b69ee850906..dc1531e2f25 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -21,8 +21,8 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index e5ea8d33242..1d091abd2d1 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -57,8 +57,8 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 26d06e052a1..966d58dc627 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -145,14 +145,12 @@ extern unsigned long node_remap_size[]; | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b0ca66bd6c..c30bf3d516d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -803,16 +803,15 @@ enum cpu_idle_type { #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ -#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ +#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ + #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ + #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ -#define SD_BALANCE_WAKE 0x2000 /* Balance on wakeup */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 85e8cf7d393..6a8cd15555b 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -95,14 +95,12 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 1*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ @@ -129,13 +127,11 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ | sd_balance_for_mc_power() \ | sd_power_saving_flags() \ , \ @@ -163,13 +159,11 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 0*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ | sd_balance_for_package_power() \ | sd_power_saving_flags() \ , \ @@ -191,14 +185,12 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 0*SD_BALANCE_EXEC \ | 0*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 0*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ diff --git a/kernel/sched.c b/kernel/sched.c index fc6fda881d2..6c819f338b1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -512,14 +512,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif - this_cpu = get_cpu(); smp_wmb(); @@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds->group_leader); - } - return 1; } @@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd, request = attr->relax_domain_level; if (request < sd->level) { /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2eb5b93471..09d19f77eb3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (rq->rd->online) - * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) - -#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) - -static int wake_idle(int cpu, struct task_struct *p) -{ - struct sched_domain *sd; - int i; - unsigned int chosen_wakeup_cpu; - int this_cpu; - struct rq *task_rq = task_rq(p); - - /* - * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu - * are idle and this is not a kernel thread and this task's affinity - * allows it to be moved to preferred cpu, then just move! - */ - - this_cpu = smp_processor_id(); - chosen_wakeup_cpu = - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; - - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && - idle_cpu(cpu) && idle_cpu(this_cpu) && - p->mm && !(p->flags & PF_KTHREAD) && - cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) - return chosen_wakeup_cpu; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if ((sd->flags & SD_WAKE_IDLE) - || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq->clock, sd))) { - for_each_cpu_and(i, sched_domain_span(sd), - &p->cpus_allowed) { - if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif -static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, - struct task_struct *p, int prev_cpu, int this_cpu, int sync, - int idx, unsigned long load, unsigned long this_load, - unsigned int imbalance) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; - unsigned long tl = this_load; + struct task_struct *curr = current; + unsigned long this_load, load; + int idx, this_cpu, prev_cpu; unsigned long tl_per_task; + unsigned int imbalance; + struct task_group *tg; unsigned long weight; int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) - return 0; + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || p->se.avg_overlap > sysctl_sched_migration_cost)) @@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(current); weight = current->se.load.weight; - tl += effective_load(tg, this_cpu, -weight, -weight); + this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.load.weight; + imbalance = 100 + (sd->imbalance_pct - 100) / 2; + /* * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped tl to 0, we'll always have - * an imbalance, but there's really nothing you can do about that, so - * that's good too. + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !tl || - 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + balanced = !this_load || + 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* @@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= - tl_per_task)) { + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ - schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.nr_wakeups_affine); return 1; @@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, return 0; } -static int sched_balance_self(int cpu, int flag); - -static int select_task_rq_fair(struct task_struct *p, int flag, int sync) -{ - struct sched_domain *sd, *this_sd = NULL; - int prev_cpu, this_cpu, new_cpu; - unsigned long load, this_load; - struct rq *this_rq; - unsigned int imbalance; - int idx; - - prev_cpu = task_cpu(p); - this_cpu = smp_processor_id(); - this_rq = cpu_rq(this_cpu); - new_cpu = prev_cpu; - - if (flag != SD_BALANCE_WAKE) - return sched_balance_self(this_cpu, flag); - - /* - * 'this_sd' is the first domain that both - * this_cpu and prev_cpu are present in: - */ - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { - this_sd = sd; - break; - } - } - - if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) - goto out; - - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (!this_sd) - goto out; - - idx = this_sd->wake_idx; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); - - if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - return this_cpu; - - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - return this_cpu; - } - } - -out: - return wake_idle(new_cpu, p); -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. */ -static int sched_balance_self(int cpu, int flag) +static int select_task_rq_fair(struct task_struct *p, int flag, int sync) { struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + + if (flag & SD_BALANCE_WAKE) { + if (sched_feat(AFFINE_WAKEUPS)) + want_affine = 1; + new_cpu = prev_cpu; + } for_each_domain(cpu, tmp) { /* @@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag) */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; - if (tmp->flags & flag) - sd = tmp; - } - if (sd) - update_shares(sd); + switch (flag) { + case SD_BALANCE_WAKE: + if (!sched_feat(LB_WAKEUP_UPDATE)) + break; + case SD_BALANCE_FORK: + case SD_BALANCE_EXEC: + if (root_task_group_empty()) + break; + update_shares(tmp); + default: + break; + } + + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + + if (wake_affine(tmp, p, sync)) + return cpu; + + want_affine = 0; + } + + if (!(tmp->flags & flag)) + continue; + + sd = tmp; + } while (sd) { struct sched_group *group; - int new_cpu, weight; + int weight; if (!(sd->flags & flag)) { sd = sd->child; @@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag) /* while loop will break here if sd == NULL */ } - return cpu; + return new_cpu; } #endif /* CONFIG_SMP */ -- cgit v1.2.3-70-g09d2 From 78e7ed53c9f42f04f9401ada6f7047db60781676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2009 13:16:51 +0200 Subject: sched: Tweak wake_idx When merging select_task_rq_fair() and sched_balance_self() we lost the use of wake_idx, restore that and set them to 0 to make wake balancing more aggressive. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +++-- arch/powerpc/include/asm/topology.h | 3 ++- arch/sh/include/asm/topology.h | 2 +- arch/sparc/include/asm/topology_64.h | 2 +- arch/x86/include/asm/topology.h | 2 +- include/linux/topology.h | 4 ++-- kernel/sched_fair.c | 21 ++++++++++++++++++--- 7 files changed, 28 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index cf6053b226c..47f3c51d5e2 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -62,11 +62,12 @@ void build_cpu_to_node_map(void); .busy_idx = 2, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_FORK \ | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ @@ -87,7 +88,7 @@ void build_cpu_to_node_map(void); .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c6343313ff5..a6b220ab56d 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -58,9 +58,10 @@ static inline int pcibus_to_node(struct pci_bus *bus) .busy_idx = 3, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_FORK \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index dc1531e2f25..9054e5c0ad5 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -16,7 +16,7 @@ .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 1d091abd2d1..bc3a0930ed6 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -52,7 +52,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 0, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 966d58dc627..4b1b335097b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -138,7 +138,7 @@ extern unsigned long node_remap_size[]; .busy_idx = 3, \ .idle_idx = SD_IDLE_IDX, \ .newidle_idx = SD_NEWIDLE_IDX, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = SD_FORKEXEC_IDX, \ \ .flags = 1*SD_LOAD_BALANCE \ diff --git a/include/linux/topology.h b/include/linux/topology.h index 6a8cd15555b..fef57040a4e 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -120,7 +120,7 @@ int arch_update_cpu_topology(void); .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ @@ -152,7 +152,7 @@ int arch_update_cpu_topology(void); .busy_idx = 2, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8b3eddbcf9a..19593568031 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1232,12 +1232,27 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * domain. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int flag) { struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + int load_idx = 0; + + switch (flag) { + case SD_BALANCE_FORK: + case SD_BALANCE_EXEC: + load_idx = sd->forkexec_idx; + break; + + case SD_BALANCE_WAKE: + load_idx = sd->wake_idx; + break; + + default: + break; + } do { unsigned long load, avg_load; @@ -1392,7 +1407,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) continue; } - group = find_idlest_group(sd, p, cpu); + group = find_idlest_group(sd, p, cpu, flag); if (!group) { sd = sd->child; continue; -- cgit v1.2.3-70-g09d2 From 0ec9fab3d186d9cbb00c0f694d4a260d07c198d9 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 15 Sep 2009 15:07:03 +0200 Subject: sched: Improve latencies and throughput Make the idle balancer more agressive, to improve a x264 encoding workload provided by Jason Garrett-Glaser: NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 252.82 fps, 22096.60 kb/s encoded 600 frames, 250.69 fps, 22096.60 kb/s encoded 600 frames, 245.76 fps, 22096.60 kb/s NO_NEXT_BUDDY LB_BIAS encoded 600 frames, 344.44 fps, 22096.60 kb/s encoded 600 frames, 346.66 fps, 22096.60 kb/s encoded 600 frames, 352.59 fps, 22096.60 kb/s NO_NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 425.75 fps, 22096.60 kb/s encoded 600 frames, 425.45 fps, 22096.60 kb/s encoded 600 frames, 422.49 fps, 22096.60 kb/s Peter pointed out that this is better done via newidle_idx, not via LB_BIAS, newidle balancing should look for where there is load _now_, not where there was load 2 ticks ago. Worst-case latencies are improved as well as no buddies means less vruntime spread. (as per prior lkml discussions) This change improves kbuild-peak parallelism as well. Reported-by: Jason Garrett-Glaser Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1253011667.9128.16.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +++-- arch/powerpc/include/asm/topology.h | 2 +- arch/sh/include/asm/topology.h | 3 ++- arch/x86/include/asm/topology.h | 4 +--- include/linux/topology.h | 2 +- kernel/sched_features.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 47f3c51d5e2..42f1673ec83 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -61,7 +61,7 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 2, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ @@ -87,10 +87,11 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_BALANCE_WAKE \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a6b220ab56d..1a2c9eb42a0 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -57,7 +57,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 9054e5c0ad5..c8436771e31 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -15,13 +15,14 @@ .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ | SD_BALANCE_WAKE \ + | SD_BALANCE_NEWIDLE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 4b1b335097b..7fafd1bc414 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -116,14 +116,12 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 -# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 0 #else # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 1 #endif @@ -137,7 +135,7 @@ extern unsigned long node_remap_size[]; .cache_nice_tries = SD_CACHE_NICE_TRIES, \ .busy_idx = 3, \ .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = SD_NEWIDLE_IDX, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = SD_FORKEXEC_IDX, \ \ diff --git a/include/linux/topology.h b/include/linux/topology.h index c87edcd8796..4298745615a 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -151,7 +151,7 @@ int arch_update_cpu_topology(void); .cache_nice_tries = 1, \ .busy_idx = 2, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ \ diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 891ea0f72b4..e98c2e8de1d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -67,7 +67,7 @@ SCHED_FEAT(AFFINE_WAKEUPS, 1) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, 1) +SCHED_FEAT(NEXT_BUDDY, 0) /* * Prefer to schedule the task that ran last (when we did -- cgit v1.2.3-70-g09d2 From 182a85f8a119c789610a9d464f4129ded9f3c107 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 13:24:49 +0200 Subject: sched: Disable wakeup balancing Sysbench thinks SD_BALANCE_WAKE is too agressive and kbuild doesn't really mind too much, SD_BALANCE_NEWIDLE picks up most of the slack. On a dual socket, quad core, dual thread nehalem system: sysbench (--num_threads=16): SD_BALANCE_WAKE-: 13982 tx/s SD_BALANCE_WAKE+: 15688 tx/s kbuild (-j16): SD_BALANCE_WAKE-: 47.648295846 seconds time elapsed ( +- 0.312% ) SD_BALANCE_WAKE+: 47.608607360 seconds time elapsed ( +- 0.026% ) (same within noise) Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 2 -- arch/mips/include/asm/mach-ip27/topology.h | 1 - arch/powerpc/include/asm/topology.h | 1 - arch/sh/include/asm/topology.h | 1 - arch/sparc/include/asm/topology_64.h | 1 - arch/x86/include/asm/topology.h | 2 +- include/linux/topology.h | 6 +++--- 7 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 569b9dafc78..d0141fbf51d 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -68,7 +68,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ .balance_interval = 1, \ @@ -94,7 +93,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index d8332398f5b..23059170700 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -48,7 +48,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; .cache_nice_tries = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 1a2c9eb42a0..394edcbcce7 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -63,7 +63,6 @@ static inline int pcibus_to_node(struct pci_bus *bus) | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index a8cc564b703..f8c40cc6505 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -21,7 +21,6 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE \ | SD_BALANCE_NEWIDLE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 10b979d1de2..26cd25c0839 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 589f12383d7..6f0695d744b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -141,7 +141,7 @@ extern unsigned long node_remap_size[]; | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ diff --git a/include/linux/topology.h b/include/linux/topology.h index a6614b0242a..809b26c0709 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -95,7 +95,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 1*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ @@ -127,7 +127,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 1*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ @@ -160,7 +160,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 1*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ -- cgit v1.2.3-70-g09d2 From cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 12:02:48 +0200 Subject: perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian Acked-by: Peter Zijlstra Acked-by: Paul Mackerras Reviewed-by: Arjan van de Ven Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/arm/include/asm/unistd.h | 2 +- arch/arm/kernel/calls.S | 2 +- arch/blackfin/include/asm/unistd.h | 2 +- arch/blackfin/mach-common/entry.S | 2 +- arch/frv/Kconfig | 2 +- arch/frv/include/asm/perf_counter.h | 17 - arch/frv/include/asm/perf_event.h | 17 + arch/frv/include/asm/unistd.h | 2 +- arch/frv/kernel/entry.S | 2 +- arch/frv/lib/Makefile | 2 +- arch/frv/lib/perf_counter.c | 19 - arch/frv/lib/perf_event.c | 19 + arch/m68k/include/asm/unistd.h | 2 +- arch/m68k/kernel/entry.S | 2 +- arch/m68knommu/kernel/syscalltable.S | 2 +- arch/microblaze/include/asm/unistd.h | 2 +- arch/microblaze/kernel/syscall_table.S | 2 +- arch/mips/include/asm/unistd.h | 6 +- arch/mips/kernel/scall32-o32.S | 2 +- arch/mips/kernel/scall64-64.S | 2 +- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/mn10300/include/asm/unistd.h | 2 +- arch/mn10300/kernel/entry.S | 2 +- arch/parisc/Kconfig | 2 +- arch/parisc/include/asm/perf_counter.h | 7 - arch/parisc/include/asm/perf_event.h | 7 + arch/parisc/include/asm/unistd.h | 4 +- arch/parisc/kernel/syscall_table.S | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hw_irq.h | 22 +- arch/powerpc/include/asm/paca.h | 2 +- arch/powerpc/include/asm/perf_counter.h | 110 - arch/powerpc/include/asm/perf_event.h | 110 + arch/powerpc/include/asm/systbl.h | 2 +- arch/powerpc/include/asm/unistd.h | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_64.S | 8 +- arch/powerpc/kernel/irq.c | 8 +- arch/powerpc/kernel/mpc7450-pmu.c | 2 +- arch/powerpc/kernel/perf_callchain.c | 2 +- arch/powerpc/kernel/perf_counter.c | 1315 -------- arch/powerpc/kernel/perf_event.c | 1315 ++++++++ arch/powerpc/kernel/power4-pmu.c | 2 +- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power5-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- arch/powerpc/kernel/power7-pmu.c | 2 +- arch/powerpc/kernel/ppc970-pmu.c | 2 +- arch/powerpc/kernel/time.c | 30 +- arch/powerpc/mm/fault.c | 8 +- arch/powerpc/platforms/Kconfig.cputype | 4 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/perf_counter.h | 10 - arch/s390/include/asm/perf_event.h | 10 + arch/s390/include/asm/unistd.h | 2 +- arch/s390/kernel/compat_wrapper.S | 8 +- arch/s390/kernel/syscalls.S | 2 +- arch/s390/mm/fault.c | 8 +- arch/sh/Kconfig | 2 +- arch/sh/include/asm/perf_counter.h | 9 - arch/sh/include/asm/perf_event.h | 9 + arch/sh/include/asm/unistd_32.h | 2 +- arch/sh/include/asm/unistd_64.h | 2 +- arch/sh/kernel/syscalls_32.S | 2 +- arch/sh/kernel/syscalls_64.S | 2 +- arch/sh/mm/fault_32.c | 8 +- arch/sh/mm/tlbflush_64.c | 8 +- arch/sparc/Kconfig | 4 +- arch/sparc/include/asm/perf_counter.h | 14 - arch/sparc/include/asm/perf_event.h | 14 + arch/sparc/include/asm/unistd.h | 2 +- arch/sparc/kernel/Makefile | 2 +- arch/sparc/kernel/nmi.c | 4 +- arch/sparc/kernel/pcr.c | 10 +- arch/sparc/kernel/perf_counter.c | 556 ---- arch/sparc/kernel/perf_event.c | 556 ++++ arch/sparc/kernel/systbls_32.S | 2 +- arch/sparc/kernel/systbls_64.S | 4 +- arch/x86/Kconfig | 2 +- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/include/asm/perf_counter.h | 108 - arch/x86/include/asm/perf_event.h | 108 + arch/x86/include/asm/unistd_32.h | 2 +- arch/x86/include/asm/unistd_64.h | 4 +- arch/x86/kernel/apic/apic.c | 6 +- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 4 +- arch/x86/kernel/cpu/perf_counter.c | 2298 -------------- arch/x86/kernel/cpu/perf_event.c | 2298 ++++++++++++++ arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- arch/x86/mm/fault.c | 8 +- arch/x86/oprofile/op_model_ppro.c | 4 +- arch/x86/oprofile/op_x86_model.h | 2 +- drivers/char/sysrq.c | 4 +- fs/exec.c | 6 +- include/asm-generic/unistd.h | 4 +- include/linux/init_task.h | 14 +- include/linux/perf_counter.h | 858 ------ include/linux/perf_event.h | 858 ++++++ include/linux/prctl.h | 4 +- include/linux/sched.h | 12 +- include/linux/syscalls.h | 6 +- include/trace/ftrace.h | 10 +- init/Kconfig | 8 +- kernel/Makefile | 2 +- kernel/exit.c | 8 +- kernel/fork.c | 8 +- kernel/perf_counter.c | 5000 ------------------------------- kernel/perf_event.c | 5000 +++++++++++++++++++++++++++++++ kernel/sched.c | 14 +- kernel/sys.c | 10 +- kernel/sys_ni.c | 2 +- kernel/sysctl.c | 22 +- kernel/timer.c | 4 +- kernel/trace/trace_syscalls.c | 6 +- mm/mmap.c | 6 +- mm/mprotect.c | 4 +- tools/perf/Makefile | 2 +- tools/perf/builtin-annotate.c | 28 +- tools/perf/builtin-record.c | 22 +- tools/perf/builtin-report.c | 48 +- tools/perf/builtin-sched.c | 20 +- tools/perf/builtin-stat.c | 10 +- tools/perf/builtin-timechart.c | 14 +- tools/perf/builtin-top.c | 12 +- tools/perf/builtin-trace.c | 22 +- tools/perf/design.txt | 58 +- tools/perf/perf.h | 12 +- tools/perf/util/event.h | 4 +- tools/perf/util/header.c | 6 +- tools/perf/util/header.h | 8 +- tools/perf/util/parse-events.c | 32 +- tools/perf/util/parse-events.h | 2 +- tools/perf/util/trace-event-info.c | 8 +- tools/perf/util/trace-event.h | 2 +- 141 files changed, 10694 insertions(+), 10694 deletions(-) delete mode 100644 arch/frv/include/asm/perf_counter.h create mode 100644 arch/frv/include/asm/perf_event.h delete mode 100644 arch/frv/lib/perf_counter.c create mode 100644 arch/frv/lib/perf_event.c delete mode 100644 arch/parisc/include/asm/perf_counter.h create mode 100644 arch/parisc/include/asm/perf_event.h delete mode 100644 arch/powerpc/include/asm/perf_counter.h create mode 100644 arch/powerpc/include/asm/perf_event.h delete mode 100644 arch/powerpc/kernel/perf_counter.c create mode 100644 arch/powerpc/kernel/perf_event.c delete mode 100644 arch/s390/include/asm/perf_counter.h create mode 100644 arch/s390/include/asm/perf_event.h delete mode 100644 arch/sh/include/asm/perf_counter.h create mode 100644 arch/sh/include/asm/perf_event.h delete mode 100644 arch/sparc/include/asm/perf_counter.h create mode 100644 arch/sparc/include/asm/perf_event.h delete mode 100644 arch/sparc/kernel/perf_counter.c create mode 100644 arch/sparc/kernel/perf_event.c delete mode 100644 arch/x86/include/asm/perf_counter.h create mode 100644 arch/x86/include/asm/perf_event.h delete mode 100644 arch/x86/kernel/cpu/perf_counter.c create mode 100644 arch/x86/kernel/cpu/perf_event.c delete mode 100644 include/linux/perf_counter.h create mode 100644 include/linux/perf_event.h delete mode 100644 kernel/perf_counter.c create mode 100644 kernel/perf_event.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 9122c9ee18f..89f7eade20a 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -390,7 +390,7 @@ #define __NR_preadv (__NR_SYSCALL_BASE+361) #define __NR_pwritev (__NR_SYSCALL_BASE+362) #define __NR_rt_tgsigqueueinfo (__NR_SYSCALL_BASE+363) -#define __NR_perf_counter_open (__NR_SYSCALL_BASE+364) +#define __NR_perf_event_open (__NR_SYSCALL_BASE+364) /* * The following SWIs are ARM private. diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index ecfa98954d1..fafce1b5c69 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -373,7 +373,7 @@ CALL(sys_preadv) CALL(sys_pwritev) CALL(sys_rt_tgsigqueueinfo) - CALL(sys_perf_counter_open) + CALL(sys_perf_event_open) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index c8e7ee4768c..02b1529dad5 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -381,7 +381,7 @@ #define __NR_preadv 366 #define __NR_pwritev 367 #define __NR_rt_tgsigqueueinfo 368 -#define __NR_perf_counter_open 369 +#define __NR_perf_event_open 369 #define __NR_syscall 370 #define NR_syscalls __NR_syscall diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 01af24cde36..1e7cac23e25 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -1620,7 +1620,7 @@ ENTRY(_sys_call_table) .long _sys_preadv .long _sys_pwritev .long _sys_rt_tgsigqueueinfo - .long _sys_perf_counter_open + .long _sys_perf_event_open .rept NR_syscalls-(.-_sys_call_table)/4 .long _sys_ni_syscall diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index b86e19c9b5b..4b5830bcbe2 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -7,7 +7,7 @@ config FRV default y select HAVE_IDE select HAVE_ARCH_TRACEHOOK - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config ZONE_DMA bool diff --git a/arch/frv/include/asm/perf_counter.h b/arch/frv/include/asm/perf_counter.h deleted file mode 100644 index ccf726e61b2..00000000000 --- a/arch/frv/include/asm/perf_counter.h +++ /dev/null @@ -1,17 +0,0 @@ -/* FRV performance counter support - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _ASM_PERF_COUNTER_H -#define _ASM_PERF_COUNTER_H - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#endif /* _ASM_PERF_COUNTER_H */ diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h new file mode 100644 index 00000000000..a69e0155d14 --- /dev/null +++ b/arch/frv/include/asm/perf_event.h @@ -0,0 +1,17 @@ +/* FRV performance event support + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _ASM_PERF_EVENT_H +#define _ASM_PERF_EVENT_H + +#define PERF_EVENT_INDEX_OFFSET 0 + +#endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 4a8fb427ce0..be6ef0f5cd4 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -342,7 +342,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S index fde1e446b44..189397ec012 100644 --- a/arch/frv/kernel/entry.S +++ b/arch/frv/kernel/entry.S @@ -1525,6 +1525,6 @@ sys_call_table: .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open syscall_table_size = (. - sys_call_table) diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile index 0a377210c89..f4709756d0d 100644 --- a/arch/frv/lib/Makefile +++ b/arch/frv/lib/Makefile @@ -5,4 +5,4 @@ lib-y := \ __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \ checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \ - outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_counter.o + outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o diff --git a/arch/frv/lib/perf_counter.c b/arch/frv/lib/perf_counter.c deleted file mode 100644 index 2000feecd57..00000000000 --- a/arch/frv/lib/perf_counter.c +++ /dev/null @@ -1,19 +0,0 @@ -/* Performance counter handling - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include - -/* - * mark the performance counter as pending - */ -void set_perf_counter_pending(void) -{ -} diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c new file mode 100644 index 00000000000..9ac5acfd2e9 --- /dev/null +++ b/arch/frv/lib/perf_event.c @@ -0,0 +1,19 @@ +/* Performance event handling + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include + +/* + * mark the performance event as pending + */ +void set_perf_event_pending(void) +{ +} diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 946d8691f2b..48b87f5ced5 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -335,7 +335,7 @@ #define __NR_preadv 329 #define __NR_pwritev 330 #define __NR_rt_tgsigqueueinfo 331 -#define __NR_perf_counter_open 332 +#define __NR_perf_event_open 332 #ifdef __KERNEL__ diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 922f52e7ed1..c5b33634c98 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -756,5 +756,5 @@ sys_call_table: .long sys_preadv .long sys_pwritev /* 330 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S index 0ae123e0898..23535cc415a 100644 --- a/arch/m68knommu/kernel/syscalltable.S +++ b/arch/m68knommu/kernel/syscalltable.S @@ -350,7 +350,7 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev /* 330 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open .rept NR_syscalls-(.-sys_call_table)/4 .long sys_ni_syscall diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index 0b852327c0e..cb05a07e55e 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -381,7 +381,7 @@ #define __NR_preadv 363 /* new */ #define __NR_pwritev 364 /* new */ #define __NR_rt_tgsigqueueinfo 365 /* new */ -#define __NR_perf_counter_open 366 /* new */ +#define __NR_perf_event_open 366 /* new */ #define __NR_syscalls 367 diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S index 457216097df..ecec1915513 100644 --- a/arch/microblaze/kernel/syscall_table.S +++ b/arch/microblaze/kernel/syscall_table.S @@ -370,4 +370,4 @@ ENTRY(sys_call_table) .long sys_ni_syscall .long sys_ni_syscall .long sys_rt_tgsigqueueinfo /* 365 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index e753a777949..8c9dfa9e901 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -353,7 +353,7 @@ #define __NR_preadv (__NR_Linux + 330) #define __NR_pwritev (__NR_Linux + 331) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 332) -#define __NR_perf_counter_open (__NR_Linux + 333) +#define __NR_perf_event_open (__NR_Linux + 333) #define __NR_accept4 (__NR_Linux + 334) /* @@ -664,7 +664,7 @@ #define __NR_preadv (__NR_Linux + 289) #define __NR_pwritev (__NR_Linux + 290) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 291) -#define __NR_perf_counter_open (__NR_Linux + 292) +#define __NR_perf_event_open (__NR_Linux + 292) #define __NR_accept4 (__NR_Linux + 293) /* @@ -979,7 +979,7 @@ #define __NR_preadv (__NR_Linux + 293) #define __NR_pwritev (__NR_Linux + 294) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 295) -#define __NR_perf_counter_open (__NR_Linux + 296) +#define __NR_perf_event_open (__NR_Linux + 296) #define __NR_accept4 (__NR_Linux + 297) /* diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 7c2de4f091c..fd2a9bb620d 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -581,7 +581,7 @@ einval: li v0, -ENOSYS sys sys_preadv 6 /* 4330 */ sys sys_pwritev 6 sys sys_rt_tgsigqueueinfo 4 - sys sys_perf_counter_open 5 + sys sys_perf_event_open 5 sys sys_accept4 4 .endm diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index b97b993846d..18bf7f32c5e 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -418,6 +418,6 @@ sys_call_table: PTR sys_preadv PTR sys_pwritev /* 5390 */ PTR sys_rt_tgsigqueueinfo - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 1a6ae124635..6ebc0797669 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -416,6 +416,6 @@ EXPORT(sysn32_call_table) PTR sys_preadv PTR sys_pwritev PTR compat_sys_rt_tgsigqueueinfo /* 5295 */ - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sysn32_call_table,.-sysn32_call_table diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index cd31087a651..9bbf9775e0b 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -536,6 +536,6 @@ sys_call_table: PTR compat_sys_preadv /* 4330 */ PTR compat_sys_pwritev PTR compat_sys_rt_tgsigqueueinfo - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sys_call_table,.-sys_call_table diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index fad68616af3..2a983931c11 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -347,7 +347,7 @@ #define __NR_preadv 334 #define __NR_pwritev 335 #define __NR_rt_tgsigqueueinfo 336 -#define __NR_perf_counter_open 337 +#define __NR_perf_event_open 337 #ifdef __KERNEL__ diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S index e0d2563af4f..a94e7ea3faa 100644 --- a/arch/mn10300/kernel/entry.S +++ b/arch/mn10300/kernel/entry.S @@ -723,7 +723,7 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev /* 335 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open nr_syscalls=(.-sys_call_table)/4 diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 06f8d5b5b0f..f388dc68f60 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -16,7 +16,7 @@ config PARISC select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE select BUG - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/perf_counter.h b/arch/parisc/include/asm/perf_counter.h deleted file mode 100644 index dc9e829f701..00000000000 --- a/arch/parisc/include/asm/perf_counter.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_PARISC_PERF_COUNTER_H -#define __ASM_PARISC_PERF_COUNTER_H - -/* parisc only supports software counters through this interface. */ -static inline void set_perf_counter_pending(void) { } - -#endif /* __ASM_PARISC_PERF_COUNTER_H */ diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h new file mode 100644 index 00000000000..cc146427d8f --- /dev/null +++ b/arch/parisc/include/asm/perf_event.h @@ -0,0 +1,7 @@ +#ifndef __ASM_PARISC_PERF_EVENT_H +#define __ASM_PARISC_PERF_EVENT_H + +/* parisc only supports software events through this interface. */ +static inline void set_perf_event_pending(void) { } + +#endif /* __ASM_PARISC_PERF_EVENT_H */ diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index f3d3b8b012c..cda158318c6 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -810,9 +810,9 @@ #define __NR_preadv (__NR_Linux + 315) #define __NR_pwritev (__NR_Linux + 316) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 317) -#define __NR_perf_counter_open (__NR_Linux + 318) +#define __NR_perf_event_open (__NR_Linux + 318) -#define __NR_Linux_syscalls (__NR_perf_counter_open + 1) +#define __NR_Linux_syscalls (__NR_perf_event_open + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index cf145eb026b..843f423dec6 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -416,7 +416,7 @@ ENTRY_COMP(preadv) /* 315 */ ENTRY_COMP(pwritev) ENTRY_COMP(rt_tgsigqueueinfo) - ENTRY_SAME(perf_counter_open) + ENTRY_SAME(perf_event_open) /* Nothing yet */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8250902265c..4fd479059d6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,7 +129,7 @@ config PPC select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config EARLY_PRINTK bool diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index e73d554538d..abbc2aaaced 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -135,43 +135,43 @@ static inline int irqs_disabled_flags(unsigned long flags) */ struct irq_chip; -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PPC64 -static inline unsigned long test_perf_counter_pending(void) +static inline unsigned long test_perf_event_pending(void) { unsigned long x; asm volatile("lbz %0,%1(13)" : "=r" (x) - : "i" (offsetof(struct paca_struct, perf_counter_pending))); + : "i" (offsetof(struct paca_struct, perf_event_pending))); return x; } -static inline void set_perf_counter_pending(void) +static inline void set_perf_event_pending(void) { asm volatile("stb %0,%1(13)" : : "r" (1), - "i" (offsetof(struct paca_struct, perf_counter_pending))); + "i" (offsetof(struct paca_struct, perf_event_pending))); } -static inline void clear_perf_counter_pending(void) +static inline void clear_perf_event_pending(void) { asm volatile("stb %0,%1(13)" : : "r" (0), - "i" (offsetof(struct paca_struct, perf_counter_pending))); + "i" (offsetof(struct paca_struct, perf_event_pending))); } #endif /* CONFIG_PPC64 */ -#else /* CONFIG_PERF_COUNTERS */ +#else /* CONFIG_PERF_EVENTS */ -static inline unsigned long test_perf_counter_pending(void) +static inline unsigned long test_perf_event_pending(void) { return 0; } -static inline void clear_perf_counter_pending(void) {} -#endif /* CONFIG_PERF_COUNTERS */ +static inline void clear_perf_event_pending(void) {} +#endif /* CONFIG_PERF_EVENTS */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index b634456ea89..154f405b642 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -122,7 +122,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ - u8 perf_counter_pending; /* PM interrupt while soft-disabled */ + u8 perf_event_pending; /* PM interrupt while soft-disabled */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h deleted file mode 100644 index 0ea0639fcf7..00000000000 --- a/arch/powerpc/include/asm/perf_counter.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Performance counter support - PowerPC-specific definitions. - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include - -#include - -#define MAX_HWCOUNTERS 8 -#define MAX_EVENT_ALTERNATIVES 8 -#define MAX_LIMITED_HWCOUNTERS 2 - -/* - * This struct provides the constants and functions needed to - * describe the PMU on a particular POWER-family CPU. - */ -struct power_pmu { - const char *name; - int n_counter; - int max_alternatives; - unsigned long add_fields; - unsigned long test_adder; - int (*compute_mmcr)(u64 events[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]); - int (*get_constraint)(u64 event, unsigned long *mskp, - unsigned long *valp); - int (*get_alternatives)(u64 event, unsigned int flags, - u64 alt[]); - void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); - int (*limited_pmc_event)(u64 event); - u32 flags; - int n_generic; - int *generic_events; - int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -}; - -/* - * Values for power_pmu.flags - */ -#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ -#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ - -/* - * Values for flags to get_alternatives() - */ -#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ -#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ -#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ - -extern int register_power_pmu(struct power_pmu *); - -struct pt_regs; -extern unsigned long perf_misc_flags(struct pt_regs *regs); -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); - -#define PERF_COUNTER_INDEX_OFFSET 1 - -/* - * Only override the default definitions in include/linux/perf_counter.h - * if we have hardware PMU support. - */ -#ifdef CONFIG_PPC_PERF_CTRS -#define perf_misc_flags(regs) perf_misc_flags(regs) -#endif - -/* - * The power_pmu.get_constraint function returns a 32/64-bit value and - * a 32/64-bit mask that express the constraints between this event and - * other events. - * - * The value and mask are divided up into (non-overlapping) bitfields - * of three different types: - * - * Select field: this expresses the constraint that some set of bits - * in MMCR* needs to be set to a specific value for this event. For a - * select field, the mask contains 1s in every bit of the field, and - * the value contains a unique value for each possible setting of the - * MMCR* bits. The constraint checking code will ensure that two events - * that set the same field in their masks have the same value in their - * value dwords. - * - * Add field: this expresses the constraint that there can be at most - * N events in a particular class. A field of k bits can be used for - * N <= 2^(k-1) - 1. The mask has the most significant bit of the field - * set (and the other bits 0), and the value has only the least significant - * bit of the field set. In addition, the 'add_fields' and 'test_adder' - * in the struct power_pmu for this processor come into play. The - * add_fields value contains 1 in the LSB of the field, and the - * test_adder contains 2^(k-1) - 1 - N in the field. - * - * NAND field: this expresses the constraint that you may not have events - * in all of a set of classes. (For example, on PPC970, you can't select - * events from the FPU, ISU and IDU simultaneously, although any two are - * possible.) For N classes, the field is N+1 bits wide, and each class - * is assigned one bit from the least-significant N bits. The mask has - * only the most-significant bit set, and the value has only the bit - * for the event's class set. The test_adder has the least significant - * bit set in the field. - * - * If an event is not subject to the constraint expressed by a particular - * field, then it will have 0 in both the mask and value for that field. - */ diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h new file mode 100644 index 00000000000..2499aaadaeb --- /dev/null +++ b/arch/powerpc/include/asm/perf_event.h @@ -0,0 +1,110 @@ +/* + * Performance event support - PowerPC-specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include + +#include + +#define MAX_HWEVENTS 8 +#define MAX_EVENT_ALTERNATIVES 8 +#define MAX_LIMITED_HWEVENTS 2 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU. + */ +struct power_pmu { + const char *name; + int n_event; + int max_alternatives; + unsigned long add_fields; + unsigned long test_adder; + int (*compute_mmcr)(u64 events[], int n_ev, + unsigned int hwc[], unsigned long mmcr[]); + int (*get_constraint)(u64 event_id, unsigned long *mskp, + unsigned long *valp); + int (*get_alternatives)(u64 event_id, unsigned int flags, + u64 alt[]); + void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); + int (*limited_pmc_event)(u64 event_id); + u32 flags; + int n_generic; + int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ + +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ + +extern int register_power_pmu(struct power_pmu *); + +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + +#define PERF_EVENT_INDEX_OFFSET 1 + +/* + * Only override the default definitions in include/linux/perf_event.h + * if we have hardware PMU support. + */ +#ifdef CONFIG_PPC_PERF_CTRS +#define perf_misc_flags(regs) perf_misc_flags(regs) +#endif + +/* + * The power_pmu.get_constraint function returns a 32/64-bit value and + * a 32/64-bit mask that express the constraints between this event_id and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event_id. For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits. The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class. A field of k bits can be used for + * N <= 2^(k-1) - 1. The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set. In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play. The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes. (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.) For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits. The mask has + * only the most-significant bit set, and the value has only the bit + * for the event_id's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event_id is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */ diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index ed24bd92fe4..c7d671a7d9a 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,7 +322,7 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL_SPU(perf_counter_open) +SYSCALL_SPU(perf_event_open) COMPAT_SYS_SPU(preadv) COMPAT_SYS_SPU(pwritev) COMPAT_SYS(rt_tgsigqueueinfo) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index cef080bfc60..f6ca7617676 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -341,7 +341,7 @@ #define __NR_dup3 316 #define __NR_pipe2 317 #define __NR_inotify_init1 318 -#define __NR_perf_counter_open 319 +#define __NR_perf_event_open 319 #define __NR_preadv 320 #define __NR_pwritev 321 #define __NR_rt_tgsigqueueinfo 322 diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 569f79ccd31..b23664a0b86 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o -obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o +obj-$(CONFIG_PPC_PERF_CTRS) += perf_event.o perf_callchain.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f0df285f0f8..0812b0f414b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -133,7 +133,7 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); - DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending)); + DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_PPC_MM_SLICES DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 66bcda34a6b..900e0eea009 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -556,14 +556,14 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) 2: TRACE_AND_RESTORE_IRQ(r5); -#ifdef CONFIG_PERF_COUNTERS - /* check paca->perf_counter_pending if we're enabling ints */ +#ifdef CONFIG_PERF_EVENTS + /* check paca->perf_event_pending if we're enabling ints */ lbz r3,PACAPERFPEND(r13) and. r3,r3,r5 beq 27f - bl .perf_counter_do_pending + bl .perf_event_do_pending 27: -#endif /* CONFIG_PERF_COUNTERS */ +#endif /* CONFIG_PERF_EVENTS */ /* extract EE bit and use it to restore paca->hard_enabled */ ld r3,_MSR(r1) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index f7f376ea7b1..e5d12117798 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -138,9 +138,9 @@ notrace void raw_local_irq_restore(unsigned long en) } #endif /* CONFIG_PPC_STD_MMU_64 */ - if (test_perf_counter_pending()) { - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (test_perf_event_pending()) { + clear_perf_event_pending(); + perf_event_do_pending(); } /* diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c index cc466d039af..09d72028f31 100644 --- a/arch/powerpc/kernel/mpc7450-pmu.c +++ b/arch/powerpc/kernel/mpc7450-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c index f74b62c6751..0a03cf70d24 100644 --- a/arch/powerpc/kernel/perf_callchain.c +++ b/arch/powerpc/kernel/perf_callchain.c @@ -10,7 +10,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c deleted file mode 100644 index 5ccf9bca96c..00000000000 --- a/arch/powerpc/kernel/perf_counter.c +++ /dev/null @@ -1,1315 +0,0 @@ -/* - * Performance counter support - powerpc architecture code - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct cpu_hw_counters { - int n_counters; - int n_percpu; - int disabled; - int n_added; - int n_limited; - u8 pmcs_enabled; - struct perf_counter *counter[MAX_HWCOUNTERS]; - u64 events[MAX_HWCOUNTERS]; - unsigned int flags[MAX_HWCOUNTERS]; - unsigned long mmcr[3]; - struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; - u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; - u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; - unsigned long amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; - unsigned long avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; -}; -DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); - -struct power_pmu *ppmu; - -/* - * Normally, to ignore kernel events we set the FCS (freeze counters - * in supervisor mode) bit in MMCR0, but if the kernel runs with the - * hypervisor bit set in the MSR, or if we are running on a processor - * where the hypervisor bit is forced to 1 (as on Apple G5 processors), - * then we need to use the FCHV bit to ignore kernel events. - */ -static unsigned int freeze_counters_kernel = MMCR0_FCS; - -/* - * 32-bit doesn't have MMCRA but does have an MMCR2, - * and a few other names are different. - */ -#ifdef CONFIG_PPC32 - -#define MMCR0_FCHV 0 -#define MMCR0_PMCjCE MMCR0_PMCnCE - -#define SPRN_MMCRA SPRN_MMCR2 -#define MMCRA_SAMPLE_ENABLE 0 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_read_regs(struct pt_regs *regs) { } -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return 0; -} - -#endif /* CONFIG_PPC32 */ - -/* - * Things that are specific to 64-bit implementations. - */ -#ifdef CONFIG_PPC64 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - - if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { - unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; - if (slot > 1) - return 4 * (slot - 1); - } - return 0; -} - -/* - * The user wants a data address recorded. - * If we're not doing instruction sampling, give them the SDAR - * (sampled data address). If we are doing instruction sampling, then - * only give them the SDAR if it corresponds to the instruction - * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC - * bit in MMCRA. - */ -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) -{ - unsigned long mmcra = regs->dsisr; - unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? - POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; - - if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - *addrp = mfspr(SPRN_SDAR); -} - -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - - if (TRAP(regs) != 0xf00) - return 0; /* not a PMU interrupt */ - - if (ppmu->flags & PPMU_ALT_SIPR) { - if (mmcra & POWER6_MMCRA_SIHV) - return PERF_EVENT_MISC_HYPERVISOR; - return (mmcra & POWER6_MMCRA_SIPR) ? - PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; - } - if (mmcra & MMCRA_SIHV) - return PERF_EVENT_MISC_HYPERVISOR; - return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER : - PERF_EVENT_MISC_KERNEL; -} - -/* - * Overload regs->dsisr to store MMCRA so we only need to read it once - * on each interrupt. - */ -static inline void perf_read_regs(struct pt_regs *regs) -{ - regs->dsisr = mfspr(SPRN_MMCRA); -} - -/* - * If interrupts were soft-disabled when a PMU interrupt occurs, treat - * it as an NMI. - */ -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return !regs->softe; -} - -#endif /* CONFIG_PPC64 */ - -static void perf_counter_interrupt(struct pt_regs *regs); - -void perf_counter_print_debug(void) -{ -} - -/* - * Read one performance monitor counter (PMC). - */ -static unsigned long read_pmc(int idx) -{ - unsigned long val; - - switch (idx) { - case 1: - val = mfspr(SPRN_PMC1); - break; - case 2: - val = mfspr(SPRN_PMC2); - break; - case 3: - val = mfspr(SPRN_PMC3); - break; - case 4: - val = mfspr(SPRN_PMC4); - break; - case 5: - val = mfspr(SPRN_PMC5); - break; - case 6: - val = mfspr(SPRN_PMC6); - break; -#ifdef CONFIG_PPC64 - case 7: - val = mfspr(SPRN_PMC7); - break; - case 8: - val = mfspr(SPRN_PMC8); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to read PMC%d\n", idx); - val = 0; - } - return val; -} - -/* - * Write one PMC. - */ -static void write_pmc(int idx, unsigned long val) -{ - switch (idx) { - case 1: - mtspr(SPRN_PMC1, val); - break; - case 2: - mtspr(SPRN_PMC2, val); - break; - case 3: - mtspr(SPRN_PMC3, val); - break; - case 4: - mtspr(SPRN_PMC4, val); - break; - case 5: - mtspr(SPRN_PMC5, val); - break; - case 6: - mtspr(SPRN_PMC6, val); - break; -#ifdef CONFIG_PPC64 - case 7: - mtspr(SPRN_PMC7, val); - break; - case 8: - mtspr(SPRN_PMC8, val); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to write PMC%d\n", idx); - } -} - -/* - * Check if a set of events can all go on the PMU at once. - * If they can't, this will look at alternative codes for the events - * and see if any combination of alternative codes is feasible. - * The feasible set is returned in event[]. - */ -static int power_check_constraints(struct cpu_hw_counters *cpuhw, - u64 event[], unsigned int cflags[], - int n_ev) -{ - unsigned long mask, value, nv; - unsigned long smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; - int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; - int i, j; - unsigned long addf = ppmu->add_fields; - unsigned long tadd = ppmu->test_adder; - - if (n_ev > ppmu->n_counter) - return -1; - - /* First see if the events will go on as-is */ - for (i = 0; i < n_ev; ++i) { - if ((cflags[i] & PPMU_LIMITED_PMC_REQD) - && !ppmu->limited_pmc_event(event[i])) { - ppmu->get_alternatives(event[i], cflags[i], - cpuhw->alternatives[i]); - event[i] = cpuhw->alternatives[i][0]; - } - if (ppmu->get_constraint(event[i], &cpuhw->amasks[i][0], - &cpuhw->avalues[i][0])) - return -1; - } - value = mask = 0; - for (i = 0; i < n_ev; ++i) { - nv = (value | cpuhw->avalues[i][0]) + - (value & cpuhw->avalues[i][0] & addf); - if ((((nv + tadd) ^ value) & mask) != 0 || - (((nv + tadd) ^ cpuhw->avalues[i][0]) & - cpuhw->amasks[i][0]) != 0) - break; - value = nv; - mask |= cpuhw->amasks[i][0]; - } - if (i == n_ev) - return 0; /* all OK */ - - /* doesn't work, gather alternatives... */ - if (!ppmu->get_alternatives) - return -1; - for (i = 0; i < n_ev; ++i) { - choice[i] = 0; - n_alt[i] = ppmu->get_alternatives(event[i], cflags[i], - cpuhw->alternatives[i]); - for (j = 1; j < n_alt[i]; ++j) - ppmu->get_constraint(cpuhw->alternatives[i][j], - &cpuhw->amasks[i][j], - &cpuhw->avalues[i][j]); - } - - /* enumerate all possibilities and see if any will work */ - i = 0; - j = -1; - value = mask = nv = 0; - while (i < n_ev) { - if (j >= 0) { - /* we're backtracking, restore context */ - value = svalues[i]; - mask = smasks[i]; - j = choice[i]; - } - /* - * See if any alternative k for event i, - * where k > j, will satisfy the constraints. - */ - while (++j < n_alt[i]) { - nv = (value | cpuhw->avalues[i][j]) + - (value & cpuhw->avalues[i][j] & addf); - if ((((nv + tadd) ^ value) & mask) == 0 && - (((nv + tadd) ^ cpuhw->avalues[i][j]) - & cpuhw->amasks[i][j]) == 0) - break; - } - if (j >= n_alt[i]) { - /* - * No feasible alternative, backtrack - * to event i-1 and continue enumerating its - * alternatives from where we got up to. - */ - if (--i < 0) - return -1; - } else { - /* - * Found a feasible alternative for event i, - * remember where we got up to with this event, - * go on to the next event, and start with - * the first alternative for it. - */ - choice[i] = j; - svalues[i] = value; - smasks[i] = mask; - value = nv; - mask |= cpuhw->amasks[i][j]; - ++i; - j = -1; - } - } - - /* OK, we have a feasible combination, tell the caller the solution */ - for (i = 0; i < n_ev; ++i) - event[i] = cpuhw->alternatives[i][choice[i]]; - return 0; -} - -/* - * Check if newly-added counters have consistent settings for - * exclude_{user,kernel,hv} with each other and any previously - * added counters. - */ -static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], - int n_prev, int n_new) -{ - int eu = 0, ek = 0, eh = 0; - int i, n, first; - struct perf_counter *counter; - - n = n_prev + n_new; - if (n <= 1) - return 0; - - first = 1; - for (i = 0; i < n; ++i) { - if (cflags[i] & PPMU_LIMITED_PMC_OK) { - cflags[i] &= ~PPMU_LIMITED_PMC_REQD; - continue; - } - counter = ctrs[i]; - if (first) { - eu = counter->attr.exclude_user; - ek = counter->attr.exclude_kernel; - eh = counter->attr.exclude_hv; - first = 0; - } else if (counter->attr.exclude_user != eu || - counter->attr.exclude_kernel != ek || - counter->attr.exclude_hv != eh) { - return -EAGAIN; - } - } - - if (eu || ek || eh) - for (i = 0; i < n; ++i) - if (cflags[i] & PPMU_LIMITED_PMC_OK) - cflags[i] |= PPMU_LIMITED_PMC_REQD; - - return 0; -} - -static void power_pmu_read(struct perf_counter *counter) -{ - s64 val, delta, prev; - - if (!counter->hw.idx) - return; - /* - * Performance monitor interrupts come even when interrupts - * are soft-disabled, as long as interrupts are hard-enabled. - * Therefore we treat them like NMIs. - */ - do { - prev = atomic64_read(&counter->hw.prev_count); - barrier(); - val = read_pmc(counter->hw.idx); - } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); - - /* The counters are only 32 bits wide */ - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &counter->hw.period_left); -} - -/* - * On some machines, PMC5 and PMC6 can't be written, don't respect - * the freeze conditions, and don't generate interrupts. This tells - * us if `counter' is using such a PMC. - */ -static int is_limited_pmc(int pmcnum) -{ - return (ppmu->flags & PPMU_LIMITED_PMC5_6) - && (pmcnum == 5 || pmcnum == 6); -} - -static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_counter *counter; - u64 val, prev, delta; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - counter = cpuhw->limited_counter[i]; - if (!counter->hw.idx) - continue; - val = (counter->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&counter->hw.prev_count); - counter->hw.idx = 0; - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - } -} - -static void thaw_limited_counters(struct cpu_hw_counters *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_counter *counter; - u64 val; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - counter = cpuhw->limited_counter[i]; - counter->hw.idx = cpuhw->limited_hwidx[i]; - val = (counter->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&counter->hw.prev_count, val); - perf_counter_update_userpage(counter); - } -} - -/* - * Since limited counters don't respect the freeze conditions, we - * have to read them immediately after freezing or unfreezing the - * other counters. We try to keep the values from the limited - * counters as consistent as possible by keeping the delay (in - * cycles and instructions) between freezing/unfreezing and reading - * the limited counters as small and consistent as possible. - * Therefore, if any limited counters are in use, we read them - * both, and always in the same order, to minimize variability, - * and do it inside the same asm that writes MMCR0. - */ -static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) -{ - unsigned long pmc5, pmc6; - - if (!cpuhw->n_limited) { - mtspr(SPRN_MMCR0, mmcr0); - return; - } - - /* - * Write MMCR0, then read PMC5 and PMC6 immediately. - * To ensure we don't get a performance monitor interrupt - * between writing MMCR0 and freezing/thawing the limited - * counters, we first write MMCR0 with the counter overflow - * interrupt enable bits turned off. - */ - asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" - : "=&r" (pmc5), "=&r" (pmc6) - : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), - "i" (SPRN_MMCR0), - "i" (SPRN_PMC5), "i" (SPRN_PMC6)); - - if (mmcr0 & MMCR0_FC) - freeze_limited_counters(cpuhw, pmc5, pmc6); - else - thaw_limited_counters(cpuhw, pmc5, pmc6); - - /* - * Write the full MMCR0 including the counter overflow interrupt - * enable bits, if necessary. - */ - if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) - mtspr(SPRN_MMCR0, mmcr0); -} - -/* - * Disable all counters to prevent PMU interrupts and to allow - * counters to be added or removed. - */ -void hw_perf_disable(void) -{ - struct cpu_hw_counters *cpuhw; - unsigned long flags; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_counters); - - if (!cpuhw->disabled) { - cpuhw->disabled = 1; - cpuhw->n_added = 0; - - /* - * Check if we ever enabled the PMU on this cpu. - */ - if (!cpuhw->pmcs_enabled) { - ppc_enable_pmcs(); - cpuhw->pmcs_enabled = 1; - } - - /* - * Disable instruction sampling if it was enabled - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mtspr(SPRN_MMCRA, - cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mb(); - } - - /* - * Set the 'freeze counters' bit. - * The barrier is to make sure the mtspr has been - * executed and the PMU has frozen the counters - * before we return. - */ - write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); - mb(); - } - local_irq_restore(flags); -} - -/* - * Re-enable all counters if disable == 0. - * If we were previously disabled and counters were added, then - * put the new config on the PMU. - */ -void hw_perf_enable(void) -{ - struct perf_counter *counter; - struct cpu_hw_counters *cpuhw; - unsigned long flags; - long i; - unsigned long val; - s64 left; - unsigned int hwc_index[MAX_HWCOUNTERS]; - int n_lim; - int idx; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_counters); - if (!cpuhw->disabled) { - local_irq_restore(flags); - return; - } - cpuhw->disabled = 0; - - /* - * If we didn't change anything, or only removed counters, - * no need to recalculate MMCR* settings and reset the PMCs. - * Just reenable the PMU with the current MMCR* settings - * (possibly updated for removal of counters). - */ - if (!cpuhw->n_added) { - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - if (cpuhw->n_counters == 0) - ppc_set_pmu_inuse(0); - goto out_enable; - } - - /* - * Compute MMCR* values for the new set of counters - */ - if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, - cpuhw->mmcr)) { - /* shouldn't ever get here */ - printk(KERN_ERR "oops compute_mmcr failed\n"); - goto out; - } - - /* - * Add in MMCR0 freeze bits corresponding to the - * attr.exclude_* bits for the first counter. - * We have already checked that all counters have the - * same values for these bits as the first counter. - */ - counter = cpuhw->counter[0]; - if (counter->attr.exclude_user) - cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->attr.exclude_kernel) - cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->attr.exclude_hv) - cpuhw->mmcr[0] |= MMCR0_FCHV; - - /* - * Write the new configuration to MMCR* with the freeze - * bit set and set the hardware counters to their initial values. - * Then unfreeze the counters. - */ - ppc_set_pmu_inuse(1); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) - | MMCR0_FC); - - /* - * Read off any pre-existing counters that need to move - * to another PMC. - */ - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_pmu_read(counter); - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; - } - } - - /* - * Initialize the PMCs for all the new and moved counters. - */ - cpuhw->n_limited = n_lim = 0; - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter->hw.idx) - continue; - idx = hwc_index[i] + 1; - if (is_limited_pmc(idx)) { - cpuhw->limited_counter[n_lim] = counter; - cpuhw->limited_hwidx[n_lim] = idx; - ++n_lim; - continue; - } - val = 0; - if (counter->hw.sample_period) { - left = atomic64_read(&counter->hw.period_left); - if (left < 0x80000000L) - val = 0x80000000L - left; - } - atomic64_set(&counter->hw.prev_count, val); - counter->hw.idx = idx; - write_pmc(idx, val); - perf_counter_update_userpage(counter); - } - cpuhw->n_limited = n_lim; - cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; - - out_enable: - mb(); - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - /* - * Enable instruction sampling if necessary - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mb(); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); - } - - out: - local_irq_restore(flags); -} - -static int collect_events(struct perf_counter *group, int max_count, - struct perf_counter *ctrs[], u64 *events, - unsigned int *flags) -{ - int n = 0; - struct perf_counter *counter; - - if (!is_software_counter(group)) { - if (n >= max_count) - return -1; - ctrs[n] = group; - flags[n] = group->hw.counter_base; - events[n++] = group->hw.config; - } - list_for_each_entry(counter, &group->sibling_list, list_entry) { - if (!is_software_counter(counter) && - counter->state != PERF_COUNTER_STATE_OFF) { - if (n >= max_count) - return -1; - ctrs[n] = counter; - flags[n] = counter->hw.counter_base; - events[n++] = counter->hw.config; - } - } - return n; -} - -static void counter_sched_in(struct perf_counter *counter, int cpu) -{ - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; - counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; - if (is_software_counter(counter)) - counter->pmu->enable(counter); -} - -/* - * Called to enable a whole group of counters. - * Returns 1 if the group was enabled, or -EAGAIN if it could not be. - * Assumes the caller has disabled interrupts and has - * frozen the PMU with hw_perf_save_disable. - */ -int hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - struct cpu_hw_counters *cpuhw; - long i, n, n0; - struct perf_counter *sub; - - if (!ppmu) - return 0; - cpuhw = &__get_cpu_var(cpu_hw_counters); - n0 = cpuhw->n_counters; - n = collect_events(group_leader, ppmu->n_counter - n0, - &cpuhw->counter[n0], &cpuhw->events[n0], - &cpuhw->flags[n0]); - if (n < 0) - return -EAGAIN; - if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n)) - return -EAGAIN; - i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); - if (i < 0) - return -EAGAIN; - cpuhw->n_counters = n0 + n; - cpuhw->n_added += n; - - /* - * OK, this group can go on; update counter states etc., - * and enable any software counters - */ - for (i = n0; i < n0 + n; ++i) - cpuhw->counter[i]->hw.config = cpuhw->events[i]; - cpuctx->active_oncpu += n; - n = 1; - counter_sched_in(group_leader, cpu); - list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { - if (sub->state != PERF_COUNTER_STATE_OFF) { - counter_sched_in(sub, cpu); - ++n; - } - } - ctx->nr_active += n; - - return 1; -} - -/* - * Add a counter to the PMU. - * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_enable to do the - * actual work of reconfiguring the PMU. - */ -static int power_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuhw; - unsigned long flags; - int n0; - int ret = -EAGAIN; - - local_irq_save(flags); - perf_disable(); - - /* - * Add the counter to the list (if there is room) - * and check whether the total set is still feasible. - */ - cpuhw = &__get_cpu_var(cpu_hw_counters); - n0 = cpuhw->n_counters; - if (n0 >= ppmu->n_counter) - goto out; - cpuhw->counter[n0] = counter; - cpuhw->events[n0] = counter->hw.config; - cpuhw->flags[n0] = counter->hw.counter_base; - if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1)) - goto out; - if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) - goto out; - - counter->hw.config = cpuhw->events[n0]; - ++cpuhw->n_counters; - ++cpuhw->n_added; - - ret = 0; - out: - perf_enable(); - local_irq_restore(flags); - return ret; -} - -/* - * Remove a counter from the PMU. - */ -static void power_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuhw; - long i; - unsigned long flags; - - local_irq_save(flags); - perf_disable(); - - power_pmu_read(counter); - - cpuhw = &__get_cpu_var(cpu_hw_counters); - for (i = 0; i < cpuhw->n_counters; ++i) { - if (counter == cpuhw->counter[i]) { - while (++i < cpuhw->n_counters) - cpuhw->counter[i-1] = cpuhw->counter[i]; - --cpuhw->n_counters; - ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); - if (counter->hw.idx) { - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; - } - perf_counter_update_userpage(counter); - break; - } - } - for (i = 0; i < cpuhw->n_limited; ++i) - if (counter == cpuhw->limited_counter[i]) - break; - if (i < cpuhw->n_limited) { - while (++i < cpuhw->n_limited) { - cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; - cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; - } - --cpuhw->n_limited; - } - if (cpuhw->n_counters == 0) { - /* disable exceptions if no counters are running */ - cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); - } - - perf_enable(); - local_irq_restore(flags); -} - -/* - * Re-enable interrupts on a counter after they were throttled - * because they were coming too fast. - */ -static void power_pmu_unthrottle(struct perf_counter *counter) -{ - s64 val, left; - unsigned long flags; - - if (!counter->hw.idx || !counter->hw.sample_period) - return; - local_irq_save(flags); - perf_disable(); - power_pmu_read(counter); - left = counter->hw.sample_period; - counter->hw.last_period = left; - val = 0; - if (left < 0x80000000L) - val = 0x80000000L - left; - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); - perf_enable(); - local_irq_restore(flags); -} - -struct pmu power_pmu = { - .enable = power_pmu_enable, - .disable = power_pmu_disable, - .read = power_pmu_read, - .unthrottle = power_pmu_unthrottle, -}; - -/* - * Return 1 if we might be able to put counter on a limited PMC, - * or 0 if not. - * A counter can only go on a limited PMC if it counts something - * that a limited PMC can count, doesn't require interrupts, and - * doesn't exclude any processor mode. - */ -static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, - unsigned int flags) -{ - int n; - u64 alt[MAX_EVENT_ALTERNATIVES]; - - if (counter->attr.exclude_user - || counter->attr.exclude_kernel - || counter->attr.exclude_hv - || counter->attr.sample_period) - return 0; - - if (ppmu->limited_pmc_event(ev)) - return 1; - - /* - * The requested event isn't on a limited PMC already; - * see if any alternative code goes on a limited PMC. - */ - if (!ppmu->get_alternatives) - return 0; - - flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; - n = ppmu->get_alternatives(ev, flags, alt); - - return n > 0; -} - -/* - * Find an alternative event that goes on a normal PMC, if possible, - * and return the event code, or 0 if there is no such alternative. - * (Note: event code 0 is "don't count" on all machines.) - */ -static u64 normal_pmc_alternative(u64 ev, unsigned long flags) -{ - u64 alt[MAX_EVENT_ALTERNATIVES]; - int n; - - flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); - n = ppmu->get_alternatives(ev, flags, alt); - if (!n) - return 0; - return alt[0]; -} - -/* Number of perf_counters counting hardware events */ -static atomic_t num_counters; -/* Used to avoid races in calling reserve/release_pmc_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - -/* - * Release the PMU if this is the last perf_counter. - */ -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - if (!atomic_add_unless(&num_counters, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_counters) == 0) - release_pmc_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Translate a generic cache event config to a raw event code. - */ -static int hw_perf_cache_event(u64 config, u64 *eventp) -{ - unsigned long type, op, result; - int ev; - - if (!ppmu->cache_events) - return -EINVAL; - - /* unpack config */ - type = config & 0xff; - op = (config >> 8) & 0xff; - result = (config >> 16) & 0xff; - - if (type >= PERF_COUNT_HW_CACHE_MAX || - op >= PERF_COUNT_HW_CACHE_OP_MAX || - result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - ev = (*ppmu->cache_events)[type][op][result]; - if (ev == 0) - return -EOPNOTSUPP; - if (ev == -1) - return -EINVAL; - *eventp = ev; - return 0; -} - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - u64 ev; - unsigned long flags; - struct perf_counter *ctrs[MAX_HWCOUNTERS]; - u64 events[MAX_HWCOUNTERS]; - unsigned int cflags[MAX_HWCOUNTERS]; - int n; - int err; - struct cpu_hw_counters *cpuhw; - - if (!ppmu) - return ERR_PTR(-ENXIO); - switch (counter->attr.type) { - case PERF_TYPE_HARDWARE: - ev = counter->attr.config; - if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return ERR_PTR(-EOPNOTSUPP); - ev = ppmu->generic_events[ev]; - break; - case PERF_TYPE_HW_CACHE: - err = hw_perf_cache_event(counter->attr.config, &ev); - if (err) - return ERR_PTR(err); - break; - case PERF_TYPE_RAW: - ev = counter->attr.config; - break; - default: - return ERR_PTR(-EINVAL); - } - counter->hw.config_base = ev; - counter->hw.idx = 0; - - /* - * If we are not running on a hypervisor, force the - * exclude_hv bit to 0 so that we don't care what - * the user set it to. - */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->attr.exclude_hv = 0; - - /* - * If this is a per-task counter, then we can use - * PM_RUN_* events interchangeably with their non RUN_* - * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. - * XXX we should check if the task is an idle task. - */ - flags = 0; - if (counter->ctx->task) - flags |= PPMU_ONLY_COUNT_RUN; - - /* - * If this machine has limited counters, check whether this - * event could go on a limited counter. - */ - if (ppmu->flags & PPMU_LIMITED_PMC5_6) { - if (can_go_on_limited_pmc(counter, ev, flags)) { - flags |= PPMU_LIMITED_PMC_OK; - } else if (ppmu->limited_pmc_event(ev)) { - /* - * The requested event is on a limited PMC, - * but we can't use a limited PMC; see if any - * alternative goes on a normal PMC. - */ - ev = normal_pmc_alternative(ev, flags); - if (!ev) - return ERR_PTR(-EINVAL); - } - } - - /* - * If this is in a group, check if it can go on with all the - * other hardware counters in the group. We assume the counter - * hasn't been linked into its leader's sibling list at this point. - */ - n = 0; - if (counter->group_leader != counter) { - n = collect_events(counter->group_leader, ppmu->n_counter - 1, - ctrs, events, cflags); - if (n < 0) - return ERR_PTR(-EINVAL); - } - events[n] = ev; - ctrs[n] = counter; - cflags[n] = flags; - if (check_excludes(ctrs, cflags, n, 1)) - return ERR_PTR(-EINVAL); - - cpuhw = &get_cpu_var(cpu_hw_counters); - err = power_check_constraints(cpuhw, events, cflags, n + 1); - put_cpu_var(cpu_hw_counters); - if (err) - return ERR_PTR(-EINVAL); - - counter->hw.config = events[n]; - counter->hw.counter_base = cflags[n]; - counter->hw.last_period = counter->hw.sample_period; - atomic64_set(&counter->hw.period_left, counter->hw.last_period); - - /* - * See if we need to reserve the PMU. - * If no counters are currently in use, then we have to take a - * mutex to ensure that we don't race with another task doing - * reserve_pmc_hardware or release_pmc_hardware. - */ - err = 0; - if (!atomic_inc_not_zero(&num_counters)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && - reserve_pmc_hardware(perf_counter_interrupt)) - err = -EBUSY; - else - atomic_inc(&num_counters); - mutex_unlock(&pmc_reserve_mutex); - } - counter->destroy = hw_perf_counter_destroy; - - if (err) - return ERR_PTR(err); - return &power_pmu; -} - -/* - * A counter has overflowed; update its count and record - * things if requested. Note that interrupts are hard-disabled - * here so there is no possibility of being interrupted. - */ -static void record_and_restart(struct perf_counter *counter, unsigned long val, - struct pt_regs *regs, int nmi) -{ - u64 period = counter->hw.sample_period; - s64 prev, delta, left; - int record = 0; - - /* we don't have to worry about interrupts here */ - prev = atomic64_read(&counter->hw.prev_count); - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - - /* - * See if the total period for this counter has expired, - * and update for the next period. - */ - val = 0; - left = atomic64_read(&counter->hw.period_left) - delta; - if (period) { - if (left <= 0) { - left += period; - if (left <= 0) - left = period; - record = 1; - } - if (left < 0x80000000LL) - val = 0x80000000LL - left; - } - - /* - * Finally record data if requested. - */ - if (record) { - struct perf_sample_data data = { - .addr = 0, - .period = counter->hw.last_period, - }; - - if (counter->attr.sample_type & PERF_SAMPLE_ADDR) - perf_get_data_addr(regs, &data.addr); - - if (perf_counter_overflow(counter, nmi, &data, regs)) { - /* - * Interrupts are coming too fast - throttle them - * by setting the counter to 0, so it will be - * at least 2^30 cycles until the next interrupt - * (assuming each counter counts at most 2 counts - * per cycle). - */ - val = 0; - left = ~0ULL >> 1; - } - } - - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); -} - -/* - * Called from generic code to get the misc flags (i.e. processor mode) - * for an event. - */ -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - u32 flags = perf_get_misc_flags(regs); - - if (flags) - return flags; - return user_mode(regs) ? PERF_EVENT_MISC_USER : - PERF_EVENT_MISC_KERNEL; -} - -/* - * Called from generic code to get the instruction pointer - * for an event. - */ -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - unsigned long ip; - - if (TRAP(regs) != 0xf00) - return regs->nip; /* not a PMU interrupt */ - - ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - return ip; -} - -/* - * Performance monitor interrupt stuff - */ -static void perf_counter_interrupt(struct pt_regs *regs) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - unsigned long val; - int found = 0; - int nmi; - - if (cpuhw->n_limited) - freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), - mfspr(SPRN_PMC6)); - - perf_read_regs(regs); - - nmi = perf_intr_is_nmi(regs); - if (nmi) - nmi_enter(); - else - irq_enter(); - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (!counter->hw.idx || is_limited_pmc(counter->hw.idx)) - continue; - val = read_pmc(counter->hw.idx); - if ((int)val < 0) { - /* counter has overflowed */ - found = 1; - record_and_restart(counter, val, regs, nmi); - } - } - - /* - * In case we didn't find and reset the counter that caused - * the interrupt, scan all counters and reset any that are - * negative, to avoid getting continual interrupts. - * Any that we processed in the previous loop will not be negative. - */ - if (!found) { - for (i = 0; i < ppmu->n_counter; ++i) { - if (is_limited_pmc(i + 1)) - continue; - val = read_pmc(i + 1); - if ((int)val < 0) - write_pmc(i + 1, 0); - } - } - - /* - * Reset MMCR0 to its normal value. This will set PMXE and - * clear FC (freeze counters) and PMAO (perf mon alert occurred) - * and thus allow interrupts to occur again. - * XXX might want to use MSR.PM to keep the counters frozen until - * we get back out of this interrupt. - */ - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - if (nmi) - nmi_exit(); - else - irq_exit(); -} - -void hw_perf_counter_setup(int cpu) -{ - struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); - - if (!ppmu) - return; - memset(cpuhw, 0, sizeof(*cpuhw)); - cpuhw->mmcr[0] = MMCR0_FC; -} - -int register_power_pmu(struct power_pmu *pmu) -{ - if (ppmu) - return -EBUSY; /* something's already registered */ - - ppmu = pmu; - pr_info("%s performance monitor hardware support registered\n", - pmu->name); - -#ifdef MSR_HV - /* - * Use FCHV to ignore kernel events if MSR.HV is set. - */ - if (mfmsr() & MSR_HV) - freeze_counters_kernel = MMCR0_FCHV; -#endif /* CONFIG_PPC64 */ - - return 0; -} diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c new file mode 100644 index 00000000000..c98321fcb45 --- /dev/null +++ b/arch/powerpc/kernel/perf_event.c @@ -0,0 +1,1315 @@ +/* + * Performance event support - powerpc architecture code + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cpu_hw_events { + int n_events; + int n_percpu; + int disabled; + int n_added; + int n_limited; + u8 pmcs_enabled; + struct perf_event *event[MAX_HWEVENTS]; + u64 events[MAX_HWEVENTS]; + unsigned int flags[MAX_HWEVENTS]; + unsigned long mmcr[3]; + struct perf_event *limited_event[MAX_LIMITED_HWEVENTS]; + u8 limited_hwidx[MAX_LIMITED_HWEVENTS]; + u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; +}; +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +struct power_pmu *ppmu; + +/* + * Normally, to ignore kernel events we set the FCS (freeze events + * in supervisor mode) bit in MMCR0, but if the kernel runs with the + * hypervisor bit set in the MSR, or if we are running on a processor + * where the hypervisor bit is forced to 1 (as on Apple G5 processors), + * then we need to use the FCHV bit to ignore kernel events. + */ +static unsigned int freeze_events_kernel = MMCR0_FCS; + +/* + * 32-bit doesn't have MMCRA but does have an MMCR2, + * and a few other names are different. + */ +#ifdef CONFIG_PPC32 + +#define MMCR0_FCHV 0 +#define MMCR0_PMCjCE MMCR0_PMCnCE + +#define SPRN_MMCRA SPRN_MMCR2 +#define MMCRA_SAMPLE_ENABLE 0 + +static inline unsigned long perf_ip_adjust(struct pt_regs *regs) +{ + return 0; +} +static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + return 0; +} +static inline void perf_read_regs(struct pt_regs *regs) { } +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ + return 0; +} + +#endif /* CONFIG_PPC32 */ + +/* + * Things that are specific to 64-bit implementations. + */ +#ifdef CONFIG_PPC64 + +static inline unsigned long perf_ip_adjust(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + + if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { + unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; + if (slot > 1) + return 4 * (slot - 1); + } + return 0; +} + +/* + * The user wants a data address recorded. + * If we're not doing instruction sampling, give them the SDAR + * (sampled data address). If we are doing instruction sampling, then + * only give them the SDAR if it corresponds to the instruction + * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC + * bit in MMCRA. + */ +static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) +{ + unsigned long mmcra = regs->dsisr; + unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? + POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; + + if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) + *addrp = mfspr(SPRN_SDAR); +} + +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + + if (TRAP(regs) != 0xf00) + return 0; /* not a PMU interrupt */ + + if (ppmu->flags & PPMU_ALT_SIPR) { + if (mmcra & POWER6_MMCRA_SIHV) + return PERF_RECORD_MISC_HYPERVISOR; + return (mmcra & POWER6_MMCRA_SIPR) ? + PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL; + } + if (mmcra & MMCRA_SIHV) + return PERF_RECORD_MISC_HYPERVISOR; + return (mmcra & MMCRA_SIPR) ? PERF_RECORD_MISC_USER : + PERF_RECORD_MISC_KERNEL; +} + +/* + * Overload regs->dsisr to store MMCRA so we only need to read it once + * on each interrupt. + */ +static inline void perf_read_regs(struct pt_regs *regs) +{ + regs->dsisr = mfspr(SPRN_MMCRA); +} + +/* + * If interrupts were soft-disabled when a PMU interrupt occurs, treat + * it as an NMI. + */ +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ + return !regs->softe; +} + +#endif /* CONFIG_PPC64 */ + +static void perf_event_interrupt(struct pt_regs *regs); + +void perf_event_print_debug(void) +{ +} + +/* + * Read one performance monitor event (PMC). + */ +static unsigned long read_pmc(int idx) +{ + unsigned long val; + + switch (idx) { + case 1: + val = mfspr(SPRN_PMC1); + break; + case 2: + val = mfspr(SPRN_PMC2); + break; + case 3: + val = mfspr(SPRN_PMC3); + break; + case 4: + val = mfspr(SPRN_PMC4); + break; + case 5: + val = mfspr(SPRN_PMC5); + break; + case 6: + val = mfspr(SPRN_PMC6); + break; +#ifdef CONFIG_PPC64 + case 7: + val = mfspr(SPRN_PMC7); + break; + case 8: + val = mfspr(SPRN_PMC8); + break; +#endif /* CONFIG_PPC64 */ + default: + printk(KERN_ERR "oops trying to read PMC%d\n", idx); + val = 0; + } + return val; +} + +/* + * Write one PMC. + */ +static void write_pmc(int idx, unsigned long val) +{ + switch (idx) { + case 1: + mtspr(SPRN_PMC1, val); + break; + case 2: + mtspr(SPRN_PMC2, val); + break; + case 3: + mtspr(SPRN_PMC3, val); + break; + case 4: + mtspr(SPRN_PMC4, val); + break; + case 5: + mtspr(SPRN_PMC5, val); + break; + case 6: + mtspr(SPRN_PMC6, val); + break; +#ifdef CONFIG_PPC64 + case 7: + mtspr(SPRN_PMC7, val); + break; + case 8: + mtspr(SPRN_PMC8, val); + break; +#endif /* CONFIG_PPC64 */ + default: + printk(KERN_ERR "oops trying to write PMC%d\n", idx); + } +} + +/* + * Check if a set of events can all go on the PMU at once. + * If they can't, this will look at alternative codes for the events + * and see if any combination of alternative codes is feasible. + * The feasible set is returned in event_id[]. + */ +static int power_check_constraints(struct cpu_hw_events *cpuhw, + u64 event_id[], unsigned int cflags[], + int n_ev) +{ + unsigned long mask, value, nv; + unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS]; + int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS]; + int i, j; + unsigned long addf = ppmu->add_fields; + unsigned long tadd = ppmu->test_adder; + + if (n_ev > ppmu->n_event) + return -1; + + /* First see if the events will go on as-is */ + for (i = 0; i < n_ev; ++i) { + if ((cflags[i] & PPMU_LIMITED_PMC_REQD) + && !ppmu->limited_pmc_event(event_id[i])) { + ppmu->get_alternatives(event_id[i], cflags[i], + cpuhw->alternatives[i]); + event_id[i] = cpuhw->alternatives[i][0]; + } + if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0], + &cpuhw->avalues[i][0])) + return -1; + } + value = mask = 0; + for (i = 0; i < n_ev; ++i) { + nv = (value | cpuhw->avalues[i][0]) + + (value & cpuhw->avalues[i][0] & addf); + if ((((nv + tadd) ^ value) & mask) != 0 || + (((nv + tadd) ^ cpuhw->avalues[i][0]) & + cpuhw->amasks[i][0]) != 0) + break; + value = nv; + mask |= cpuhw->amasks[i][0]; + } + if (i == n_ev) + return 0; /* all OK */ + + /* doesn't work, gather alternatives... */ + if (!ppmu->get_alternatives) + return -1; + for (i = 0; i < n_ev; ++i) { + choice[i] = 0; + n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i], + cpuhw->alternatives[i]); + for (j = 1; j < n_alt[i]; ++j) + ppmu->get_constraint(cpuhw->alternatives[i][j], + &cpuhw->amasks[i][j], + &cpuhw->avalues[i][j]); + } + + /* enumerate all possibilities and see if any will work */ + i = 0; + j = -1; + value = mask = nv = 0; + while (i < n_ev) { + if (j >= 0) { + /* we're backtracking, restore context */ + value = svalues[i]; + mask = smasks[i]; + j = choice[i]; + } + /* + * See if any alternative k for event_id i, + * where k > j, will satisfy the constraints. + */ + while (++j < n_alt[i]) { + nv = (value | cpuhw->avalues[i][j]) + + (value & cpuhw->avalues[i][j] & addf); + if ((((nv + tadd) ^ value) & mask) == 0 && + (((nv + tadd) ^ cpuhw->avalues[i][j]) + & cpuhw->amasks[i][j]) == 0) + break; + } + if (j >= n_alt[i]) { + /* + * No feasible alternative, backtrack + * to event_id i-1 and continue enumerating its + * alternatives from where we got up to. + */ + if (--i < 0) + return -1; + } else { + /* + * Found a feasible alternative for event_id i, + * remember where we got up to with this event_id, + * go on to the next event_id, and start with + * the first alternative for it. + */ + choice[i] = j; + svalues[i] = value; + smasks[i] = mask; + value = nv; + mask |= cpuhw->amasks[i][j]; + ++i; + j = -1; + } + } + + /* OK, we have a feasible combination, tell the caller the solution */ + for (i = 0; i < n_ev; ++i) + event_id[i] = cpuhw->alternatives[i][choice[i]]; + return 0; +} + +/* + * Check if newly-added events have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added events. + */ +static int check_excludes(struct perf_event **ctrs, unsigned int cflags[], + int n_prev, int n_new) +{ + int eu = 0, ek = 0, eh = 0; + int i, n, first; + struct perf_event *event; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + first = 1; + for (i = 0; i < n; ++i) { + if (cflags[i] & PPMU_LIMITED_PMC_OK) { + cflags[i] &= ~PPMU_LIMITED_PMC_REQD; + continue; + } + event = ctrs[i]; + if (first) { + eu = event->attr.exclude_user; + ek = event->attr.exclude_kernel; + eh = event->attr.exclude_hv; + first = 0; + } else if (event->attr.exclude_user != eu || + event->attr.exclude_kernel != ek || + event->attr.exclude_hv != eh) { + return -EAGAIN; + } + } + + if (eu || ek || eh) + for (i = 0; i < n; ++i) + if (cflags[i] & PPMU_LIMITED_PMC_OK) + cflags[i] |= PPMU_LIMITED_PMC_REQD; + + return 0; +} + +static void power_pmu_read(struct perf_event *event) +{ + s64 val, delta, prev; + + if (!event->hw.idx) + return; + /* + * Performance monitor interrupts come even when interrupts + * are soft-disabled, as long as interrupts are hard-enabled. + * Therefore we treat them like NMIs. + */ + do { + prev = atomic64_read(&event->hw.prev_count); + barrier(); + val = read_pmc(event->hw.idx); + } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + + /* The events are only 32 bits wide */ + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + atomic64_sub(delta, &event->hw.period_left); +} + +/* + * On some machines, PMC5 and PMC6 can't be written, don't respect + * the freeze conditions, and don't generate interrupts. This tells + * us if `event' is using such a PMC. + */ +static int is_limited_pmc(int pmcnum) +{ + return (ppmu->flags & PPMU_LIMITED_PMC5_6) + && (pmcnum == 5 || pmcnum == 6); +} + +static void freeze_limited_events(struct cpu_hw_events *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_event *event; + u64 val, prev, delta; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + event = cpuhw->limited_event[i]; + if (!event->hw.idx) + continue; + val = (event->hw.idx == 5) ? pmc5 : pmc6; + prev = atomic64_read(&event->hw.prev_count); + event->hw.idx = 0; + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + } +} + +static void thaw_limited_events(struct cpu_hw_events *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_event *event; + u64 val; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + event = cpuhw->limited_event[i]; + event->hw.idx = cpuhw->limited_hwidx[i]; + val = (event->hw.idx == 5) ? pmc5 : pmc6; + atomic64_set(&event->hw.prev_count, val); + perf_event_update_userpage(event); + } +} + +/* + * Since limited events don't respect the freeze conditions, we + * have to read them immediately after freezing or unfreezing the + * other events. We try to keep the values from the limited + * events as consistent as possible by keeping the delay (in + * cycles and instructions) between freezing/unfreezing and reading + * the limited events as small and consistent as possible. + * Therefore, if any limited events are in use, we read them + * both, and always in the same order, to minimize variability, + * and do it inside the same asm that writes MMCR0. + */ +static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) +{ + unsigned long pmc5, pmc6; + + if (!cpuhw->n_limited) { + mtspr(SPRN_MMCR0, mmcr0); + return; + } + + /* + * Write MMCR0, then read PMC5 and PMC6 immediately. + * To ensure we don't get a performance monitor interrupt + * between writing MMCR0 and freezing/thawing the limited + * events, we first write MMCR0 with the event overflow + * interrupt enable bits turned off. + */ + asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" + : "=&r" (pmc5), "=&r" (pmc6) + : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), + "i" (SPRN_MMCR0), + "i" (SPRN_PMC5), "i" (SPRN_PMC6)); + + if (mmcr0 & MMCR0_FC) + freeze_limited_events(cpuhw, pmc5, pmc6); + else + thaw_limited_events(cpuhw, pmc5, pmc6); + + /* + * Write the full MMCR0 including the event overflow interrupt + * enable bits, if necessary. + */ + if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) + mtspr(SPRN_MMCR0, mmcr0); +} + +/* + * Disable all events to prevent PMU interrupts and to allow + * events to be added or removed. + */ +void hw_perf_disable(void) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + + if (!ppmu) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + + if (!cpuhw->disabled) { + cpuhw->disabled = 1; + cpuhw->n_added = 0; + + /* + * Check if we ever enabled the PMU on this cpu. + */ + if (!cpuhw->pmcs_enabled) { + ppc_enable_pmcs(); + cpuhw->pmcs_enabled = 1; + } + + /* + * Disable instruction sampling if it was enabled + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mtspr(SPRN_MMCRA, + cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mb(); + } + + /* + * Set the 'freeze events' bit. + * The barrier is to make sure the mtspr has been + * executed and the PMU has frozen the events + * before we return. + */ + write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); + mb(); + } + local_irq_restore(flags); +} + +/* + * Re-enable all events if disable == 0. + * If we were previously disabled and events were added, then + * put the new config on the PMU. + */ +void hw_perf_enable(void) +{ + struct perf_event *event; + struct cpu_hw_events *cpuhw; + unsigned long flags; + long i; + unsigned long val; + s64 left; + unsigned int hwc_index[MAX_HWEVENTS]; + int n_lim; + int idx; + + if (!ppmu) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw->disabled = 0; + + /* + * If we didn't change anything, or only removed events, + * no need to recalculate MMCR* settings and reset the PMCs. + * Just reenable the PMU with the current MMCR* settings + * (possibly updated for removal of events). + */ + if (!cpuhw->n_added) { + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + if (cpuhw->n_events == 0) + ppc_set_pmu_inuse(0); + goto out_enable; + } + + /* + * Compute MMCR* values for the new set of events + */ + if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index, + cpuhw->mmcr)) { + /* shouldn't ever get here */ + printk(KERN_ERR "oops compute_mmcr failed\n"); + goto out; + } + + /* + * Add in MMCR0 freeze bits corresponding to the + * attr.exclude_* bits for the first event. + * We have already checked that all events have the + * same values for these bits as the first event. + */ + event = cpuhw->event[0]; + if (event->attr.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (event->attr.exclude_kernel) + cpuhw->mmcr[0] |= freeze_events_kernel; + if (event->attr.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + + /* + * Write the new configuration to MMCR* with the freeze + * bit set and set the hardware events to their initial values. + * Then unfreeze the events. + */ + ppc_set_pmu_inuse(1); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) + | MMCR0_FC); + + /* + * Read off any pre-existing events that need to move + * to another PMC. + */ + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) { + power_pmu_read(event); + write_pmc(event->hw.idx, 0); + event->hw.idx = 0; + } + } + + /* + * Initialize the PMCs for all the new and moved events. + */ + cpuhw->n_limited = n_lim = 0; + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (event->hw.idx) + continue; + idx = hwc_index[i] + 1; + if (is_limited_pmc(idx)) { + cpuhw->limited_event[n_lim] = event; + cpuhw->limited_hwidx[n_lim] = idx; + ++n_lim; + continue; + } + val = 0; + if (event->hw.sample_period) { + left = atomic64_read(&event->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + atomic64_set(&event->hw.prev_count, val); + event->hw.idx = idx; + write_pmc(idx, val); + perf_event_update_userpage(event); + } + cpuhw->n_limited = n_lim; + cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + + out_enable: + mb(); + write_mmcr0(cpuhw, cpuhw->mmcr[0]); + + /* + * Enable instruction sampling if necessary + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mb(); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + } + + out: + local_irq_restore(flags); +} + +static int collect_events(struct perf_event *group, int max_count, + struct perf_event *ctrs[], u64 *events, + unsigned int *flags) +{ + int n = 0; + struct perf_event *event; + + if (!is_software_event(group)) { + if (n >= max_count) + return -1; + ctrs[n] = group; + flags[n] = group->hw.event_base; + events[n++] = group->hw.config; + } + list_for_each_entry(event, &group->sibling_list, list_entry) { + if (!is_software_event(event) && + event->state != PERF_EVENT_STATE_OFF) { + if (n >= max_count) + return -1; + ctrs[n] = event; + flags[n] = event->hw.event_base; + events[n++] = event->hw.config; + } + } + return n; +} + +static void event_sched_in(struct perf_event *event, int cpu) +{ + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; + event->tstamp_running += event->ctx->time - event->tstamp_stopped; + if (is_software_event(event)) + event->pmu->enable(event); +} + +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + */ +int hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + struct cpu_hw_events *cpuhw; + long i, n, n0; + struct perf_event *sub; + + if (!ppmu) + return 0; + cpuhw = &__get_cpu_var(cpu_hw_events); + n0 = cpuhw->n_events; + n = collect_events(group_leader, ppmu->n_event - n0, + &cpuhw->event[n0], &cpuhw->events[n0], + &cpuhw->flags[n0]); + if (n < 0) + return -EAGAIN; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, n)) + return -EAGAIN; + i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); + if (i < 0) + return -EAGAIN; + cpuhw->n_events = n0 + n; + cpuhw->n_added += n; + + /* + * OK, this group can go on; update event states etc., + * and enable any software events + */ + for (i = n0; i < n0 + n; ++i) + cpuhw->event[i]->hw.config = cpuhw->events[i]; + cpuctx->active_oncpu += n; + n = 1; + event_sched_in(group_leader, cpu); + list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { + if (sub->state != PERF_EVENT_STATE_OFF) { + event_sched_in(sub, cpu); + ++n; + } + } + ctx->nr_active += n; + + return 1; +} + +/* + * Add a event to the PMU. + * If all events are not already frozen, then we disable and + * re-enable the PMU in order to get hw_perf_enable to do the + * actual work of reconfiguring the PMU. + */ +static int power_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + int n0; + int ret = -EAGAIN; + + local_irq_save(flags); + perf_disable(); + + /* + * Add the event to the list (if there is room) + * and check whether the total set is still feasible. + */ + cpuhw = &__get_cpu_var(cpu_hw_events); + n0 = cpuhw->n_events; + if (n0 >= ppmu->n_event) + goto out; + cpuhw->event[n0] = event; + cpuhw->events[n0] = event->hw.config; + cpuhw->flags[n0] = event->hw.event_base; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) + goto out; + if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) + goto out; + + event->hw.config = cpuhw->events[n0]; + ++cpuhw->n_events; + ++cpuhw->n_added; + + ret = 0; + out: + perf_enable(); + local_irq_restore(flags); + return ret; +} + +/* + * Remove a event from the PMU. + */ +static void power_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuhw; + long i; + unsigned long flags; + + local_irq_save(flags); + perf_disable(); + + power_pmu_read(event); + + cpuhw = &__get_cpu_var(cpu_hw_events); + for (i = 0; i < cpuhw->n_events; ++i) { + if (event == cpuhw->event[i]) { + while (++i < cpuhw->n_events) + cpuhw->event[i-1] = cpuhw->event[i]; + --cpuhw->n_events; + ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr); + if (event->hw.idx) { + write_pmc(event->hw.idx, 0); + event->hw.idx = 0; + } + perf_event_update_userpage(event); + break; + } + } + for (i = 0; i < cpuhw->n_limited; ++i) + if (event == cpuhw->limited_event[i]) + break; + if (i < cpuhw->n_limited) { + while (++i < cpuhw->n_limited) { + cpuhw->limited_event[i-1] = cpuhw->limited_event[i]; + cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; + } + --cpuhw->n_limited; + } + if (cpuhw->n_events == 0) { + /* disable exceptions if no events are running */ + cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); + } + + perf_enable(); + local_irq_restore(flags); +} + +/* + * Re-enable interrupts on a event after they were throttled + * because they were coming too fast. + */ +static void power_pmu_unthrottle(struct perf_event *event) +{ + s64 val, left; + unsigned long flags; + + if (!event->hw.idx || !event->hw.sample_period) + return; + local_irq_save(flags); + perf_disable(); + power_pmu_read(event); + left = event->hw.sample_period; + event->hw.last_period = left; + val = 0; + if (left < 0x80000000L) + val = 0x80000000L - left; + write_pmc(event->hw.idx, val); + atomic64_set(&event->hw.prev_count, val); + atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + perf_enable(); + local_irq_restore(flags); +} + +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, + .unthrottle = power_pmu_unthrottle, +}; + +/* + * Return 1 if we might be able to put event on a limited PMC, + * or 0 if not. + * A event can only go on a limited PMC if it counts something + * that a limited PMC can count, doesn't require interrupts, and + * doesn't exclude any processor mode. + */ +static int can_go_on_limited_pmc(struct perf_event *event, u64 ev, + unsigned int flags) +{ + int n; + u64 alt[MAX_EVENT_ALTERNATIVES]; + + if (event->attr.exclude_user + || event->attr.exclude_kernel + || event->attr.exclude_hv + || event->attr.sample_period) + return 0; + + if (ppmu->limited_pmc_event(ev)) + return 1; + + /* + * The requested event_id isn't on a limited PMC already; + * see if any alternative code goes on a limited PMC. + */ + if (!ppmu->get_alternatives) + return 0; + + flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; + n = ppmu->get_alternatives(ev, flags, alt); + + return n > 0; +} + +/* + * Find an alternative event_id that goes on a normal PMC, if possible, + * and return the event_id code, or 0 if there is no such alternative. + * (Note: event_id code 0 is "don't count" on all machines.) + */ +static u64 normal_pmc_alternative(u64 ev, unsigned long flags) +{ + u64 alt[MAX_EVENT_ALTERNATIVES]; + int n; + + flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); + n = ppmu->get_alternatives(ev, flags, alt); + if (!n) + return 0; + return alt[0]; +} + +/* Number of perf_events counting hardware events */ +static atomic_t num_events; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Release the PMU if this is the last perf_event. + */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* + * Translate a generic cache event_id config to a raw event_id code. + */ +static int hw_perf_cache_event(u64 config, u64 *eventp) +{ + unsigned long type, op, result; + int ev; + + if (!ppmu->cache_events) + return -EINVAL; + + /* unpack config */ + type = config & 0xff; + op = (config >> 8) & 0xff; + result = (config >> 16) & 0xff; + + if (type >= PERF_COUNT_HW_CACHE_MAX || + op >= PERF_COUNT_HW_CACHE_OP_MAX || + result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ev = (*ppmu->cache_events)[type][op][result]; + if (ev == 0) + return -EOPNOTSUPP; + if (ev == -1) + return -EINVAL; + *eventp = ev; + return 0; +} + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + u64 ev; + unsigned long flags; + struct perf_event *ctrs[MAX_HWEVENTS]; + u64 events[MAX_HWEVENTS]; + unsigned int cflags[MAX_HWEVENTS]; + int n; + int err; + struct cpu_hw_events *cpuhw; + + if (!ppmu) + return ERR_PTR(-ENXIO); + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + ev = event->attr.config; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) + return ERR_PTR(-EOPNOTSUPP); + ev = ppmu->generic_events[ev]; + break; + case PERF_TYPE_HW_CACHE: + err = hw_perf_cache_event(event->attr.config, &ev); + if (err) + return ERR_PTR(err); + break; + case PERF_TYPE_RAW: + ev = event->attr.config; + break; + default: + return ERR_PTR(-EINVAL); + } + event->hw.config_base = ev; + event->hw.idx = 0; + + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + event->attr.exclude_hv = 0; + + /* + * If this is a per-task event, then we can use + * PM_RUN_* events interchangeably with their non RUN_* + * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. + * XXX we should check if the task is an idle task. + */ + flags = 0; + if (event->ctx->task) + flags |= PPMU_ONLY_COUNT_RUN; + + /* + * If this machine has limited events, check whether this + * event_id could go on a limited event. + */ + if (ppmu->flags & PPMU_LIMITED_PMC5_6) { + if (can_go_on_limited_pmc(event, ev, flags)) { + flags |= PPMU_LIMITED_PMC_OK; + } else if (ppmu->limited_pmc_event(ev)) { + /* + * The requested event_id is on a limited PMC, + * but we can't use a limited PMC; see if any + * alternative goes on a normal PMC. + */ + ev = normal_pmc_alternative(ev, flags); + if (!ev) + return ERR_PTR(-EINVAL); + } + } + + /* + * If this is in a group, check if it can go on with all the + * other hardware events in the group. We assume the event + * hasn't been linked into its leader's sibling list at this point. + */ + n = 0; + if (event->group_leader != event) { + n = collect_events(event->group_leader, ppmu->n_event - 1, + ctrs, events, cflags); + if (n < 0) + return ERR_PTR(-EINVAL); + } + events[n] = ev; + ctrs[n] = event; + cflags[n] = flags; + if (check_excludes(ctrs, cflags, n, 1)) + return ERR_PTR(-EINVAL); + + cpuhw = &get_cpu_var(cpu_hw_events); + err = power_check_constraints(cpuhw, events, cflags, n + 1); + put_cpu_var(cpu_hw_events); + if (err) + return ERR_PTR(-EINVAL); + + event->hw.config = events[n]; + event->hw.event_base = cflags[n]; + event->hw.last_period = event->hw.sample_period; + atomic64_set(&event->hw.period_left, event->hw.last_period); + + /* + * See if we need to reserve the PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * reserve_pmc_hardware or release_pmc_hardware. + */ + err = 0; + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && + reserve_pmc_hardware(perf_event_interrupt)) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + } + event->destroy = hw_perf_event_destroy; + + if (err) + return ERR_PTR(err); + return &power_pmu; +} + +/* + * A event has overflowed; update its count and record + * things if requested. Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. + */ +static void record_and_restart(struct perf_event *event, unsigned long val, + struct pt_regs *regs, int nmi) +{ + u64 period = event->hw.sample_period; + s64 prev, delta, left; + int record = 0; + + /* we don't have to worry about interrupts here */ + prev = atomic64_read(&event->hw.prev_count); + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + + /* + * See if the total period for this event has expired, + * and update for the next period. + */ + val = 0; + left = atomic64_read(&event->hw.period_left) - delta; + if (period) { + if (left <= 0) { + left += period; + if (left <= 0) + left = period; + record = 1; + } + if (left < 0x80000000LL) + val = 0x80000000LL - left; + } + + /* + * Finally record data if requested. + */ + if (record) { + struct perf_sample_data data = { + .addr = 0, + .period = event->hw.last_period, + }; + + if (event->attr.sample_type & PERF_SAMPLE_ADDR) + perf_get_data_addr(regs, &data.addr); + + if (perf_event_overflow(event, nmi, &data, regs)) { + /* + * Interrupts are coming too fast - throttle them + * by setting the event to 0, so it will be + * at least 2^30 cycles until the next interrupt + * (assuming each event counts at most 2 counts + * per cycle). + */ + val = 0; + left = ~0ULL >> 1; + } + } + + write_pmc(event->hw.idx, val); + atomic64_set(&event->hw.prev_count, val); + atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); +} + +/* + * Called from generic code to get the misc flags (i.e. processor mode) + * for an event_id. + */ +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + u32 flags = perf_get_misc_flags(regs); + + if (flags) + return flags; + return user_mode(regs) ? PERF_RECORD_MISC_USER : + PERF_RECORD_MISC_KERNEL; +} + +/* + * Called from generic code to get the instruction pointer + * for an event_id. + */ +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + unsigned long ip; + + if (TRAP(regs) != 0xf00) + return regs->nip; /* not a PMU interrupt */ + + ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); + return ip; +} + +/* + * Performance monitor interrupt stuff + */ +static void perf_event_interrupt(struct pt_regs *regs) +{ + int i; + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + unsigned long val; + int found = 0; + int nmi; + + if (cpuhw->n_limited) + freeze_limited_events(cpuhw, mfspr(SPRN_PMC5), + mfspr(SPRN_PMC6)); + + perf_read_regs(regs); + + nmi = perf_intr_is_nmi(regs); + if (nmi) + nmi_enter(); + else + irq_enter(); + + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (!event->hw.idx || is_limited_pmc(event->hw.idx)) + continue; + val = read_pmc(event->hw.idx); + if ((int)val < 0) { + /* event has overflowed */ + found = 1; + record_and_restart(event, val, regs, nmi); + } + } + + /* + * In case we didn't find and reset the event that caused + * the interrupt, scan all events and reset any that are + * negative, to avoid getting continual interrupts. + * Any that we processed in the previous loop will not be negative. + */ + if (!found) { + for (i = 0; i < ppmu->n_event; ++i) { + if (is_limited_pmc(i + 1)) + continue; + val = read_pmc(i + 1); + if ((int)val < 0) + write_pmc(i + 1, 0); + } + } + + /* + * Reset MMCR0 to its normal value. This will set PMXE and + * clear FC (freeze events) and PMAO (perf mon alert occurred) + * and thus allow interrupts to occur again. + * XXX might want to use MSR.PM to keep the events frozen until + * we get back out of this interrupt. + */ + write_mmcr0(cpuhw, cpuhw->mmcr[0]); + + if (nmi) + nmi_exit(); + else + irq_exit(); +} + +void hw_perf_event_setup(int cpu) +{ + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + if (!ppmu) + return; + memset(cpuhw, 0, sizeof(*cpuhw)); + cpuhw->mmcr[0] = MMCR0_FC; +} + +int register_power_pmu(struct power_pmu *pmu) +{ + if (ppmu) + return -EBUSY; /* something's already registered */ + + ppmu = pmu; + pr_info("%s performance monitor hardware support registered\n", + pmu->name); + +#ifdef MSR_HV + /* + * Use FCHV to ignore kernel events if MSR.HV is set. + */ + if (mfmsr() & MSR_HV) + freeze_events_kernel = MMCR0_FCHV; +#endif /* CONFIG_PPC64 */ + + return 0; +} diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 3c90a3d9173..2a361cdda63 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 31918af3e35..0f4c1c73a6a 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 867f6f66396..c351b3a57fb 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index fa21890531d..ca399ba5034 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 018d094d92f..28a4daacdc0 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 75dccb71a04..479574413a9 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 465e498bcb3..df45a7449a6 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -527,25 +527,25 @@ void __init iSeries_time_init_early(void) } #endif /* CONFIG_PPC_ISERIES */ -#if defined(CONFIG_PERF_COUNTERS) && defined(CONFIG_PPC32) -DEFINE_PER_CPU(u8, perf_counter_pending); +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32) +DEFINE_PER_CPU(u8, perf_event_pending); -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { - get_cpu_var(perf_counter_pending) = 1; + get_cpu_var(perf_event_pending) = 1; set_dec(1); - put_cpu_var(perf_counter_pending); + put_cpu_var(perf_event_pending); } -#define test_perf_counter_pending() __get_cpu_var(perf_counter_pending) -#define clear_perf_counter_pending() __get_cpu_var(perf_counter_pending) = 0 +#define test_perf_event_pending() __get_cpu_var(perf_event_pending) +#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 -#else /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ +#else /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ -#define test_perf_counter_pending() 0 -#define clear_perf_counter_pending() +#define test_perf_event_pending() 0 +#define clear_perf_event_pending() -#endif /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ +#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ /* * For iSeries shared processors, we have to let the hypervisor @@ -573,9 +573,9 @@ void timer_interrupt(struct pt_regs * regs) set_dec(DECREMENTER_MAX); #ifdef CONFIG_PPC32 - if (test_perf_counter_pending()) { - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (test_perf_event_pending()) { + clear_perf_event_pending(); + perf_event_do_pending(); } if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 830bef0a113..e7dae82c128 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +323,7 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9efc8bda01b..e382cae678b 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -280,9 +280,9 @@ config PPC_HAVE_PMU_SUPPORT config PPC_PERF_CTRS def_bool y - depends on PERF_COUNTERS && PPC_HAVE_PMU_SUPPORT + depends on PERF_EVENTS && PPC_HAVE_PMU_SUPPORT help - This enables the powerpc-specific perf_counter back-end. + This enables the powerpc-specific perf_event back-end. config SMP depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 1c866efd217..43c0acad716 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -94,7 +94,7 @@ config S390 select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK select INIT_ALL_POSSIBLE - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config SCHED_OMIT_FRAME_POINTER bool diff --git a/arch/s390/include/asm/perf_counter.h b/arch/s390/include/asm/perf_counter.h deleted file mode 100644 index 7015188c2cc..00000000000 --- a/arch/s390/include/asm/perf_counter.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Performance counter support - s390 specific definitions. - * - * Copyright 2009 Martin Schwidefsky, IBM Corporation. - */ - -static inline void set_perf_counter_pending(void) {} -static inline void clear_perf_counter_pending(void) {} - -#define PERF_COUNTER_INDEX_OFFSET 0 diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h new file mode 100644 index 00000000000..3840cbe7763 --- /dev/null +++ b/arch/s390/include/asm/perf_event.h @@ -0,0 +1,10 @@ +/* + * Performance event support - s390 specific definitions. + * + * Copyright 2009 Martin Schwidefsky, IBM Corporation. + */ + +static inline void set_perf_event_pending(void) {} +static inline void clear_perf_event_pending(void) {} + +#define PERF_EVENT_INDEX_OFFSET 0 diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index c80602d7c88..cb5232df151 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -268,7 +268,7 @@ #define __NR_preadv 328 #define __NR_pwritev 329 #define __NR_rt_tgsigqueueinfo 330 -#define __NR_perf_counter_open 331 +#define __NR_perf_event_open 331 #define NR_syscalls 332 /* diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 88a83366819..624790042d4 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -1832,11 +1832,11 @@ compat_sys_rt_tgsigqueueinfo_wrapper: llgtr %r5,%r5 # struct compat_siginfo * jg compat_sys_rt_tgsigqueueinfo_wrapper # branch to system call - .globl sys_perf_counter_open_wrapper -sys_perf_counter_open_wrapper: - llgtr %r2,%r2 # const struct perf_counter_attr * + .globl sys_perf_event_open_wrapper +sys_perf_event_open_wrapper: + llgtr %r2,%r2 # const struct perf_event_attr * lgfr %r3,%r3 # pid_t lgfr %r4,%r4 # int lgfr %r5,%r5 # int llgfr %r6,%r6 # unsigned long - jg sys_perf_counter_open # branch to system call + jg sys_perf_event_open # branch to system call diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index ad1acd20038..0b5083681e7 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -339,4 +339,4 @@ SYSCALL(sys_epoll_create1,sys_epoll_create1,sys_epoll_create1_wrapper) SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper) SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper) SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */ -SYSCALL(sys_perf_counter_open,sys_perf_counter_open,sys_perf_counter_open_wrapper) +SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 1abbadd497e..6d507462967 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -10,7 +10,7 @@ * Copyright (C) 1995 Linus Torvalds */ -#include +#include #include #include #include @@ -306,7 +306,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) * interrupts again and then search the VMAs */ local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); down_read(&mm->mmap_sem); si_code = SEGV_MAPERR; @@ -366,11 +366,11 @@ good_area: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 4df3570fe51..b940424f8cc 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,7 +16,7 @@ config SUPERH select HAVE_IOREMAP_PROT if MMU select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sh/include/asm/perf_counter.h b/arch/sh/include/asm/perf_counter.h deleted file mode 100644 index d8e6bb9c0cc..00000000000 --- a/arch/sh/include/asm/perf_counter.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __ASM_SH_PERF_COUNTER_H -#define __ASM_SH_PERF_COUNTER_H - -/* SH only supports software counters through this interface. */ -static inline void set_perf_counter_pending(void) {} - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#endif /* __ASM_SH_PERF_COUNTER_H */ diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h new file mode 100644 index 00000000000..11a302297ab --- /dev/null +++ b/arch/sh/include/asm/perf_event.h @@ -0,0 +1,9 @@ +#ifndef __ASM_SH_PERF_EVENT_H +#define __ASM_SH_PERF_EVENT_H + +/* SH only supports software events through this interface. */ +static inline void set_perf_event_pending(void) {} + +#define PERF_EVENT_INDEX_OFFSET 0 + +#endif /* __ASM_SH_PERF_EVENT_H */ diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h index 925dd40d9d5..f3fd1b9eb6b 100644 --- a/arch/sh/include/asm/unistd_32.h +++ b/arch/sh/include/asm/unistd_32.h @@ -344,7 +344,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #define NR_syscalls 337 diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h index 2b84bc916bc..343ce8f073e 100644 --- a/arch/sh/include/asm/unistd_64.h +++ b/arch/sh/include/asm/unistd_64.h @@ -384,7 +384,7 @@ #define __NR_preadv 361 #define __NR_pwritev 362 #define __NR_rt_tgsigqueueinfo 363 -#define __NR_perf_counter_open 364 +#define __NR_perf_event_open 364 #ifdef __KERNEL__ diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 16ba225ede8..19fd11dd987 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -352,4 +352,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S index af6fb7410c2..5bfde6c7749 100644 --- a/arch/sh/kernel/syscalls_64.S +++ b/arch/sh/kernel/syscalls_64.S @@ -390,4 +390,4 @@ sys_call_table: .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index 781b413ff82..47530104e0a 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -157,7 +157,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if ((regs->sr & SR_IMASK) != SR_IMASK) local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -208,11 +208,11 @@ survive: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 2dcc48528f7..de0b0e88182 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, /* Not an IO address, so reenable interrupts */ local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt or have no user @@ -201,11 +201,11 @@ survive: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 86b82348b97..97fca4695e0 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -25,7 +25,7 @@ config SPARC select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG @@ -47,7 +47,7 @@ config SPARC64 select RTC_DRV_BQ4802 select RTC_DRV_SUN4V select RTC_DRV_STARFIRE - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h deleted file mode 100644 index 5d7a8ca0e49..00000000000 --- a/arch/sparc/include/asm/perf_counter.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __ASM_SPARC_PERF_COUNTER_H -#define __ASM_SPARC_PERF_COUNTER_H - -extern void set_perf_counter_pending(void); - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -#else -static inline void init_hw_perf_counters(void) { } -#endif - -#endif diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h new file mode 100644 index 00000000000..7e2669894ce --- /dev/null +++ b/arch/sparc/include/asm/perf_event.h @@ -0,0 +1,14 @@ +#ifndef __ASM_SPARC_PERF_EVENT_H +#define __ASM_SPARC_PERF_EVENT_H + +extern void set_perf_event_pending(void); + +#define PERF_EVENT_INDEX_OFFSET 0 + +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +#else +static inline void init_hw_perf_events(void) { } +#endif + +#endif diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index 706df669f3b..42f2316c3ea 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -395,7 +395,7 @@ #define __NR_preadv 324 #define __NR_pwritev 325 #define __NR_rt_tgsigqueueinfo 326 -#define __NR_perf_counter_open 327 +#define __NR_perf_event_open 327 #define NR_SYSCALLS 328 diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 247cc620cee..3a048fad7ee 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -104,5 +104,5 @@ obj-$(CONFIG_AUDIT) += audit.o audit--$(CONFIG_AUDIT) := compat_audit.o obj-$(CONFIG_COMPAT) += $(audit--y) -pc--$(CONFIG_PERF_COUNTERS) := perf_counter.o +pc--$(CONFIG_PERF_EVENTS) := perf_event.o obj-$(CONFIG_SPARC64) += $(pc--y) diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 378eb53e077..b129611590a 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -265,7 +265,7 @@ int __init nmi_init(void) } } if (!err) - init_hw_perf_counters(); + init_hw_perf_events(); return err; } diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 68ff0010707..2d94e7a03af 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -15,7 +15,7 @@ /* This code is shared between various users of the performance * counters. Users will be oprofile, pseudo-NMI watchdog, and the - * perf_counter support layer. + * perf_event support layer. */ #define PCR_SUN4U_ENABLE (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE) @@ -42,14 +42,14 @@ void deferred_pcr_work_irq(int irq, struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); -#ifdef CONFIG_PERF_COUNTERS - perf_counter_do_pending(); +#ifdef CONFIG_PERF_EVENTS + perf_event_do_pending(); #endif irq_exit(); set_irq_regs(old_regs); } -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { set_softint(1 << PIL_DEFERRED_PCR_WORK); } diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c deleted file mode 100644 index b1265ce8a05..00000000000 --- a/arch/sparc/kernel/perf_counter.c +++ /dev/null @@ -1,556 +0,0 @@ -/* Performance counter support for sparc64. - * - * Copyright (C) 2009 David S. Miller - * - * This code is based almost entirely upon the x86 perf counter - * code, which is: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Sparc64 chips have two performance counters, 32-bits each, with - * overflow interrupts generated on transition from 0xffffffff to 0. - * The counters are accessed in one go using a 64-bit register. - * - * Both counters are controlled using a single control register. The - * only way to stop all sampling is to clear all of the context (user, - * supervisor, hypervisor) sampling enable bits. But these bits apply - * to both counters, thus the two counters can't be enabled/disabled - * individually. - * - * The control register has two event fields, one for each of the two - * counters. It's thus nearly impossible to have one counter going - * while keeping the other one stopped. Therefore it is possible to - * get overflow interrupts for counters not currently "in use" and - * that condition must be checked in the overflow interrupt handler. - * - * So we use a hack, in that we program inactive counters with the - * "sw_count0" and "sw_count1" events. These count how many times - * the instruction "sethi %hi(0xfc000), %g0" is executed. It's an - * unusual way to encode a NOP and therefore will not trigger in - * normal code. - */ - -#define MAX_HWCOUNTERS 2 -#define MAX_PERIOD ((1UL << 32) - 1) - -#define PIC_UPPER_INDEX 0 -#define PIC_LOWER_INDEX 1 - -struct cpu_hw_counters { - struct perf_counter *counters[MAX_HWCOUNTERS]; - unsigned long used_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)]; - unsigned long active_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)]; - int enabled; -}; -DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; - -struct perf_event_map { - u16 encoding; - u8 pic_mask; -#define PIC_NONE 0x00 -#define PIC_UPPER 0x01 -#define PIC_LOWER 0x02 -}; - -struct sparc_pmu { - const struct perf_event_map *(*event_map)(int); - int max_events; - int upper_shift; - int lower_shift; - int event_mask; - int hv_bit; - int irq_bit; - int upper_nop; - int lower_nop; -}; - -static const struct perf_event_map ultra3i_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER }, - [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER }, -}; - -static const struct perf_event_map *ultra3i_event_map(int event) -{ - return &ultra3i_perfmon_event_map[event]; -} - -static const struct sparc_pmu ultra3i_pmu = { - .event_map = ultra3i_event_map, - .max_events = ARRAY_SIZE(ultra3i_perfmon_event_map), - .upper_shift = 11, - .lower_shift = 4, - .event_mask = 0x3f, - .upper_nop = 0x1c, - .lower_nop = 0x14, -}; - -static const struct perf_event_map niagara2_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER }, -}; - -static const struct perf_event_map *niagara2_event_map(int event) -{ - return &niagara2_perfmon_event_map[event]; -} - -static const struct sparc_pmu niagara2_pmu = { - .event_map = niagara2_event_map, - .max_events = ARRAY_SIZE(niagara2_perfmon_event_map), - .upper_shift = 19, - .lower_shift = 6, - .event_mask = 0xfff, - .hv_bit = 0x8, - .irq_bit = 0x03, - .upper_nop = 0x220, - .lower_nop = 0x220, -}; - -static const struct sparc_pmu *sparc_pmu __read_mostly; - -static u64 event_encoding(u64 event, int idx) -{ - if (idx == PIC_UPPER_INDEX) - event <<= sparc_pmu->upper_shift; - else - event <<= sparc_pmu->lower_shift; - return event; -} - -static u64 mask_for_index(int idx) -{ - return event_encoding(sparc_pmu->event_mask, idx); -} - -static u64 nop_for_index(int idx) -{ - return event_encoding(idx == PIC_UPPER_INDEX ? - sparc_pmu->upper_nop : - sparc_pmu->lower_nop, idx); -} - -static inline void sparc_pmu_enable_counter(struct hw_perf_counter *hwc, - int idx) -{ - u64 val, mask = mask_for_index(idx); - - val = pcr_ops->read(); - pcr_ops->write((val & ~mask) | hwc->config); -} - -static inline void sparc_pmu_disable_counter(struct hw_perf_counter *hwc, - int idx) -{ - u64 mask = mask_for_index(idx); - u64 nop = nop_for_index(idx); - u64 val = pcr_ops->read(); - - pcr_ops->write((val & ~mask) | nop); -} - -void hw_perf_enable(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - int i; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - val = pcr_ops->read(); - - for (i = 0; i < MAX_HWCOUNTERS; i++) { - struct perf_counter *cp = cpuc->counters[i]; - struct hw_perf_counter *hwc; - - if (!cp) - continue; - hwc = &cp->hw; - val |= hwc->config_base; - } - - pcr_ops->write(val); -} - -void hw_perf_disable(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - - val = pcr_ops->read(); - val &= ~(PCR_UTRACE | PCR_STRACE | - sparc_pmu->hv_bit | sparc_pmu->irq_bit); - pcr_ops->write(val); -} - -static u32 read_pmc(int idx) -{ - u64 val; - - read_pic(val); - if (idx == PIC_UPPER_INDEX) - val >>= 32; - - return val & 0xffffffff; -} - -static void write_pmc(int idx, u64 val) -{ - u64 shift, mask, pic; - - shift = 0; - if (idx == PIC_UPPER_INDEX) - shift = 32; - - mask = ((u64) 0xffffffff) << shift; - val <<= shift; - - read_pic(pic); - pic &= ~mask; - pic |= val; - write_pic(pic); -} - -static int sparc_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int ret = 0; - - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - if (left > MAX_PERIOD) - left = MAX_PERIOD; - - atomic64_set(&hwc->prev_count, (u64)-left); - - write_pmc(idx, (u64)(-left) & 0xffffffff); - - perf_counter_update_userpage(counter); - - return ret; -} - -static int sparc_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - sparc_pmu_disable_counter(hwc, idx); - - cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active_mask); - - sparc_perf_counter_set_period(counter, hwc, idx); - sparc_pmu_enable_counter(hwc, idx); - perf_counter_update_userpage(counter); - return 0; -} - -static u64 sparc_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - int shift = 64 - 32; - u64 prev_raw_count, new_raw_count; - s64 delta; - -again: - prev_raw_count = atomic64_read(&hwc->prev_count); - new_raw_count = read_pmc(idx); - - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -static void sparc_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - clear_bit(idx, cpuc->active_mask); - sparc_pmu_disable_counter(hwc, idx); - - barrier(); - - sparc_perf_counter_update(counter, hwc, idx); - cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_counter_update_userpage(counter); -} - -static void sparc_pmu_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - sparc_perf_counter_update(counter, hwc, hwc->idx); -} - -static void sparc_pmu_unthrottle(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - sparc_pmu_enable_counter(hwc, hwc->idx); -} - -static atomic_t active_counters = ATOMIC_INIT(0); -static DEFINE_MUTEX(pmc_grab_mutex); - -void perf_counter_grab_pmc(void) -{ - if (atomic_inc_not_zero(&active_counters)) - return; - - mutex_lock(&pmc_grab_mutex); - if (atomic_read(&active_counters) == 0) { - if (atomic_read(&nmi_active) > 0) { - on_each_cpu(stop_nmi_watchdog, NULL, 1); - BUG_ON(atomic_read(&nmi_active) != 0); - } - atomic_inc(&active_counters); - } - mutex_unlock(&pmc_grab_mutex); -} - -void perf_counter_release_pmc(void) -{ - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_grab_mutex)) { - if (atomic_read(&nmi_active) == 0) - on_each_cpu(start_nmi_watchdog, NULL, 1); - mutex_unlock(&pmc_grab_mutex); - } -} - -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - perf_counter_release_pmc(); -} - -static int __hw_perf_counter_init(struct perf_counter *counter) -{ - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; - const struct perf_event_map *pmap; - u64 enc; - - if (atomic_read(&nmi_active) < 0) - return -ENODEV; - - if (attr->type != PERF_TYPE_HARDWARE) - return -EOPNOTSUPP; - - if (attr->config >= sparc_pmu->max_events) - return -EINVAL; - - perf_counter_grab_pmc(); - counter->destroy = hw_perf_counter_destroy; - - /* We save the enable bits in the config_base. So to - * turn off sampling just write 'config', and to enable - * things write 'config | config_base'. - */ - hwc->config_base = sparc_pmu->irq_bit; - if (!attr->exclude_user) - hwc->config_base |= PCR_UTRACE; - if (!attr->exclude_kernel) - hwc->config_base |= PCR_STRACE; - if (!attr->exclude_hv) - hwc->config_base |= sparc_pmu->hv_bit; - - if (!hwc->sample_period) { - hwc->sample_period = MAX_PERIOD; - hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); - } - - pmap = sparc_pmu->event_map(attr->config); - - enc = pmap->encoding; - if (pmap->pic_mask & PIC_UPPER) { - hwc->idx = PIC_UPPER_INDEX; - enc <<= sparc_pmu->upper_shift; - } else { - hwc->idx = PIC_LOWER_INDEX; - enc <<= sparc_pmu->lower_shift; - } - - hwc->config |= enc; - return 0; -} - -static const struct pmu pmu = { - .enable = sparc_pmu_enable, - .disable = sparc_pmu_disable, - .read = sparc_pmu_read, - .unthrottle = sparc_pmu_unthrottle, -}; - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - int err = __hw_perf_counter_init(counter); - - if (err) - return ERR_PTR(err); - return &pmu; -} - -void perf_counter_print_debug(void) -{ - unsigned long flags; - u64 pcr, pic; - int cpu; - - if (!sparc_pmu) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - - pcr = pcr_ops->read(); - read_pic(pic); - - pr_info("\n"); - pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n", - cpu, pcr, pic); - - local_irq_restore(flags); -} - -static int __kprobes perf_counter_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) -{ - struct die_args *args = __args; - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct pt_regs *regs; - int idx; - - if (!atomic_read(&active_counters)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - for (idx = 0; idx < MAX_HWCOUNTERS; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; - struct hw_perf_counter *hwc; - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - hwc = &counter->hw; - val = sparc_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << 31)) - continue; - - data.period = counter->hw.last_period; - if (!sparc_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - sparc_pmu_disable_counter(hwc, idx); - } - - return NOTIFY_STOP; -} - -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, -}; - -static bool __init supported_pmu(void) -{ - if (!strcmp(sparc_pmu_type, "ultra3i")) { - sparc_pmu = &ultra3i_pmu; - return true; - } - if (!strcmp(sparc_pmu_type, "niagara2")) { - sparc_pmu = &niagara2_pmu; - return true; - } - return false; -} - -void __init init_hw_perf_counters(void) -{ - pr_info("Performance counters: "); - - if (!supported_pmu()) { - pr_cont("No support for PMU type '%s'\n", sparc_pmu_type); - return; - } - - pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); - - /* All sparc64 PMUs currently have 2 counters. But this simple - * driver only supports one active counter at a time. - */ - perf_max_counters = 1; - - register_die_notifier(&perf_counter_nmi_notifier); -} diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c new file mode 100644 index 00000000000..2d6a1b10c81 --- /dev/null +++ b/arch/sparc/kernel/perf_event.c @@ -0,0 +1,556 @@ +/* Performance event support for sparc64. + * + * Copyright (C) 2009 David S. Miller + * + * This code is based almost entirely upon the x86 perf event + * code, which is: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Sparc64 chips have two performance counters, 32-bits each, with + * overflow interrupts generated on transition from 0xffffffff to 0. + * The counters are accessed in one go using a 64-bit register. + * + * Both counters are controlled using a single control register. The + * only way to stop all sampling is to clear all of the context (user, + * supervisor, hypervisor) sampling enable bits. But these bits apply + * to both counters, thus the two counters can't be enabled/disabled + * individually. + * + * The control register has two event fields, one for each of the two + * counters. It's thus nearly impossible to have one counter going + * while keeping the other one stopped. Therefore it is possible to + * get overflow interrupts for counters not currently "in use" and + * that condition must be checked in the overflow interrupt handler. + * + * So we use a hack, in that we program inactive counters with the + * "sw_count0" and "sw_count1" events. These count how many times + * the instruction "sethi %hi(0xfc000), %g0" is executed. It's an + * unusual way to encode a NOP and therefore will not trigger in + * normal code. + */ + +#define MAX_HWEVENTS 2 +#define MAX_PERIOD ((1UL << 32) - 1) + +#define PIC_UPPER_INDEX 0 +#define PIC_LOWER_INDEX 1 + +struct cpu_hw_events { + struct perf_event *events[MAX_HWEVENTS]; + unsigned long used_mask[BITS_TO_LONGS(MAX_HWEVENTS)]; + unsigned long active_mask[BITS_TO_LONGS(MAX_HWEVENTS)]; + int enabled; +}; +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; + +struct perf_event_map { + u16 encoding; + u8 pic_mask; +#define PIC_NONE 0x00 +#define PIC_UPPER 0x01 +#define PIC_LOWER 0x02 +}; + +struct sparc_pmu { + const struct perf_event_map *(*event_map)(int); + int max_events; + int upper_shift; + int lower_shift; + int event_mask; + int hv_bit; + int irq_bit; + int upper_nop; + int lower_nop; +}; + +static const struct perf_event_map ultra3i_perfmon_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER }, + [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER }, +}; + +static const struct perf_event_map *ultra3i_event_map(int event_id) +{ + return &ultra3i_perfmon_event_map[event_id]; +} + +static const struct sparc_pmu ultra3i_pmu = { + .event_map = ultra3i_event_map, + .max_events = ARRAY_SIZE(ultra3i_perfmon_event_map), + .upper_shift = 11, + .lower_shift = 4, + .event_mask = 0x3f, + .upper_nop = 0x1c, + .lower_nop = 0x14, +}; + +static const struct perf_event_map niagara2_perfmon_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER }, +}; + +static const struct perf_event_map *niagara2_event_map(int event_id) +{ + return &niagara2_perfmon_event_map[event_id]; +} + +static const struct sparc_pmu niagara2_pmu = { + .event_map = niagara2_event_map, + .max_events = ARRAY_SIZE(niagara2_perfmon_event_map), + .upper_shift = 19, + .lower_shift = 6, + .event_mask = 0xfff, + .hv_bit = 0x8, + .irq_bit = 0x03, + .upper_nop = 0x220, + .lower_nop = 0x220, +}; + +static const struct sparc_pmu *sparc_pmu __read_mostly; + +static u64 event_encoding(u64 event_id, int idx) +{ + if (idx == PIC_UPPER_INDEX) + event_id <<= sparc_pmu->upper_shift; + else + event_id <<= sparc_pmu->lower_shift; + return event_id; +} + +static u64 mask_for_index(int idx) +{ + return event_encoding(sparc_pmu->event_mask, idx); +} + +static u64 nop_for_index(int idx) +{ + return event_encoding(idx == PIC_UPPER_INDEX ? + sparc_pmu->upper_nop : + sparc_pmu->lower_nop, idx); +} + +static inline void sparc_pmu_enable_event(struct hw_perf_event *hwc, + int idx) +{ + u64 val, mask = mask_for_index(idx); + + val = pcr_ops->read(); + pcr_ops->write((val & ~mask) | hwc->config); +} + +static inline void sparc_pmu_disable_event(struct hw_perf_event *hwc, + int idx) +{ + u64 mask = mask_for_index(idx); + u64 nop = nop_for_index(idx); + u64 val = pcr_ops->read(); + + pcr_ops->write((val & ~mask) | nop); +} + +void hw_perf_enable(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + int i; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + val = pcr_ops->read(); + + for (i = 0; i < MAX_HWEVENTS; i++) { + struct perf_event *cp = cpuc->events[i]; + struct hw_perf_event *hwc; + + if (!cp) + continue; + hwc = &cp->hw; + val |= hwc->config_base; + } + + pcr_ops->write(val); +} + +void hw_perf_disable(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + + val = pcr_ops->read(); + val &= ~(PCR_UTRACE | PCR_STRACE | + sparc_pmu->hv_bit | sparc_pmu->irq_bit); + pcr_ops->write(val); +} + +static u32 read_pmc(int idx) +{ + u64 val; + + read_pic(val); + if (idx == PIC_UPPER_INDEX) + val >>= 32; + + return val & 0xffffffff; +} + +static void write_pmc(int idx, u64 val) +{ + u64 shift, mask, pic; + + shift = 0; + if (idx == PIC_UPPER_INDEX) + shift = 32; + + mask = ((u64) 0xffffffff) << shift; + val <<= shift; + + read_pic(pic); + pic &= ~mask; + pic |= val; + write_pic(pic); +} + +static int sparc_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + if (left > MAX_PERIOD) + left = MAX_PERIOD; + + atomic64_set(&hwc->prev_count, (u64)-left); + + write_pmc(idx, (u64)(-left) & 0xffffffff); + + perf_event_update_userpage(event); + + return ret; +} + +static int sparc_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + sparc_pmu_disable_event(hwc, idx); + + cpuc->events[idx] = event; + set_bit(idx, cpuc->active_mask); + + sparc_perf_event_set_period(event, hwc, idx); + sparc_pmu_enable_event(hwc, idx); + perf_event_update_userpage(event); + return 0; +} + +static u64 sparc_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + int shift = 64 - 32; + u64 prev_raw_count, new_raw_count; + s64 delta; + +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + new_raw_count = read_pmc(idx); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &event->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static void sparc_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + clear_bit(idx, cpuc->active_mask); + sparc_pmu_disable_event(hwc, idx); + + barrier(); + + sparc_perf_event_update(event, hwc, idx); + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +static void sparc_pmu_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + sparc_perf_event_update(event, hwc, hwc->idx); +} + +static void sparc_pmu_unthrottle(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + sparc_pmu_enable_event(hwc, hwc->idx); +} + +static atomic_t active_events = ATOMIC_INIT(0); +static DEFINE_MUTEX(pmc_grab_mutex); + +void perf_event_grab_pmc(void) +{ + if (atomic_inc_not_zero(&active_events)) + return; + + mutex_lock(&pmc_grab_mutex); + if (atomic_read(&active_events) == 0) { + if (atomic_read(&nmi_active) > 0) { + on_each_cpu(stop_nmi_watchdog, NULL, 1); + BUG_ON(atomic_read(&nmi_active) != 0); + } + atomic_inc(&active_events); + } + mutex_unlock(&pmc_grab_mutex); +} + +void perf_event_release_pmc(void) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_grab_mutex)) { + if (atomic_read(&nmi_active) == 0) + on_each_cpu(start_nmi_watchdog, NULL, 1); + mutex_unlock(&pmc_grab_mutex); + } +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + perf_event_release_pmc(); +} + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + const struct perf_event_map *pmap; + u64 enc; + + if (atomic_read(&nmi_active) < 0) + return -ENODEV; + + if (attr->type != PERF_TYPE_HARDWARE) + return -EOPNOTSUPP; + + if (attr->config >= sparc_pmu->max_events) + return -EINVAL; + + perf_event_grab_pmc(); + event->destroy = hw_perf_event_destroy; + + /* We save the enable bits in the config_base. So to + * turn off sampling just write 'config', and to enable + * things write 'config | config_base'. + */ + hwc->config_base = sparc_pmu->irq_bit; + if (!attr->exclude_user) + hwc->config_base |= PCR_UTRACE; + if (!attr->exclude_kernel) + hwc->config_base |= PCR_STRACE; + if (!attr->exclude_hv) + hwc->config_base |= sparc_pmu->hv_bit; + + if (!hwc->sample_period) { + hwc->sample_period = MAX_PERIOD; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } + + pmap = sparc_pmu->event_map(attr->config); + + enc = pmap->encoding; + if (pmap->pic_mask & PIC_UPPER) { + hwc->idx = PIC_UPPER_INDEX; + enc <<= sparc_pmu->upper_shift; + } else { + hwc->idx = PIC_LOWER_INDEX; + enc <<= sparc_pmu->lower_shift; + } + + hwc->config |= enc; + return 0; +} + +static const struct pmu pmu = { + .enable = sparc_pmu_enable, + .disable = sparc_pmu_disable, + .read = sparc_pmu_read, + .unthrottle = sparc_pmu_unthrottle, +}; + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + int err = __hw_perf_event_init(event); + + if (err) + return ERR_PTR(err); + return &pmu; +} + +void perf_event_print_debug(void) +{ + unsigned long flags; + u64 pcr, pic; + int cpu; + + if (!sparc_pmu) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + + pcr = pcr_ops->read(); + read_pic(pic); + + pr_info("\n"); + pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n", + cpu, pcr, pic); + + local_irq_restore(flags); +} + +static int __kprobes perf_event_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct pt_regs *regs; + int idx; + + if (!atomic_read(&active_events)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + for (idx = 0; idx < MAX_HWEVENTS; idx++) { + struct perf_event *event = cpuc->events[idx]; + struct hw_perf_event *hwc; + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + hwc = &event->hw; + val = sparc_perf_event_update(event, hwc, idx); + if (val & (1ULL << 31)) + continue; + + data.period = event->hw.last_period; + if (!sparc_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + sparc_pmu_disable_event(hwc, idx); + } + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, +}; + +static bool __init supported_pmu(void) +{ + if (!strcmp(sparc_pmu_type, "ultra3i")) { + sparc_pmu = &ultra3i_pmu; + return true; + } + if (!strcmp(sparc_pmu_type, "niagara2")) { + sparc_pmu = &niagara2_pmu; + return true; + } + return false; +} + +void __init init_hw_perf_events(void) +{ + pr_info("Performance events: "); + + if (!supported_pmu()) { + pr_cont("No support for PMU type '%s'\n", sparc_pmu_type); + return; + } + + pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); + + /* All sparc64 PMUs currently have 2 events. But this simple + * driver only supports one active event at a time. + */ + perf_max_events = 1; + + register_die_notifier(&perf_event_nmi_notifier); +} diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 04181577cb6..0f1658d3749 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -82,5 +82,5 @@ sys_call_table: /*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate /*315*/ .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv -/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open +/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 91b06b7f7ac..009825f6e73 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -83,7 +83,7 @@ sys_call_table32: /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv - .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_counter_open + .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open #endif /* CONFIG_COMPAT */ @@ -158,4 +158,4 @@ sys_call_table: /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv - .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open + .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51c59015b28..e4ff5d1280c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_COUNTERS if (!M386 && !M486) + select HAVE_PERF_EVENTS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ba331bfd111..74619c4f9fd 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -831,5 +831,5 @@ ia32_sys_call_table: .quad compat_sys_preadv .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_counter_open + .quad sys_perf_event_open ia32_syscall_end: diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 5e3f2044f0d..f5693c81a1d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h deleted file mode 100644 index e7b7c938ae2..00000000000 --- a/arch/x86/include/asm/perf_counter.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef _ASM_X86_PERF_COUNTER_H -#define _ASM_X86_PERF_COUNTER_H - -/* - * Performance counter hw details: - */ - -#define X86_PMC_MAX_GENERIC 8 -#define X86_PMC_MAX_FIXED 3 - -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_MAX 64 - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -/* - * Includes eventsel and unit mask as well: - */ -#define ARCH_PERFMON_EVENT_MASK 0xffff - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) - -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 - -/* - * Intel "Architectural Performance Monitoring" CPUID - * detection/enumeration details: - */ -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - -union cpuid10_edx { - struct { - unsigned int num_counters_fixed:4; - unsigned int reserved:28; - } split; - unsigned int full; -}; - - -/* - * Fixed-purpose performance counters: - */ - -/* - * All 3 fixed-mode PMCs are configured via this single MSR: - */ -#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d - -/* - * The counts are available in three separate MSRs: - */ - -/* Instr_Retired.Any: */ -#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) - -/* CPU_CLK_Unhalted.Core: */ -#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) - -/* CPU_CLK_Unhalted.Ref: */ -#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) - -/* - * We model BTS tracing as another fixed-mode PMC. - * - * We choose a value in the middle of the fixed counter range, since lower - * values are used by actual fixed counters and higher values are used - * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. - */ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) - - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(void); - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#else -static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(void) { } -#endif - -#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h new file mode 100644 index 00000000000..ad7ce3fd506 --- /dev/null +++ b/arch/x86/include/asm/perf_event.h @@ -0,0 +1,108 @@ +#ifndef _ASM_X86_PERF_EVENT_H +#define _ASM_X86_PERF_EVENT_H + +/* + * Performance event hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + +#define X86_PMC_IDX_GENERIC 0 +#define X86_PMC_IDX_FIXED 32 +#define X86_PMC_IDX_MAX 64 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK 0xffff + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 + +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_events:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +union cpuid10_edx { + struct { + unsigned int num_events_fixed:4; + unsigned int reserved:28; + } split; + unsigned int full; +}; + + +/* + * Fixed-purpose performance events: + */ + +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d + +/* + * The counts are available in three separate MSRs: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) + +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed event range, since lower + * values are used by actual fixed events and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. + */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +extern void perf_events_lapic_init(void); + +#define PERF_EVENT_INDEX_OFFSET 0 + +#else +static inline void init_hw_perf_events(void) { } +static inline void perf_events_lapic_init(void) { } +#endif + +#endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 8deaada61bc..6fb3c209a7e 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -341,7 +341,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index b9f3c60de5f..8d3ad0adbc6 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv) __SYSCALL(__NR_pwritev, sys_pwritev) #define __NR_rt_tgsigqueueinfo 297 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 298 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 298 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a34601f5298..754174d09de 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ -#include +#include #include #include #include @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include @@ -1189,7 +1189,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(); + perf_events_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 8dd30638fe4..68537e957a9 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2fea97eccf7..cc25c2b4a56 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include @@ -869,7 +869,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c deleted file mode 100644 index b1f115696c8..00000000000 --- a/arch/x86/kernel/cpu/perf_counter.c +++ /dev/null @@ -1,2298 +0,0 @@ -/* - * Performance counter x86 architecture code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static u64 perf_counter_mask __read_mostly; - -/* The maximal number of PEBS counters: */ -#define MAX_PEBS_COUNTERS 4 - -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 - -/* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) - -/* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) - - -/* - * Bits in the debugctlmsr controlling branch tracing. - */ -#define X86_DEBUGCTL_TR (1 << 6) -#define X86_DEBUGCTL_BTS (1 << 7) -#define X86_DEBUGCTL_BTINT (1 << 8) -#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) -#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) - -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; -}; - -struct cpu_hw_counters { - struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long interrupts; - int enabled; - struct debug_store *ds; -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(void); - void (*enable)(struct hw_perf_counter *, int); - void (*disable)(struct hw_perf_counter *, int); - unsigned eventsel; - unsigned perfctr; - u64 (*event_map)(int); - u64 (*raw_event)(u64); - int max_events; - int num_counters; - int num_counters_fixed; - int counter_bits; - u64 counter_mask; - int apic; - u64 max_period; - u64 intel_ctrl; - void (*enable_bts)(u64 config); - void (*disable_bts)(void); -}; - -static struct x86_pmu x86_pmu __read_mostly; - -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { - .enabled = 1, -}; - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Counter setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_COUNTER 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_COUNTER_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -static u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -static const u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_COUNTER_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static const u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_COUNTER_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - -/* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. - * Returns the delta events processed. - */ -static u64 -x86_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - int shift = 64 - x86_pmu.counter_bits; - u64 prev_raw_count, new_raw_count; - s64 delta; - - if (idx == X86_PMC_IDX_FIXED_BTS) - return 0; - - /* - * Careful: an NMI might modify the previous counter value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic counter atomically: - */ -again: - prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->counter_base + idx, new_raw_count); - - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (counter-)time and add that to the generic counter. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -static atomic_t active_counters; -static DEFINE_MUTEX(pmc_reserve_mutex); - -static bool reserve_pmc_hardware(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - int i; - - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) - goto perfctr_fail; - } - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) - goto eventsel_fail; - } -#endif - - return true; - -#ifdef CONFIG_X86_LOCAL_APIC -eventsel_fail: - for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); - - i = x86_pmu.num_counters; - -perfctr_fail: - for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); - - return false; -#endif -} - -static void release_pmc_hardware(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); - } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); -#endif -} - -static inline bool bts_available(void) -{ - return x86_pmu.enable_bts != NULL; -} - -static inline void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} - -static inline void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_counters, cpu).ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} - -static void release_bts_hardware(void) -{ - int cpu; - - if (!bts_available()) - return; - - get_online_cpus(); - - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_counters, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); - } - - put_online_cpus(); -} - -static int reserve_bts_hardware(void) -{ - int cpu, err = 0; - - if (!bts_available()) - return 0; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - - err = -ENOMEM; - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) { - kfree(buffer); - break; - } - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = - ds->bts_buffer_base + BTS_BUFFER_SIZE; - ds->bts_interrupt_threshold = - ds->bts_absolute_maximum - BTS_OVFL_TH; - - per_cpu(cpu_hw_counters, cpu).ds = ds; - err = 0; - } - - if (err) - release_bts_hardware(); - else { - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } - - put_online_cpus(); - - return err; -} - -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { - release_pmc_hardware(); - release_bts_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -static inline int x86_pmu_initialized(void) -{ - return x86_pmu.handle_irq != NULL; -} - -static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) -{ - unsigned int cache_type, cache_op, cache_result; - u64 config, val; - - config = attr->config; - - cache_type = (config >> 0) & 0xff; - if (cache_type >= PERF_COUNT_HW_CACHE_MAX) - return -EINVAL; - - cache_op = (config >> 8) & 0xff; - if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) - return -EINVAL; - - cache_result = (config >> 16) & 0xff; - if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - - if (val == 0) - return -ENOENT; - - if (val == -1) - return -EINVAL; - - hwc->config |= val; - - return 0; -} - -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - -/* - * Setup the hardware configuration for a given attr_type - */ -static int __hw_perf_counter_init(struct perf_counter *counter) -{ - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; - u64 config; - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = 0; - if (!atomic_inc_not_zero(&active_counters)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - err = reserve_bts_hardware(); - } - if (!err) - atomic_inc(&active_counters); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; - - counter->destroy = hw_perf_counter_destroy; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - hwc->config = ARCH_PERFMON_EVENTSEL_INT; - - /* - * Count user and OS events unless requested not to. - */ - if (!attr->exclude_user) - hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!attr->exclude_kernel) - hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - - if (!hwc->sample_period) { - hwc->sample_period = x86_pmu.max_period; - hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); - } else { - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * counters (user-space has to fall back and - * sample via a hrtimer based software counter): - */ - if (!x86_pmu.apic) - return -EOPNOTSUPP; - } - - /* - * Raw hw_event type provide the config in the hw_event structure - */ - if (attr->type == PERF_TYPE_RAW) { - hwc->config |= x86_pmu.raw_event(attr->config); - return 0; - } - - if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); - - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - - /* - * The generic map: - */ - config = x86_pmu.event_map(attr->config); - - if (config == 0) - return -ENOENT; - - if (config == -1LL) - return -EINVAL; - - /* - * Branch tracing: - */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { - /* BTS is not supported by this architecture. */ - if (!bts_available()) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - return -EOPNOTSUPP; - } - - hwc->config |= config; - - return 0; -} - -static void p6_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); -} - -static void amd_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int idx; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * counters proper, so that amd_pmu_enable_counter() does the - * right thing. - */ - barrier(); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } -} - -void hw_perf_disable(void) -{ - if (!x86_pmu_initialized()) - return; - return x86_pmu.disable_all(); -} - -static void p6_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long val; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_counter *counter = - cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!counter)) - return; - - intel_pmu_enable_bts(counter->hw.config); - } -} - -static void amd_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int idx; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - val = counter->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } -} - -void hw_perf_enable(void) -{ - if (!x86_pmu_initialized()) - return; - x86_pmu.enable_all(); -} - -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} - -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); -} - -static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val = P6_NOP_COUNTER; - - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - -static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); - return; - } - - x86_pmu_disable_counter(hwc, idx); -} - -static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - x86_pmu_disable_counter(hwc, idx); -} - -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); - -/* - * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the counter disabled in hw: - */ -static int -x86_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int err, ret = 0; - - if (idx == X86_PMC_IDX_FIXED_BTS) - return 0; - - /* - * If we are way outside a reasoable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - /* - * Quirk: certain CPUs dont like it if just 1 hw_event is left: - */ - if (unlikely(left < 2)) - left = 2; - - if (left > x86_pmu.max_period) - left = x86_pmu.max_period; - - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - - /* - * The hw counter starts counting from this counter offset, - * mark it to be able to extra future deltas: - */ - atomic64_set(&hwc->prev_count, (u64)-left); - - err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & x86_pmu.counter_mask); - - perf_counter_update_userpage(counter); - - return ret; -} - -static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - - -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_counters).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - x86_pmu_enable_counter(hwc, idx); -} - -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); -} - -static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; - - if (!x86_pmu.num_counters_fixed) - return -1; - - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} - -/* - * Find a PMC slot for the freshly enabled / scheduled in counter: - */ -static int x86_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx; - - idx = fixed_mode_idx(counter, hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->counter_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed counter, if that is already taken - * then try to get a generic counter: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that counter_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->counter_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_counters); - if (idx == x86_pmu.num_counters) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->counter_base = x86_pmu.perfctr; - } - - perf_counters_lapic_init(); - - x86_pmu.disable(hwc, idx); - - cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active_mask); - - x86_perf_counter_set_period(counter, hwc, idx); - x86_pmu.enable(hwc, idx); - - perf_counter_update_userpage(counter); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->counters[hwc->idx] != counter)) - return; - - x86_pmu.enable(hwc, hwc->idx); -} - -void perf_counter_print_debug(void) -{ - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - struct cpu_hw_counters *cpuc; - unsigned long flags; - int cpu, idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - if (x86_pmu.version >= 2) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - pr_info("\n"); - pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); - pr_info("CPU#%d: status: %016llx\n", cpu, status); - pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); - pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); - - prev_left = per_cpu(pmc_prev_left[idx], cpu); - - pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", - cpu, idx, pmc_ctrl); - pr_info("CPU#%d: gen-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - pr_info("CPU#%d: gen-PMC%d left: %016llx\n", - cpu, idx, prev_left); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - - pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - } - local_irq_restore(flags); -} - -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!counter) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = counter->hw.last_period; - data.addr = 0; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, counter, ®s); - - if (perf_output_begin(&handle, counter, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, counter); - } - - perf_output_end(&handle); - - /* There's new data available. */ - counter->hw.interrupts++; - counter->pending_kill = POLL_IN; -} - -static void x86_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); - - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the counter: - */ - barrier(); - - /* - * Drain the remaining delta count out of a counter - * that we are disabling: - */ - x86_perf_counter_update(counter, hwc, idx); - - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); - - cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_counter_update_userpage(counter); -} - -/* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: - */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - int ret; - - x86_perf_counter_update(counter, hwc, idx); - ret = x86_perf_counter_set_period(counter, hwc, idx); - - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - intel_pmu_enable_counter(hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); -} - -static int p6_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - counter = cpuc->counters[idx]; - hwc = &counter->hw; - - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; - - /* - * counter overflow - */ - handled = 1; - data.period = counter->hw.last_period; - - if (!x86_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - p6_pmu_disable_counter(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } - - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); - perf_counter_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(counter)) - continue; - - data.period = counter->hw.last_period; - - if (perf_counter_overflow(counter, 1, &data, regs)) - intel_pmu_disable_counter(&counter->hw, bit); - } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; -} - -static int amd_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - counter = cpuc->counters[idx]; - hwc = &counter->hw; - - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; - - /* - * counter overflow - */ - handled = 1; - data.period = counter->hw.last_period; - - if (!x86_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - amd_pmu_disable_counter(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_counter_do_pending(); - irq_exit(); -} - -void set_perf_counter_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - -void perf_counters_lapic_init(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - /* - * Always use NMI for PMU - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif -} - -static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) -{ - struct die_args *args = __args; - struct pt_regs *regs; - - if (!atomic_read(&active_counters)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - case DIE_NMI_IPI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; - -#ifdef CONFIG_X86_LOCAL_APIC - apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif - /* - * Can't rely on the handled return value to say it was our NMI, two - * counters could trigger 'simultaneously' raising two back-to-back NMIs. - * - * If the first NMI handles both, the latter will be empty and daze - * the CPU. - */ - x86_pmu.handle_irq(regs); - - return NOTIFY_STOP; -} - -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, - .next = NULL, - .priority = 1 -}; - -static struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = p6_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_counter, - .disable = p6_pmu_disable_counter, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_counters = 2, - /* - * Counters have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a counter for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .counter_bits = 32, - .counter_mask = (1ULL << 32) - 1, -}; - -static struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_counter, - .disable = intel_pmu_disable_counter, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, -}; - -static struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_counter, - .disable = amd_pmu_disable_counter, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, - .counter_bits = 48, - .counter_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, -}; - -static int p6_pmu_init(void) -{ - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - break; - case 9: - case 13: - /* Pentium M */ - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - x86_pmu = p6_pmu; - - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } - - return 0; -} - -static int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - return -ENODEV; - - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; - - /* - * Quirk: v2 perfmon does not report fixed-purpose counters, so - * assume at least 3 counters: - */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Core2 events, "); - break; - default: - case 26: - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Nehalem/Corei7 events, "); - break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Atom events, "); - break; - } - return 0; -} - -static int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} - -void __init init_hw_perf_counters(void) -{ - int err; - - pr_info("Performance Counters: "); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_pmu_init(); - break; - case X86_VENDOR_AMD: - err = amd_pmu_init(); - break; - default: - return; - } - if (err != 0) { - pr_cont("no PMU driver, software counters only.\n"); - return; - } - - pr_cont("%s PMU driver.\n", x86_pmu.name); - - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - perf_counter_mask = (1 << x86_pmu.num_counters) - 1; - perf_max_counters = x86_pmu.num_counters; - - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - perf_counter_mask |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_counter_mask; - - perf_counters_lapic_init(); - register_die_notifier(&perf_counter_nmi_notifier); - - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); - pr_info("... generic counters: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); - pr_info("... counter mask: %016Lx\n", perf_counter_mask); -} - -static inline void x86_pmu_read(struct perf_counter *counter) -{ - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); -} - -static const struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, -}; - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - int err; - - err = __hw_perf_counter_init(counter); - if (err) { - if (counter->destroy) - counter->destroy(counter); - return ERR_PTR(err); - } - - return &pmu; -} - -/* - * callchain support - */ - -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); - - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); - - return 0; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry *entry = data; - - if (per_cpu(in_nmi_frame, smp_processor_id())) - return; - - if (reliable) - callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, -}; - -#include "../dumpstack.h" - -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); - - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); -} - -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page, type); - memcpy(to, map+offset, size); - kunmap_atomic(map, type); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - unsigned long bytes; - - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); - - return bytes == sizeof(*frame); -} - -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - struct stack_frame frame; - const void __user *fp; - - if (!user_mode(regs)) - regs = task_pt_regs(current); - - fp = (void __user *)regs->bp; - - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); - - while (entry->nr < PERF_MAX_STACK_DEPTH) { - frame.next_frame = NULL; - frame.return_address = 0; - - if (!copy_stack_frame(fp, &frame)) - break; - - if ((unsigned long)fp < regs->sp) - break; - - callchain_store(entry, frame.return_address); - fp = frame.next_frame; - } -} - -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} - -void hw_perf_counter_setup_online(int cpu) -{ - init_debug_store_on_cpu(cpu); -} diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c new file mode 100644 index 00000000000..0d03629fb1a --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.c @@ -0,0 +1,2298 @@ +/* + * Performance events x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static u64 perf_event_mask __read_mostly; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +struct cpu_hw_events { + struct perf_event *events[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long interrupts; + int enabled; + struct debug_store *ds; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(void); + void (*enable)(struct hw_perf_event *, int); + void (*disable)(struct hw_perf_event *, int); + unsigned eventsel; + unsigned perfctr; + u64 (*event_map)(int); + u64 (*raw_event)(u64); + int max_events; + int num_events; + int num_events_fixed; + int event_bits; + u64 event_mask; + int apic; + u64 max_period; + u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, +}; + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 hw_event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_REG_MASK) + + return hw_event & P6_EVNTSEL_MASK; +} + + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ + CORE_EVNTSEL_REG_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. + * Returns the delta events processed. + */ +static u64 +x86_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + int shift = 64 - x86_pmu.event_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; + + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->event_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &event->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static atomic_t active_events; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < x86_pmu.num_events; i++) { + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_events; i++) { + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + goto eventsel_fail; + } +#endif + + return true; + +#ifdef CONFIG_X86_LOCAL_APIC +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu.eventsel + i); + + i = x86_pmu.num_events; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu.perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +#endif +} + +static void release_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + for (i = 0; i < x86_pmu.num_events; i++) { + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +#endif +} + +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_events, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); + release_bts_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + u64 config; + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + err = reserve_bts_hardware(); + } + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + event->destroy = hw_perf_event_destroy; + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to. + */ + if (!attr->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!attr->exclude_kernel) + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + + if (!hwc->sample_period) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; + } + + /* + * Raw hw_event type provide the config in the hw_event structure + */ + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); + return 0; + } + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + + /* + * The generic map: + */ + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + /* + * Branch tracing: + */ + if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && + (hwc->sample_period == 1)) { + /* BTS is not supported by this architecture. */ + if (!bts_available()) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + return -EOPNOTSUPP; + } + + hwc->config |= config; + + return 0; +} + +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * events proper, so that amd_pmu_enable_event() does the + * right thing. + */ + barrier(); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_disable(void) +{ + if (!x86_pmu_initialized()) + return; + return x86_pmu.disable_all(); +} + +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static void amd_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + val = event->hw.config; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_enable(void) +{ + if (!x86_pmu_initialized()) + return; + x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static inline void +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_event(hwc, idx); +} + +static inline void +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + x86_pmu_disable_event(hwc, idx); +} + +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +static int +x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int err, ret = 0; + + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + /* + * Quirk: certain CPUs dont like it if just 1 hw_event is left: + */ + if (unlikely(left < 2)) + left = 2; + + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)-left); + + err = checking_wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.event_mask); + + perf_event_update_userpage(event); + + return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + + +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_event(hwc, idx); +} + +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->enabled) + x86_pmu_enable_event(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) +{ + unsigned int hw_event; + + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + + if (!x86_pmu.num_events_fixed) + return -1; + + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + + return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = fixed_mode_idx(event, hwc); + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* BTS is already occupied. */ + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + hwc->config_base = 0; + hwc->event_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { + /* + * Try to get the fixed event, if that is already taken + * then try to get a generic event: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic event again */ + if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: + idx = find_first_zero_bit(cpuc->used_mask, + x86_pmu.num_events); + if (idx == x86_pmu.num_events) + return -EAGAIN; + + set_bit(idx, cpuc->used_mask); + hwc->idx = idx; + } + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } + + perf_events_lapic_init(); + + x86_pmu.disable(hwc, idx); + + cpuc->events[idx] = event; + set_bit(idx, cpuc->active_mask); + + x86_perf_event_set_period(event, hwc, idx); + x86_pmu.enable(hwc, idx); + + perf_event_update_userpage(event); + + return 0; +} + +static void x86_pmu_unthrottle(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->events[hwc->idx] != event)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + +void perf_event_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + struct cpu_hw_events *cpuc; + unsigned long flags; + int cpu, idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_events, cpu); + + if (x86_pmu.version >= 2) { + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + } + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); + + prev_left = per_cpu(pmc_prev_left[idx], cpu); + + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", + cpu, idx, prev_left); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } + local_irq_restore(flags); +} + +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + + data.period = event->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static void x86_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active_mask); + x86_pmu.disable(hwc, idx); + + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the event: + */ + barrier(); + + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); + + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + int ret; + + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); + + if (event->state == PERF_EVENT_STATE_ACTIVE) + intel_pmu_enable_event(hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) + continue; + + /* + * event overflow + */ + handled = 1; + data.period = event->hw.last_period; + + if (!x86_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + p6_pmu_disable_event(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + perf_disable(); + intel_pmu_drain_bts_buffer(cpuc); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) + continue; + + /* + * event overflow + */ + handled = 1; + data.period = event->hw.last_period; + + if (!x86_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + amd_pmu_disable_event(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_event_do_pending(); + irq_exit(); +} + +void set_perf_event_pending(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif +} + +void perf_events_lapic_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + + /* + * Always use NMI for PMU + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif +} + +static int __kprobes +perf_event_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (!atomic_read(&active_events)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + +#ifdef CONFIG_X86_LOCAL_APIC + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif + /* + * Can't rely on the handled return value to say it was our NMI, two + * events could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, + .next = NULL, + .priority = 1 +}; + +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_events = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, +}; + +static struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, +}; + +static struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, +}; + +static int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + if (!cpu_has_apic) { + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; + } + + return 0; +} + +static int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + return -ENODEV; + + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Core2 events, "); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Atom events, "); + break; + } + return 0; +} + +static int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +void __init init_hw_perf_events(void) +{ + int err; + + pr_info("Performance Events: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + return; + } + if (err != 0) { + pr_cont("no PMU driver, software events only.\n"); + return; + } + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_events, X86_PMC_MAX_GENERIC); + x86_pmu.num_events = X86_PMC_MAX_GENERIC; + } + perf_event_mask = (1 << x86_pmu.num_events) - 1; + perf_max_events = x86_pmu.num_events; + + if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; + } + + perf_event_mask |= + ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_event_mask; + + perf_events_lapic_init(); + register_die_notifier(&perf_event_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic events: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); +} + +static inline void x86_pmu_read(struct perf_event *event) +{ + x86_perf_event_update(event, &event->hw, event->hw.idx); +} + +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, +}; + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + int err; + + err = __hw_perf_event_init(event); + if (err) { + if (event->destroy) + event->destroy(event); + return ERR_PTR(err); + } + + return &pmu; +} + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ + if (entry->nr < PERF_MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +#include "../dumpstack.h" + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + callchain_store(entry, PERF_CONTEXT_KERNEL); + callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + unsigned long bytes; + + bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + + return bytes == sizeof(*frame); +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + if (!user_mode(regs)) + regs = task_pt_regs(current); + + fp = (void __user *)regs->bp; + + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, regs->ip); + + while (entry->nr < PERF_MAX_STACK_DEPTH) { + frame.next_frame = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < regs->sp) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_frame; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(pmc_nmi_entry); + else + entry = &__get_cpu_var(pmc_irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} + +void hw_perf_event_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 392bea43b89..fab786f60ed 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d59fe323807..681c3fda739 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1021,7 +1021,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 300883112e3..40f30773fb2 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -208,7 +208,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafd..0157cd26d7c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 775a020990a..82728f2c6d5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,7 +10,7 @@ #include /* max_low_pfn */ #include /* __kprobes, ... */ #include /* kmmio_handler, ... */ -#include /* perf_swcounter_event */ +#include /* perf_sw_event */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ @@ -1017,7 +1017,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1114,11 +1114,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4899215999d..8eb05878554 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void) if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { eax.split.version_id = 2; - eax.split.num_counters = 2; + eax.split.num_events = 2; eax.split.bit_width = 40; } - num_counters = eax.split.num_counters; + num_counters = eax.split.num_events; op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index b83776180c7..7b8e75d1608 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -13,7 +13,7 @@ #define OP_X86_MODEL_H #include -#include +#include struct op_msr { unsigned long addr; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 50eecfe1d72..44203ff599d 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -252,7 +252,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty) struct pt_regs *regs = get_irq_regs(); if (regs) show_regs(regs); - perf_counter_print_debug(); + perf_event_print_debug(); } static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, diff --git a/fs/exec.c b/fs/exec.c index 172ceb6edde..434dba778cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include @@ -923,7 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_counter_comm(tsk); + perf_event_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) @@ -997,7 +997,7 @@ int flush_old_exec(struct linux_binprm * bprm) * security domain: */ if (!get_dumpable(current->mm)) - perf_counter_exit_task(current); + perf_event_exit_task(current); /* An exec changes our domain. We are no longer part of the thread group */ diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 1125e5a1ee5..d76b66acea9 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -620,8 +620,8 @@ __SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_rt_tgsigqueueinfo 240 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 241 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 241 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #undef __NR_syscalls #define __NR_syscalls 242 diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9e7f2e8fc66..21a6f5d9af2 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -106,13 +106,13 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_mutex = \ - __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \ - .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#ifdef CONFIG_PERF_EVENTS +# define INIT_PERF_EVENTS(tsk) \ + .perf_event_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_event_mutex), \ + .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list), #else -# define INIT_PERF_COUNTERS(tsk) +# define INIT_PERF_EVENTS(tsk) #endif /* @@ -178,7 +178,7 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ + INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h deleted file mode 100644 index f6486273267..00000000000 --- a/include/linux/perf_counter.h +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Performance counters: - * - * Copyright (C) 2008-2009, Thomas Gleixner - * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra - * - * Data type definitions, declarations, prototypes. - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * For licencing details see kernel-base/COPYING - */ -#ifndef _LINUX_PERF_COUNTER_H -#define _LINUX_PERF_COUNTER_H - -#include -#include -#include - -/* - * User-space ABI bits: - */ - -/* - * attr.type - */ -enum perf_type_id { - PERF_TYPE_HARDWARE = 0, - PERF_TYPE_SOFTWARE = 1, - PERF_TYPE_TRACEPOINT = 2, - PERF_TYPE_HW_CACHE = 3, - PERF_TYPE_RAW = 4, - - PERF_TYPE_MAX, /* non-ABI */ -}; - -/* - * Generalized performance counter event types, used by the - * attr.event_id parameter of the sys_perf_counter_open() - * syscall: - */ -enum perf_hw_id { - /* - * Common hardware events, generalized by the kernel: - */ - PERF_COUNT_HW_CPU_CYCLES = 0, - PERF_COUNT_HW_INSTRUCTIONS = 1, - PERF_COUNT_HW_CACHE_REFERENCES = 2, - PERF_COUNT_HW_CACHE_MISSES = 3, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_HW_BRANCH_MISSES = 5, - PERF_COUNT_HW_BUS_CYCLES = 6, - - PERF_COUNT_HW_MAX, /* non-ABI */ -}; - -/* - * Generalized hardware cache counters: - * - * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x - * { read, write, prefetch } x - * { accesses, misses } - */ -enum perf_hw_cache_id { - PERF_COUNT_HW_CACHE_L1D = 0, - PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_LL = 2, - PERF_COUNT_HW_CACHE_DTLB = 3, - PERF_COUNT_HW_CACHE_ITLB = 4, - PERF_COUNT_HW_CACHE_BPU = 5, - - PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ = 0, - PERF_COUNT_HW_CACHE_OP_WRITE = 1, - PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - - PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_result_id { - PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, - PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - - PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ -}; - -/* - * Special "software" counters provided by the kernel, even if the hardware - * does not support performance counters. These counters measure various - * physical and sw events of the kernel (and allow the profiling of them as - * well): - */ -enum perf_sw_ids { - PERF_COUNT_SW_CPU_CLOCK = 0, - PERF_COUNT_SW_TASK_CLOCK = 1, - PERF_COUNT_SW_PAGE_FAULTS = 2, - PERF_COUNT_SW_CONTEXT_SWITCHES = 3, - PERF_COUNT_SW_CPU_MIGRATIONS = 4, - PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, - PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, - - PERF_COUNT_SW_MAX, /* non-ABI */ -}; - -/* - * Bits that can be set in attr.sample_type to request information - * in the overflow packets. - */ -enum perf_counter_sample_format { - PERF_SAMPLE_IP = 1U << 0, - PERF_SAMPLE_TID = 1U << 1, - PERF_SAMPLE_TIME = 1U << 2, - PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_READ = 1U << 4, - PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_ID = 1U << 6, - PERF_SAMPLE_CPU = 1U << 7, - PERF_SAMPLE_PERIOD = 1U << 8, - PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_RAW = 1U << 10, - - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ -}; - -/* - * The format of the data returned by read() on a perf counter fd, - * as specified by attr.read_format: - * - * struct read_format { - * { u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 id; } && PERF_FORMAT_ID - * } && !PERF_FORMAT_GROUP - * - * { u64 nr; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 value; - * { u64 id; } && PERF_FORMAT_ID - * } cntr[nr]; - * } && PERF_FORMAT_GROUP - * }; - */ -enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, - PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, - PERF_FORMAT_ID = 1U << 2, - PERF_FORMAT_GROUP = 1U << 3, - - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ -}; - -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ - -/* - * Hardware event to monitor via a performance monitoring counter: - */ -struct perf_counter_attr { - - /* - * Major type: hardware/software/tracepoint/etc. - */ - __u32 type; - - /* - * Size of the attr structure, for fwd/bwd compat. - */ - __u32 size; - - /* - * Type specific configuration information. - */ - __u64 config; - - union { - __u64 sample_period; - __u64 sample_freq; - }; - - __u64 sample_type; - __u64 read_format; - - __u64 disabled : 1, /* off by default */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ - mmap : 1, /* include mmap data */ - comm : 1, /* include comm data */ - freq : 1, /* use freq, not period */ - inherit_stat : 1, /* per task counts */ - enable_on_exec : 1, /* next exec enables */ - task : 1, /* trace fork/exit */ - watermark : 1, /* wakeup_watermark */ - - __reserved_1 : 49; - - union { - __u32 wakeup_events; /* wakeup every n events */ - __u32 wakeup_watermark; /* bytes before wakeup */ - }; - __u32 __reserved_2; - - __u64 __reserved_3; -}; - -/* - * Ioctls that can be done on a perf counter fd: - */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) -#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) -#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) -#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) - -enum perf_counter_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, -}; - -/* - * Structure of the page that can be mapped via mmap - */ -struct perf_counter_mmap_page { - __u32 version; /* version number of this structure */ - __u32 compat_version; /* lowest version this is compat with */ - - /* - * Bits needed to read the hw counters in user-space. - * - * u32 seq; - * s64 count; - * - * do { - * seq = pc->lock; - * - * barrier() - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; - * - * barrier(); - * } while (pc->lock != seq); - * - * NOTE: for obvious reason this only works on self-monitoring - * processes. - */ - __u32 lock; /* seqlock for synchronization */ - __u32 index; /* hardware counter identifier */ - __s64 offset; /* add to hardware counter value */ - __u64 time_enabled; /* time counter active */ - __u64 time_running; /* time counter on cpu */ - - /* - * Hole for extension of the self monitor capabilities - */ - - __u64 __reserved[123]; /* align to 1k */ - - /* - * Control data for the mmap() data buffer. - * - * User-space reading the @data_head value should issue an rmb(), on - * SMP capable platforms, after reading this value -- see - * perf_counter_wakeup(). - * - * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data. In this case - * the kernel will not over-write unread data. - */ - __u64 data_head; /* head in the data section */ - __u64 data_tail; /* user-space written tail */ -}; - -#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) -#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (2 << 0) -#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) - -struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; -}; - -enum perf_event_type { - - /* - * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. They have the following structure: - * - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; - * }; - */ - PERF_EVENT_MMAP = 1, - - /* - * struct { - * struct perf_event_header header; - * u64 id; - * u64 lost; - * }; - */ - PERF_EVENT_LOST = 2, - - /* - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * char comm[]; - * }; - */ - PERF_EVENT_COMM = 3, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * u64 time; - * }; - */ - PERF_EVENT_EXIT = 4, - - /* - * struct { - * struct perf_event_header header; - * u64 time; - * u64 id; - * u64 stream_id; - * }; - */ - PERF_EVENT_THROTTLE = 5, - PERF_EVENT_UNTHROTTLE = 6, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * { u64 time; } && PERF_SAMPLE_TIME - * }; - */ - PERF_EVENT_FORK = 7, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, tid; - * - * struct read_format values; - * }; - */ - PERF_EVENT_READ = 8, - - /* - * struct { - * struct perf_event_header header; - * - * { u64 ip; } && PERF_SAMPLE_IP - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 addr; } && PERF_SAMPLE_ADDR - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU - * { u64 period; } && PERF_SAMPLE_PERIOD - * - * { struct read_format values; } && PERF_SAMPLE_READ - * - * { u64 nr, - * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN - * - * # - * # The RAW record below is opaque data wrt the ABI - * # - * # That is, the ABI doesn't make any promises wrt to - * # the stability of its content, it may vary depending - * # on event, hardware, kernel version and phase of - * # the moon. - * # - * # In other words, PERF_SAMPLE_RAW contents are not an ABI. - * # - * - * { u32 size; - * char data[size];}&& PERF_SAMPLE_RAW - * }; - */ - PERF_EVENT_SAMPLE = 9, - - PERF_EVENT_MAX, /* non-ABI */ -}; - -enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, - - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, - - PERF_CONTEXT_MAX = (__u64)-4095, -}; - -#define PERF_FLAG_FD_NO_GROUP (1U << 0) -#define PERF_FLAG_FD_OUTPUT (1U << 1) - -#ifdef __KERNEL__ -/* - * Kernel-internal data types and definitions: - */ - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define PERF_MAX_STACK_DEPTH 255 - -struct perf_callchain_entry { - __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; -}; - -struct perf_raw_record { - u32 size; - void *data; -}; - -struct task_struct; - -/** - * struct hw_perf_counter - performance counter hardware details: - */ -struct hw_perf_counter { -#ifdef CONFIG_PERF_COUNTERS - union { - struct { /* hardware */ - u64 config; - unsigned long config_base; - unsigned long counter_base; - int idx; - }; - union { /* software */ - atomic64_t count; - struct hrtimer hrtimer; - }; - }; - atomic64_t prev_count; - u64 sample_period; - u64 last_period; - atomic64_t period_left; - u64 interrupts; - - u64 freq_count; - u64 freq_interrupts; - u64 freq_stamp; -#endif -}; - -struct perf_counter; - -/** - * struct pmu - generic performance monitoring unit - */ -struct pmu { - int (*enable) (struct perf_counter *counter); - void (*disable) (struct perf_counter *counter); - void (*read) (struct perf_counter *counter); - void (*unthrottle) (struct perf_counter *counter); -}; - -/** - * enum perf_counter_active_state - the states of a counter - */ -enum perf_counter_active_state { - PERF_COUNTER_STATE_ERROR = -2, - PERF_COUNTER_STATE_OFF = -1, - PERF_COUNTER_STATE_INACTIVE = 0, - PERF_COUNTER_STATE_ACTIVE = 1, -}; - -struct file; - -struct perf_mmap_data { - struct rcu_head rcu_head; - int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ - int nr_locked; /* nr pages mlocked */ - - atomic_t poll; /* POLL_ for wakeups */ - atomic_t events; /* event limit */ - - atomic_long_t head; /* write position */ - atomic_long_t done_head; /* completed head */ - - atomic_t lock; /* concurrent writes */ - atomic_t wakeup; /* needs a wakeup */ - atomic_t lost; /* nr records lost */ - - long watermark; /* wakeup watermark */ - - struct perf_counter_mmap_page *user_page; - void *data_pages[0]; -}; - -struct perf_pending_entry { - struct perf_pending_entry *next; - void (*func)(struct perf_pending_entry *); -}; - -/** - * struct perf_counter - performance counter kernel representation: - */ -struct perf_counter { -#ifdef CONFIG_PERF_COUNTERS - struct list_head group_entry; - struct list_head event_entry; - struct list_head sibling_list; - int nr_siblings; - struct perf_counter *group_leader; - struct perf_counter *output; - const struct pmu *pmu; - - enum perf_counter_active_state state; - atomic64_t count; - - /* - * These are the total time in nanoseconds that the counter - * has been enabled (i.e. eligible to run, and the task has - * been scheduled in, if this is a per-task counter) - * and running (scheduled onto the CPU), respectively. - * - * They are computed from tstamp_enabled, tstamp_running and - * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. - */ - u64 total_time_enabled; - u64 total_time_running; - - /* - * These are timestamps used for computing total_time_enabled - * and total_time_running when the counter is in INACTIVE or - * ACTIVE state, measured in nanoseconds from an arbitrary point - * in time. - * tstamp_enabled: the notional time when the counter was enabled - * tstamp_running: the notional time when the counter was scheduled on - * tstamp_stopped: in INACTIVE state, the notional time when the - * counter was scheduled off. - */ - u64 tstamp_enabled; - u64 tstamp_running; - u64 tstamp_stopped; - - struct perf_counter_attr attr; - struct hw_perf_counter hw; - - struct perf_counter_context *ctx; - struct file *filp; - - /* - * These accumulate total time (in nanoseconds) that children - * counters have been enabled and running, respectively. - */ - atomic64_t child_total_time_enabled; - atomic64_t child_total_time_running; - - /* - * Protect attach/detach and child_list: - */ - struct mutex child_mutex; - struct list_head child_list; - struct perf_counter *parent; - - int oncpu; - int cpu; - - struct list_head owner_entry; - struct task_struct *owner; - - /* mmap bits */ - struct mutex mmap_mutex; - atomic_t mmap_count; - struct perf_mmap_data *data; - - /* poll related */ - wait_queue_head_t waitq; - struct fasync_struct *fasync; - - /* delayed work for NMIs and such */ - int pending_wakeup; - int pending_kill; - int pending_disable; - struct perf_pending_entry pending; - - atomic_t event_limit; - - void (*destroy)(struct perf_counter *); - struct rcu_head rcu_head; - - struct pid_namespace *ns; - u64 id; -#endif -}; - -/** - * struct perf_counter_context - counter context structure - * - * Used as a container for task counters and CPU counters as well: - */ -struct perf_counter_context { - /* - * Protect the states of the counters in the list, - * nr_active, and the list: - */ - spinlock_t lock; - /* - * Protect the list of counters. Locking either mutex or lock - * is sufficient to ensure the list doesn't change; to change - * the list you need to lock both the mutex and the spinlock. - */ - struct mutex mutex; - - struct list_head group_list; - struct list_head event_list; - int nr_counters; - int nr_active; - int is_active; - int nr_stat; - atomic_t refcount; - struct task_struct *task; - - /* - * Context clock, runs when context enabled. - */ - u64 time; - u64 timestamp; - - /* - * These fields let us detect when two contexts have both - * been cloned (inherited) from a common ancestor. - */ - struct perf_counter_context *parent_ctx; - u64 parent_gen; - u64 generation; - int pin_count; - struct rcu_head rcu_head; -}; - -/** - * struct perf_counter_cpu_context - per cpu counter context structure - */ -struct perf_cpu_context { - struct perf_counter_context ctx; - struct perf_counter_context *task_ctx; - int active_oncpu; - int max_pertask; - int exclusive; - - /* - * Recursion avoidance: - * - * task, softirq, irq, nmi context - */ - int recursion[4]; -}; - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - -#ifdef CONFIG_PERF_COUNTERS - -/* - * Set by architecture code: - */ -extern int perf_max_counters; - -extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); - -extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); -extern void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu); -extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern int perf_counter_init_task(struct task_struct *child); -extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_free_task(struct task_struct *task); -extern void set_perf_counter_pending(void); -extern void perf_counter_do_pending(void); -extern void perf_counter_print_debug(void); -extern void __perf_disable(void); -extern bool __perf_enable(void); -extern void perf_disable(void); -extern void perf_enable(void); -extern int perf_counter_task_disable(void); -extern int perf_counter_task_enable(void); -extern int hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu); -extern void perf_counter_update_userpage(struct perf_counter *counter); - -struct perf_sample_data { - u64 type; - - u64 ip; - struct { - u32 pid; - u32 tid; - } tid_entry; - u64 time; - u64 addr; - u64 id; - u64 stream_id; - struct { - u32 cpu; - u32 reserved; - } cpu_entry; - u64 period; - struct perf_callchain_entry *callchain; - struct perf_raw_record *raw; -}; - -extern void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter); -extern void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter, - struct pt_regs *regs); - -extern int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs); - -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return (counter->attr.type != PERF_TYPE_RAW) && - (counter->attr.type != PERF_TYPE_HARDWARE) && - (counter->attr.type != PERF_TYPE_HW_CACHE); -} - -extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); - -static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) -{ - if (atomic_read(&perf_swcounter_enabled[event])) - __perf_swcounter_event(event, nr, nmi, regs, addr); -} - -extern void __perf_counter_mmap(struct vm_area_struct *vma); - -static inline void perf_counter_mmap(struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_EXEC) - __perf_counter_mmap(vma); -} - -extern void perf_counter_comm(struct task_struct *tsk); -extern void perf_counter_fork(struct task_struct *tsk); - -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); - -extern int sysctl_perf_counter_paranoid; -extern int sysctl_perf_counter_mlock; -extern int sysctl_perf_counter_sample_rate; - -extern void perf_counter_init(void); -extern void perf_tpcounter_event(int event_id, u64 addr, u64 count, - void *record, int entry_size); - -#ifndef perf_misc_flags -#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ - PERF_EVENT_MISC_KERNEL) -#define perf_instruction_pointer(regs) instruction_pointer(regs) -#endif - -extern int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample); -extern void perf_output_end(struct perf_output_handle *handle); -extern void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len); -#else -static inline void -perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -static inline void -perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) { } -static inline void -perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline int perf_counter_init_task(struct task_struct *child) { return 0; } -static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_free_task(struct task_struct *task) { } -static inline void perf_counter_do_pending(void) { } -static inline void perf_counter_print_debug(void) { } -static inline void perf_disable(void) { } -static inline void perf_enable(void) { } -static inline int perf_counter_task_disable(void) { return -EINVAL; } -static inline int perf_counter_task_enable(void) { return -EINVAL; } - -static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) { } - -static inline void perf_counter_mmap(struct vm_area_struct *vma) { } -static inline void perf_counter_comm(struct task_struct *tsk) { } -static inline void perf_counter_fork(struct task_struct *tsk) { } -static inline void perf_counter_init(void) { } - -#endif - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -#endif /* __KERNEL__ */ -#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h new file mode 100644 index 00000000000..ae9d9ed6df2 --- /dev/null +++ b/include/linux/perf_event.h @@ -0,0 +1,858 @@ +/* + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_EVENT_H +#define _LINUX_PERF_EVENT_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * Generalized performance event event_id types, used by the + * attr.event_id parameter of the sys_perf_event_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache events: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" events provided by the kernel, even if the hardware + * does not support performance events. These events measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. + */ +enum perf_event_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ +}; + +/* + * The format of the data returned by read() on a perf event fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_event_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + +/* + * Hardware event_id to monitor via a performance monitoring event: + */ +struct perf_event_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. + */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + + __reserved_1 : 49; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + __u32 __reserved_2; + + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf event fd: + */ +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) + +enum perf_event_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw events in user-space. + * + * u32 seq; + * s64 count; + * + * do { + * seq = pc->lock; + * + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware event identifier */ + __s64 offset; /* add to hardware event value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on cpu */ + + /* + * Hole for extension of the self monitor capabilities + */ + + __u64 __reserved[123]; /* align to 1k */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an rmb(), on + * SMP capable platforms, after reading this value -- see + * perf_event_wakeup(). + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data. In this case + * the kernel will not over-write unread data. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ +}; + +#define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (2 << 0) +#define PERF_RECORD_MISC_HYPERVISOR (3 << 0) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +enum perf_event_type { + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ + PERF_RECORD_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * }; + */ + PERF_RECORD_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_RECORD_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * }; + */ + PERF_RECORD_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * }; + */ + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * { u64 time; } && PERF_SAMPLE_TIME + * }; + */ + PERF_RECORD_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * }; + */ + PERF_RECORD_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event_id, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * }; + */ + PERF_RECORD_SAMPLE = 9, + + PERF_RECORD_MAX, /* non-ABI */ +}; + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) + +#ifdef __KERNEL__ +/* + * Kernel-internal data types and definitions: + */ + +#ifdef CONFIG_PERF_EVENTS +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PERF_MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + __u64 nr; + __u64 ip[PERF_MAX_STACK_DEPTH]; +}; + +struct perf_raw_record { + u32 size; + void *data; +}; + +struct task_struct; + +/** + * struct hw_perf_event - performance event hardware details: + */ +struct hw_perf_event { +#ifdef CONFIG_PERF_EVENTS + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long event_base; + int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; + atomic64_t prev_count; + u64 sample_period; + u64 last_period; + atomic64_t period_left; + u64 interrupts; + + u64 freq_count; + u64 freq_interrupts; + u64 freq_stamp; +#endif +}; + +struct perf_event; + +/** + * struct pmu - generic performance monitoring unit + */ +struct pmu { + int (*enable) (struct perf_event *event); + void (*disable) (struct perf_event *event); + void (*read) (struct perf_event *event); + void (*unthrottle) (struct perf_event *event); +}; + +/** + * enum perf_event_active_state - the states of a event + */ +enum perf_event_active_state { + PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_OFF = -1, + PERF_EVENT_STATE_INACTIVE = 0, + PERF_EVENT_STATE_ACTIVE = 1, +}; + +struct file; + +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; /* nr of data pages */ + int writable; /* are we writable */ + int nr_locked; /* nr pages mlocked */ + + atomic_t poll; /* POLL_ for wakeups */ + atomic_t events; /* event_id limit */ + + atomic_long_t head; /* write position */ + atomic_long_t done_head; /* completed head */ + + atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + atomic_t lost; /* nr records lost */ + + long watermark; /* wakeup watermark */ + + struct perf_event_mmap_page *user_page; + void *data_pages[0]; +}; + +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); +}; + +/** + * struct perf_event - performance event kernel representation: + */ +struct perf_event { +#ifdef CONFIG_PERF_EVENTS + struct list_head group_entry; + struct list_head event_entry; + struct list_head sibling_list; + int nr_siblings; + struct perf_event *group_leader; + struct perf_event *output; + const struct pmu *pmu; + + enum perf_event_active_state state; + atomic64_t count; + + /* + * These are the total time in nanoseconds that the event + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task event) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the event is in INACTIVE or ACTIVE state. + */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the event is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the event was enabled + * tstamp_running: the notional time when the event was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * event was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + + struct perf_event_attr attr; + struct hw_perf_event hw; + + struct perf_event_context *ctx; + struct file *filp; + + /* + * These accumulate total time (in nanoseconds) that children + * events have been enabled and running, respectively. + */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + + /* + * Protect attach/detach and child_list: + */ + struct mutex child_mutex; + struct list_head child_list; + struct perf_event *parent; + + int oncpu; + int cpu; + + struct list_head owner_entry; + struct task_struct *owner; + + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; + + /* poll related */ + wait_queue_head_t waitq; + struct fasync_struct *fasync; + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_kill; + int pending_disable; + struct perf_pending_entry pending; + + atomic_t event_limit; + + void (*destroy)(struct perf_event *); + struct rcu_head rcu_head; + + struct pid_namespace *ns; + u64 id; +#endif +}; + +/** + * struct perf_event_context - event context structure + * + * Used as a container for task events and CPU events as well: + */ +struct perf_event_context { + /* + * Protect the states of the events in the list, + * nr_active, and the list: + */ + spinlock_t lock; + /* + * Protect the list of events. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; + + struct list_head group_list; + struct list_head event_list; + int nr_events; + int nr_active; + int is_active; + int nr_stat; + atomic_t refcount; + struct task_struct *task; + + /* + * Context clock, runs when context enabled. + */ + u64 time; + u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor. + */ + struct perf_event_context *parent_ctx; + u64 parent_gen; + u64 generation; + int pin_count; + struct rcu_head rcu_head; +}; + +/** + * struct perf_event_cpu_context - per cpu event context structure + */ +struct perf_cpu_context { + struct perf_event_context ctx; + struct perf_event_context *task_ctx; + int active_oncpu; + int max_pertask; + int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; +}; + +struct perf_output_handle { + struct perf_event *event; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int sample; + int locked; + unsigned long flags; +}; + +#ifdef CONFIG_PERF_EVENTS + +/* + * Set by architecture code: + */ +extern int perf_max_events; + +extern const struct pmu *hw_perf_event_init(struct perf_event *event); + +extern void perf_event_task_sched_in(struct task_struct *task, int cpu); +extern void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); +extern void perf_event_task_tick(struct task_struct *task, int cpu); +extern int perf_event_init_task(struct task_struct *child); +extern void perf_event_exit_task(struct task_struct *child); +extern void perf_event_free_task(struct task_struct *task); +extern void set_perf_event_pending(void); +extern void perf_event_do_pending(void); +extern void perf_event_print_debug(void); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); +extern int perf_event_task_disable(void); +extern int perf_event_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu); +extern void perf_event_update_userpage(struct perf_event *event); + +struct perf_sample_data { + u64 type; + + u64 ip; + struct { + u32 pid; + u32 tid; + } tid_entry; + u64 time; + u64 addr; + u64 id; + u64 stream_id; + struct { + u32 cpu; + u32 reserved; + } cpu_entry; + u64 period; + struct perf_callchain_entry *callchain; + struct perf_raw_record *raw; +}; + +extern void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event); +extern void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs); + +extern int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs); + +/* + * Return 1 for a software event, 0 for a hardware event + */ +static inline int is_software_event(struct perf_event *event) +{ + return (event->attr.type != PERF_TYPE_RAW) && + (event->attr.type != PERF_TYPE_HARDWARE) && + (event->attr.type != PERF_TYPE_HW_CACHE); +} + +extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); + +static inline void +perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + if (atomic_read(&perf_swevent_enabled[event_id])) + __perf_sw_event(event_id, nr, nmi, regs, addr); +} + +extern void __perf_event_mmap(struct vm_area_struct *vma); + +static inline void perf_event_mmap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_EXEC) + __perf_event_mmap(vma); +} + +extern void perf_event_comm(struct task_struct *tsk); +extern void perf_event_fork(struct task_struct *tsk); + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + +extern int sysctl_perf_event_paranoid; +extern int sysctl_perf_event_mlock; +extern int sysctl_perf_event_sample_rate; + +extern void perf_event_init(void); +extern void perf_tp_event(int event_id, u64 addr, u64 count, + void *record, int entry_size); + +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ + PERF_RECORD_MISC_KERNEL) +#define perf_instruction_pointer(regs) instruction_pointer(regs) +#endif + +extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample); +extern void perf_output_end(struct perf_output_handle *handle); +extern void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len); +#else +static inline void +perf_event_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { } +static inline void +perf_event_task_tick(struct task_struct *task, int cpu) { } +static inline int perf_event_init_task(struct task_struct *child) { return 0; } +static inline void perf_event_exit_task(struct task_struct *child) { } +static inline void perf_event_free_task(struct task_struct *task) { } +static inline void perf_event_do_pending(void) { } +static inline void perf_event_print_debug(void) { } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } +static inline int perf_event_task_disable(void) { return -EINVAL; } +static inline int perf_event_task_enable(void) { return -EINVAL; } + +static inline void +perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } + +static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_comm(struct task_struct *tsk) { } +static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_init(void) { } + +#endif + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + +#endif /* __KERNEL__ */ +#endif /* _LINUX_PERF_EVENT_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index b00df4c79c6..07bff666e65 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,7 +85,7 @@ #define PR_SET_TIMERSLACK 29 #define PR_GET_TIMERSLACK 30 -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8af3d249170..8b265a8986d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,7 +100,7 @@ struct robust_list_head; struct bio; struct fs_struct; struct bts_context; -struct perf_counter_context; +struct perf_event_context; /* * List of flags we want to share for kernel threads, @@ -701,7 +701,7 @@ struct user_struct { #endif #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif }; @@ -1449,10 +1449,10 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif -#ifdef CONFIG_PERF_COUNTERS - struct perf_counter_context *perf_counter_ctxp; - struct mutex perf_counter_mutex; - struct list_head perf_counter_list; +#ifdef CONFIG_PERF_EVENTS + struct perf_event_context *perf_event_ctxp; + struct mutex perf_event_mutex; + struct list_head perf_event_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a8e37821cc6..02f19f9a76c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_attr; +struct perf_event_attr; #include #include @@ -885,7 +885,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage long sys_perf_counter_open( - struct perf_counter_attr __user *attr_uptr, +asmlinkage long sys_perf_event_open( + struct perf_event_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 72a3b437b82..ec91e78244f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -378,7 +378,7 @@ static inline int ftrace_get_offsets_##call( \ #ifdef CONFIG_EVENT_PROFILE /* - * Generate the functions needed for tracepoint perf_counter support. + * Generate the functions needed for tracepoint perf_event support. * * NOTE: The insertion profile callback (ftrace_profile_) is defined later * @@ -656,7 +656,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * { * struct ftrace_data_offsets_ __maybe_unused __data_offsets; * struct ftrace_event_call *event_call = &event_; - * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * extern void perf_tp_event(int, u64, u64, void *, int); * struct ftrace_raw_##call *entry; * u64 __addr = 0, __count = 1; * unsigned long irq_flags; @@ -691,7 +691,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * * <- affect our values * - * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * perf_tp_event(event_call->id, __addr, __count, entry, * __entry_size); <- submit them to perf counter * } while (0); * @@ -712,7 +712,7 @@ static void ftrace_profile_##call(proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_event_call *event_call = &event_##call; \ - extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ @@ -742,7 +742,7 @@ static void ftrace_profile_##call(proto) \ \ { assign; } \ \ - perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + perf_tp_event(event_call->id, __addr, __count, entry,\ __entry_size); \ } while (0); \ \ diff --git a/init/Kconfig b/init/Kconfig index 8e8b76d8a27..cfdf5c32280 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -915,17 +915,17 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. -config HAVE_PERF_COUNTERS +config HAVE_PERF_EVENTS bool help See tools/perf/design.txt for details. menu "Performance Counters" -config PERF_COUNTERS +config PERF_EVENTS bool "Kernel Performance Counters" default y if PROFILING - depends on HAVE_PERF_COUNTERS + depends on HAVE_PERF_EVENTS select ANON_INODES help Enable kernel support for performance counter hardware. @@ -947,7 +947,7 @@ config PERF_COUNTERS config EVENT_PROFILE bool "Tracepoint profiling sources" - depends on PERF_COUNTERS && EVENT_TRACING + depends on PERF_EVENTS && EVENT_TRACING default y help Allow the use of tracepoints as software performance counters. diff --git a/kernel/Makefile b/kernel/Makefile index 3d9c7e27e3f..e26a546eac4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,7 +96,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/exit.c b/kernel/exit.c index ae5d8660ddf..e47ee8a0613 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -981,7 +981,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index bfee931ee3f..2cebfb23b0b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include @@ -1078,7 +1078,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1253,7 +1253,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1280,7 +1280,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c deleted file mode 100644 index 62de0db8092..00000000000 --- a/kernel/perf_counter.c +++ /dev/null @@ -1,5000 +0,0 @@ -/* - * Performance counter core code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Each CPU has a list of per CPU counters: - */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_counters __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; - -/* - * perf counter paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu counters for unpriv - * 2 - disallow kernel profiling for unpriv - */ -int sysctl_perf_counter_paranoid __read_mostly = 1; - -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_counter_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_counter_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_counter_paranoid > 1; -} - -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ - -/* - * max perf counter sample rate - */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; - -static atomic64_t perf_counter_id; - -/* - * Lock for (sysadmin-configurable) counter reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); - -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} - -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } - -int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - return 0; -} - -void __weak perf_counter_print_debug(void) { } - -static DEFINE_PER_CPU(int, perf_disable_count); - -void __perf_disable(void) -{ - __get_cpu_var(perf_disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(perf_disable_count); -} - -void perf_disable(void) -{ - __perf_disable(); - hw_perf_disable(); -} - -void perf_enable(void) -{ - if (__perf_enable()) - hw_perf_enable(); -} - -static void get_ctx(struct perf_counter_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_counter_context *ctx; - - ctx = container_of(head, struct perf_counter_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_counter_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_counter_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -/* - * If we inherit counters we want to return the parent counter id - * to userspace. - */ -static u64 primary_counter_id(struct perf_counter *counter) -{ - u64 id = counter->id; - - if (counter->parent) - id = counter->parent->id; - - return id; -} - -/* - * Get the perf_counter_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_counter_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) -{ - struct perf_counter_context *ctx; - - rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_counter_ctxp); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) -{ - struct perf_counter_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_counter_context *ctx) -{ - unsigned long flags; - - spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); -} - -/* - * Add a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *group_leader = counter->group_leader; - - /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group - * leader's sibling list: - */ - if (group_leader == counter) - list_add_tail(&counter->group_entry, &ctx->group_list); - else { - list_add_tail(&counter->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - } - - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Remove a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *sibling, *tmp; - - if (list_empty(&counter->group_entry)) - return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) - ctx->nr_stat--; - - list_del_init(&counter->group_entry); - list_del_rcu(&counter->event_entry); - - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; - - /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them - * to the context list directly: - */ - list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, group_entry) { - - list_move_tail(&sibling->group_entry, &ctx->group_list); - sibling->group_leader = sibling; - } -} - -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; - } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter_sched_out(group_counter, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) - counter_sched_out(counter, cpuctx, ctx); - - if (group_counter->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance counter - * - * We disable the counter on the hardware level first. After that we - * remove it from the context list. - */ -static void __perf_counter_remove_from_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. - */ - perf_disable(); - - counter_sched_out(counter, cpuctx, ctx); - - list_del_counter(counter, ctx); - - if (!ctx->task) { - /* - * Allow more per task counters with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); - } - - perf_enable(); - spin_unlock(&ctx->lock); -} - - -/* - * Remove the counter from a task's (or a CPU's) list of counters. - * - * Must be called with ctx->mutex held. - * - * CPU counters are removed with a smp call. For task counters we only - * call when the task is on a CPU. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_counter_remove_from_context(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are removed via an smp call and - * the removal is always sucessful. - */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * If the context is active we need to retry the smp call. - */ - if (ctx->nr_active && !list_empty(&counter->group_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not - * succeed. - */ - if (!list_empty(&counter->group_entry)) { - list_del_counter(counter, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_counter_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a counter. - */ -static void update_counter_times(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - u64 run_end; - - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) - return; - - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; - - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time; - - counter->total_time_running = run_end - counter->tstamp_running; -} - -/* - * Update total_time_enabled and total_time_running for all counters in a group. - */ -static void update_group_times(struct perf_counter *leader) -{ - struct perf_counter *counter; - - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, group_entry) - update_counter_times(counter); -} - -/* - * Cross CPU call to disable a performance counter - */ -static void __perf_counter_disable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - - /* - * If the counter is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx); - update_group_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); - else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock(&ctx->lock); -} - -/* - * Disable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. - */ -static void perf_counter_disable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); - return; - } - - retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); - - spin_lock_irq(&ctx->lock); - /* - * If the counter is still active, we need to retry the cross-call. - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock_irq(&ctx->lock); -} - -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - if (counter->state <= PERF_COUNTER_STATE_OFF) - return 0; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; - return -EAGAIN; - } - - counter->tstamp_running += ctx->time - counter->tstamp_stopped; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (counter->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - - list_for_each_entry(counter, &leader->sibling_list, group_entry) - if (!is_software_counter(counter)) - return 0; - - return 1; -} - -/* - * Work out whether we can put this counter group on the CPU now. - */ -static int group_can_go_on(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software counters can always go on. - */ - if (is_software_only_group(counter)) - return 1; - /* - * If an exclusive group is already on, no other hardware - * counters can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * counters on the CPU, it can't go on. - */ - if (counter->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; -} - -/* - * Cross CPU call to install and enable a performance counter - * - * Must be called with ctx->mutex held - */ -static void __perf_install_in_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int cpu = smp_processor_id(); - int err; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); - - add_counter_to_ctx(counter, ctx); - - /* - * Don't put the counter on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. - */ - if (!group_can_go_on(counter, cpuctx, 1)) - err = -EEXIST; - else - err = counter_sched_in(counter, cpuctx, ctx, cpu); - - if (err) { - /* - * This counter couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - - spin_unlock(&ctx->lock); -} - -/* - * Attach a performance counter to a context - * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. - * - * If the counter is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. - */ -static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, - int cpu) -{ - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are installed via an smp call and - * the install is always sucessful. - */ - smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_install_in_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * we need to retry the smp call. - */ - if (ctx->is_active && list_empty(&counter->group_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not - * succeed. - */ - if (list_empty(&counter->group_entry)) - add_counter_to_ctx(counter, ctx); - spin_unlock_irq(&ctx->lock); -} - -/* - * Put a counter into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_counter_mark_enabled(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - struct perf_counter *sub; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - list_for_each_entry(sub, &counter->sibling_list, group_entry) - if (sub->state >= PERF_COUNTER_STATE_INACTIVE) - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; -} - -/* - * Cross CPU call to enable a performance counter - */ -static void __perf_counter_enable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int err; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto unlock; - __perf_counter_mark_enabled(counter, ctx); - - /* - * If the counter is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(counter, cpuctx, 1)) { - err = -EEXIST; - } else { - perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - perf_enable(); - } - - if (err) { - /* - * If this counter can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - unlock: - spin_unlock(&ctx->lock); -} - -/* - * Enable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. - */ -static void perf_counter_enable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); - return; - } - - spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto out; - - /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; - - retry: - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); - - spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the counter is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) - goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_OFF) - __perf_counter_mark_enabled(counter, ctx); - - out: - spin_unlock_irq(&ctx->lock); -} - -static int perf_counter_refresh(struct perf_counter *counter, int refresh) -{ - /* - * not supported on inherited counters - */ - if (counter->attr.inherit) - return -EINVAL; - - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); - - return 0; -} - -void __perf_counter_sched_out(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_counter *counter; - - spin_lock(&ctx->lock); - ctx->is_active = 0; - if (likely(!ctx->nr_counters)) - goto out; - update_context_time(ctx); - - perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); - else - group_sched_out(counter, cpuctx, ctx); - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters - * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_counter_read(void *counter); - -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) -{ - u64 value; - - if (!counter->attr.inherit_stat) - return; - - /* - * Update the counter value, we cannot use perf_counter_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we - * don't need to use it. - */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); - break; - - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the counter - * values when we flip the contexts. - */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); - - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) -{ - struct perf_counter *counter, *next_counter; - - if (!ctx->nr_stat) - return; - - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); - - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); - - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { - - __perf_counter_sync_stat(counter, next_counter); - - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); - } -} - -/* - * Called from scheduler to remove the counters of the current task, - * with interrupts disabled. - * - * We stop each counter and update the counter value in counter->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. - */ -void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; - struct pt_regs *regs; - int do_switch = 1; - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); - - if (likely(!ctx || !cpuctx->task_ctx)) - return; - - update_context_time(ctx); - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp - */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_counter_sync_stat(ctx, next_ctx); - } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; - } -} - -/* - * Called with IRQs disabled - */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) -{ - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); -} - -static void -__perf_counter_sched_in(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter *counter; - int can_add_hw = 1; - - spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_counters)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) - continue; - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); - else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); - } - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; - } - } - - list_for_each_entry(counter, &ctx->group_list, group_entry) { - /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. - */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) - continue; - - /* - * Listen to the 'cpu' scheduling filter constraint - * of counters: - */ - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_counter_sched_in(ctx, cpuctx, cpu); - cpuctx->task_ctx = ctx; -} - -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter_context *ctx = &cpuctx->ctx; - - __perf_counter_sched_in(ctx, cpuctx, cpu); -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_counter *counter, int enable); - -static void perf_adjust_period(struct perf_counter *counter, u64 events) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period, sample_period; - s64 delta; - - events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; -} - -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - struct hw_perf_counter *hwc; - u64 interrupts, freq; - - spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - continue; - - hwc = &counter->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle counters on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; - } - - if (!counter->attr.freq || !counter->attr.sample_freq) - continue; - - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(counter, freq * interrupts); - - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - counter->pmu->disable(counter); - atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); - perf_enable(); - } - } - spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's counters: - */ -static void rotate_ctx(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (!ctx->nr_counters) - return; - - spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group counters too): - */ - perf_disable(); - list_for_each_entry(counter, &ctx->group_list, group_entry) { - list_move_tail(&counter->group_entry, &ctx->group_list); - break; - } - perf_enable(); - - spin_unlock(&ctx->lock); -} - -void perf_counter_task_tick(struct task_struct *curr, int cpu) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - - if (!atomic_read(&nr_counters)) - return; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; - - perf_ctx_adjust_freq(&cpuctx->ctx); - if (ctx) - perf_ctx_adjust_freq(ctx); - - perf_counter_cpu_sched_out(cpuctx); - if (ctx) - __perf_counter_task_sched_out(ctx); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - perf_counter_cpu_sched_in(cpuctx, cpu); - if (ctx) - perf_counter_task_sched_in(curr, cpu); -} - -/* - * Enable all of a task's counters that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_counter_enable_on_exec(struct task_struct *task) -{ - struct perf_counter_context *ctx; - struct perf_counter *counter; - unsigned long flags; - int enabled = 0; - - local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) - goto out; - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (!counter->attr.enable_on_exec) - continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - continue; - __perf_counter_mark_enabled(counter, ctx); - enabled = 1; - } - - /* - * Unclone this context if we enabled any counter. - */ - if (enabled) - unclone_ctx(ctx); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(task, smp_processor_id()); - out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware counter - */ -static void __perf_counter_read(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); - local_irq_restore(flags); -} - -static u64 perf_counter_read(struct perf_counter *counter) -{ - /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); - } - - return atomic64_read(&counter->count); -} - -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); - ctx->task = task; -} - -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) -{ - struct perf_counter_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - /* - * If cpu is not a wildcard then this is a percpu counter: - */ - if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - if (cpu < 0 || cpu > num_possible_cpus()) - return ERR_PTR(-EINVAL); - - /* - * We could be clever and allow to attach a counter to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_isset(cpu, cpu_online_map)) - return ERR_PTR(-ENODEV); - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - - return ctx; - } - - rcu_read_lock(); - if (!pid) - task = current; - else - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* - * Can't attach counters to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - retry: - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - err = -ENOMEM; - if (!ctx) - goto errout; - __perf_counter_init_context(ctx, task); - get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ - kfree(ctx); - goto retry; - } - get_task_struct(task); - } - - put_task_struct(task); - return ctx; - - errout: - put_task_struct(task); - return ERR_PTR(err); -} - -static void free_counter_rcu(struct rcu_head *head) -{ - struct perf_counter *counter; - - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); -} - -static void perf_pending_sync(struct perf_counter *counter); - -static void free_counter(struct perf_counter *counter) -{ - perf_pending_sync(counter); - - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); - } - - if (counter->output) { - fput(counter->output->filp); - counter->output = NULL; - } - - if (counter->destroy) - counter->destroy(counter); - - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); -} - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; - - file->private_data = NULL; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&ctx->mutex); - - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); - - free_counter(counter); - - return 0; -} - -static int perf_counter_read_size(struct perf_counter *counter) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - -static u64 perf_counter_read_value(struct perf_counter *counter) -{ - struct perf_counter *child; - u64 total = 0; - - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); - - return total; -} - -static int perf_counter_read_entry(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} - -static int perf_counter_read_group(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - struct perf_counter *leader = counter->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_counter_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; - - size += err; - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_counter_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; - - size += err; - } - - return size; -} - -static int perf_counter_read_one(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - u64 values[4]; - int n = 0; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance counter - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) -{ - u64 read_format = counter->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a counter that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - return 0; - - if (count < perf_counter_read_size(counter)) - return -ENOSPC; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); - else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_counter *counter = file->private_data; - - return perf_read_hw(counter, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_counter *counter = file->private_data; - struct perf_mmap_data *data; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - events = atomic_xchg(&data->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &counter->waitq, wait); - - return events; -} - -static void perf_counter_reset(struct perf_counter *counter) -{ - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); -} - -/* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. - */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter *child; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) - func(child); - mutex_unlock(&counter->child_mutex); -} - -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, group_entry) - perf_counter_for_each_child(counter, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) -{ - struct perf_counter_context *ctx = counter->ctx; - unsigned long size; - int ret = 0; - u64 value; - - if (!counter->attr.sample_period) - return -EINVAL; - - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) - return -EFAULT; - - if (!value) - return -EINVAL; - - spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - counter->attr.sample_freq = value; - } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; - } -unlock: - spin_unlock_irq(&ctx->lock); - - return ret; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); - u32 flags = arg; - - switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; - break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; - break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; - break; - - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); - - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); - - case PERF_COUNTER_IOC_SET_OUTPUT: - return perf_counter_set_output(counter, arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); - else - perf_counter_for_each_child(counter, func); - - return 0; -} - -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 -#endif - -static int perf_counter_index(struct perf_counter *counter) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; - - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_counter_mmap_page *userpg; - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - userpg = data->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); - - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_counter *counter = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; - - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; - - vmf->page = virt_to_page(data->data_pages[nr]); - } - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) -{ - struct perf_mmap_data *data; - unsigned long size; - int i; - - WARN_ON(atomic_read(&counter->mmap_count)); - - size = sizeof(struct perf_mmap_data); - size += nr_pages * sizeof(void *); - - data = kzalloc(size, GFP_KERNEL); - if (!data) - goto fail; - - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->data_pages[i]) - goto fail_data_pages; - } - - data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - if (counter->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - counter->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - - rcu_assign_pointer(counter->data, data); - - return 0; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); - - free_page((unsigned long)data->user_page); - -fail_user_page: - kfree(data); - -fail: - return -ENOMEM; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void __perf_mmap_data_free(struct rcu_head *rcu_head) -{ - struct perf_mmap_data *data; - int i; - - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - - kfree(data); -} - -static void perf_mmap_data_free(struct perf_counter *counter) -{ - struct perf_mmap_data *data = counter->data; - - WARN_ON(atomic_read(&counter->mmap_count)); - - rcu_assign_pointer(counter->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - atomic_inc(&counter->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - struct user_struct *user = current_user(); - - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); - } -} - -static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_counter *counter = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have data pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (counter->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); - if (ret) - goto unlock; - - atomic_set(&counter->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; - if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; - -unlock: - mutex_unlock(&counter->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf counter wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_counter_wakeup(struct perf_counter *counter) -{ - wake_up_all(&counter->waitq); - - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; - } -} - -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_counter(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); - } - - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); - } -} - -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_counter_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_counter *counter) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return counter->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_counter *counter) -{ - wait_event(counter->waitq, perf_not_pending(counter)); -} - -void perf_counter_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - -/* - * Output - */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!data->writable) - return true; - - mask = (data->nr_pages << PAGE_SHIFT) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->data->poll, POLL_IN); - - if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); - } else - perf_counter_wakeup(handle->counter); -} - -/* - * Curious locking construct. - * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_lock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - int cpu; - - handle->locked = 0; - - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; - - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - handle->locked = 1; -} - -static void perf_output_unlock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; - -again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; - - /* - * NMI can happen here, which means we can miss a done_head update. - */ - - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); - - /* - * Therefore we have to validate we did not indeed do so. - */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - goto again; - } - - if (atomic_xchg(&data->wakeup, 0)) - perf_output_wakeup(handle); -out: - local_irq_restore(handle->flags); -} - -void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) -{ - struct perf_counter *output_counter; - struct perf_mmap_data *data; - unsigned long tail, offset, head; - int have_lost; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited counters we send all the output towards the parent. - */ - if (counter->parent) - counter = counter->parent; - - output_counter = rcu_dereference(counter->output); - if (output_counter) - counter = output_counter; - - data = rcu_dereference(counter->data); - if (!data) - goto out; - - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->sample = sample; - - if (!data->nr_pages) - goto fail; - - have_lost = atomic_read(&data->lost); - if (have_lost) - size += sizeof(lost_event); - - perf_output_lock(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - */ - tail = ACCESS_ONCE(data->user_page->data_tail); - smp_rmb(); - offset = head = atomic_long_read(&data->head); - head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) - goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); - - handle->offset = offset; - handle->head = head; - - if (head - tail > data->watermark) - atomic_set(&data->wakeup, 1); - - if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; - lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; - lost_event.lost = atomic_xchg(&data->lost, 0); - - perf_output_put(handle, lost_event); - } - - return 0; - -fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_counter *counter = handle->counter; - struct perf_mmap_data *data = handle->data; - - int wakeup_events = counter->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); - if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); - } - } - - perf_output_unlock(handle); - rcu_read_unlock(); -} - -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_tgid_nr_ns(p, counter->ns); -} - -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_pid_nr_ns(p, counter->ns); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - u64 read_format = counter->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = atomic64_read(&counter->count); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; - - if (leader != counter) - leader->pmu->read(leader); - - values[n++] = atomic64_read(&leader->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - if (sub != counter) - sub->pmu->read(sub); - - values[n++] = atomic64_read(&sub->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); - else - perf_output_read_one(handle, counter); -} - -void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter) -{ - u64 sample_type = data->type; - - perf_output_put(handle, *header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(handle, data->ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); - - perf_output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(handle, raw); - } - } -} - -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter, - struct pt_regs *regs) -{ - u64 sample_type = counter->attr.sample_type; - - data->type = sample_type; - - header->type = PERF_EVENT_SAMPLE; - header->size = sizeof(*header); - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - - if (sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); - - header->size += sizeof(data->ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_counter_pid(counter, current); - data->tid_entry.tid = perf_counter_tid(counter, current); - - header->size += sizeof(data->tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - data->time = perf_clock(); - - header->size += sizeof(data->time); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header->size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - data->id = primary_counter_id(counter); - - header->size += sizeof(data->id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = counter->id; - - header->size += sizeof(data->stream_id); - } - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - - header->size += sizeof(data->cpu_entry); - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header->size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - header->size += perf_counter_read_size(counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - data->callchain = perf_callchain(regs); - - if (data->callchain) - size += data->callchain->nr; - - header->size += size * sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; - } -} - -static void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_output_handle handle; - struct perf_event_header header; - - perf_prepare_sample(&header, data, counter, regs); - - if (perf_output_begin(&handle, counter, header.size, nmi, 1)) - return; - - perf_output_sample(&handle, &header, data, counter); - - perf_output_end(&handle); -} - -/* - * read event - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_counter_read_event(struct perf_counter *counter, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_read_event read_event = { - .header = { - .type = PERF_EVENT_READ, - .misc = 0, - .size = sizeof(read_event) + perf_counter_read_size(counter), - }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), - }; - int ret; - - ret = perf_output_begin(&handle, counter, read_event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, read_event); - perf_output_read(&handle, counter); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_counter_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - u64 time; - } event; -}; - -static void perf_counter_task_output(struct perf_counter *counter, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - int size; - struct task_struct *task = task_event->task; - int ret; - - size = task_event->event.header.size; - ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); - - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); - - task_event->event.time = perf_clock(); - - perf_output_put(&handle, task_event->event); - - perf_output_end(&handle); -} - -static int perf_counter_task_match(struct perf_counter *counter) -{ - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) - return 1; - - return 0; -} - -static void perf_counter_task_ctx(struct perf_counter_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); - } - rcu_read_unlock(); -} - -static void perf_counter_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); - if (ctx) - perf_counter_task_ctx(ctx, task_event); - rcu_read_unlock(); -} - -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event = { - .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, - .misc = 0, - .size = sizeof(task_event.event), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - }, - }; - - perf_counter_task_event(&task_event); -} - -void perf_counter_fork(struct task_struct *task) -{ - perf_counter_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event; -}; - -static void perf_counter_comm_output(struct perf_counter *counter, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); - - perf_output_put(&handle, comm_event->event); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - perf_output_end(&handle); -} - -static int perf_counter_comm_match(struct perf_counter *counter) -{ - if (counter->attr.comm) - return 1; - - return 0; -} - -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); - } - rcu_read_unlock(); -} - -static void perf_counter_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - unsigned int size; - char comm[TASK_COMM_LEN]; - - memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event.header.size = sizeof(comm_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_comm_ctx(ctx, comm_event); - rcu_read_unlock(); -} - -void perf_counter_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); - - if (!atomic_read(&nr_comm_counters)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event = { - .header = { - .type = PERF_EVENT_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_counter_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event; -}; - -static void perf_counter_mmap_output(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); - - perf_output_put(&handle, mmap_event->event); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - perf_output_end(&handle); -} - -static int perf_counter_mmap_match(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - if (counter->attr.mmap) - return 1; - - return 0; -} - -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, - struct perf_mmap_event *mmap_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); - } - rcu_read_unlock(); -} - -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event.header.size = sizeof(mmap_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); - rcu_read_unlock(); - - kfree(buf); -} - -void __perf_counter_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event = { - .header = { - .type = PERF_EVENT_MMAP, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_counter *counter, int enable) -{ - struct perf_output_handle handle; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_EVENT_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = perf_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, - }; - - if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; - - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_output_end(&handle); -} - -/* - * Generic counter overflow handling, sampling. - */ - -static int __perf_counter_overflow(struct perf_counter *counter, int nmi, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) -{ - int events = atomic_read(&counter->event_limit); - struct hw_perf_counter *hwc = &counter->hw; - int ret = 0; - - throttle = (throttle && counter->pmu->unthrottle != NULL); - - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling counters even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: - */ - ret = 1; - } - } - - if (counter->attr.freq) { - u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; - - hwc->freq_stamp = now; - - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * counters - */ - - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { - ret = 1; - counter->pending_kill = POLL_HUP; - if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); - } else - perf_counter_disable(counter); - } - - perf_counter_output(counter, nmi, data, regs); - return ret; -} - -int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - return __perf_counter_overflow(counter, nmi, 1, data, regs); -} - -/* - * Generic software counter infrastructure - */ - -/* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swcounter_set_period(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = atomic64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_counter *hwc = &counter->hw; - int throttle = 0; - u64 overflow; - - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (__perf_counter_overflow(counter, nmi, throttle, - data, regs)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - throttle = 1; - } -} - -static void perf_swcounter_unthrottle(struct perf_counter *counter) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_counter *hwc = &counter->hw; - - atomic64_add(nr, &counter->count); - - if (!hwc->sample_period) - return; - - if (!regs) - return; - - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data, regs); -} - -static int perf_swcounter_is_counting(struct perf_counter *counter) -{ - /* - * The counter is active, we're good! - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - return 1; - - /* - * The counter is off/error, not counting. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) - return 0; - - /* - * The counter is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (counter->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - -static int perf_swcounter_match(struct perf_counter *counter, - enum perf_type_id type, - u32 event_id, struct pt_regs *regs) -{ - if (!perf_swcounter_is_counting(counter)) - return 0; - - if (counter->attr.type != type) - return 0; - if (counter->attr.config != event_id) - return 0; - - if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) - return 0; - - if (counter->attr.exclude_kernel && !user_mode(regs)) - return 0; - } - - return 1; -} - -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event_id, regs)) - perf_swcounter_add(counter, nr, nmi, data, regs); - } - rcu_read_unlock(); -} - -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) -{ - if (in_nmi()) - return &cpuctx->recursion[3]; - - if (in_irq()) - return &cpuctx->recursion[2]; - - if (in_softirq()) - return &cpuctx->recursion[1]; - - return &cpuctx->recursion[0]; -} - -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; - - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data, regs); - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs); - rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); -} - -void __perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data = { - .addr = addr, - }; - - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, - &data, regs); -} - -static void perf_swcounter_read(struct perf_counter *counter) -{ -} - -static int perf_swcounter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - - if (hwc->sample_period) { - hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); - } - return 0; -} - -static void perf_swcounter_disable(struct perf_counter *counter) -{ -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, -}; - -/* - * hrtimer based swcounter callback - */ - -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((counter->attr.exclude_kernel || !regs) && - !counter->attr.exclude_user) - regs = task_pt_regs(current); - - if (regs) { - if (perf_counter_overflow(counter, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -/* - * Software counter: cpu wall time clock - */ - -static void cpu_clock_perf_counter_update(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); -} - -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int cpu = raw_smp_processor_id(); - - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); -} - -static void cpu_clock_perf_counter_read(struct perf_counter *counter) -{ - cpu_clock_perf_counter_update(counter); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, -}; - -/* - * Software counter: task time clock - */ - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) -{ - u64 prev; - s64 delta; - - prev = atomic64_xchg(&counter->hw.prev_count, now); - delta = now - prev; - atomic64_add(delta, &counter->count); -} - -static int task_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 now; - - now = counter->ctx->time; - - atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void task_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); - -} - -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; - } - - task_clock_perf_counter_update(counter, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, -}; - -#ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) -{ - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; - - struct pt_regs *regs = get_irq_regs(); - - if (!regs) - regs = task_pt_regs(current); - - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); -} -EXPORT_SYMBOL_GPL(perf_tpcounter_event); - -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); - -static void tp_perf_counter_destroy(struct perf_counter *counter) -{ - ftrace_profile_disable(counter->attr.config); -} - -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - if (ftrace_profile_enable(counter->attr.config)) - return NULL; - - counter->destroy = tp_perf_counter_destroy; - - return &perf_ops_generic; -} -#else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} -#endif - -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_counter_destroy(struct perf_counter *counter) -{ - u64 event_id = counter->attr.config; - - WARN_ON(counter->parent); - - atomic_dec(&perf_swcounter_enabled[event_id]); -} - -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) -{ - const struct pmu *pmu = NULL; - u64 event_id = counter->attr.config; - - /* - * Software counters (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. - */ - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. - */ - if (counter->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event_id]); - counter->destroy = sw_perf_counter_destroy; - } - pmu = &perf_ops_generic; - break; - } - - return pmu; -} - -/* - * Allocate and initialize a counter structure - */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, - int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, - gfp_t gfpflags) -{ - const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - long err; - - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) - return ERR_PTR(-ENOMEM); - - /* - * Single counters are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = counter; - - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); - - INIT_LIST_HEAD(&counter->group_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); - - mutex_init(&counter->mmap_mutex); - - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; - - counter->parent = parent_counter; - - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); - - counter->state = PERF_COUNTER_STATE_INACTIVE; - - if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; - - pmu = NULL; - - hwc = &counter->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - atomic64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); - break; - - default: - break; - } -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); - return ERR_PTR(err); - } - - counter->pmu = pmu; - - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); - } - - return counter; -} - -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd) -{ - struct perf_counter *output_counter = NULL; - struct file *output_file = NULL; - struct perf_counter *old_output; - int fput_needed = 0; - int ret = -EINVAL; - - if (!output_fd) - goto set; - - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) - goto out; - - output_counter = output_file->private_data; - - /* Don't chain output fds */ - if (output_counter->output) - goto out; - - /* Don't set an output fd when we already have an output channel */ - if (counter->data) - goto out; - - atomic_long_inc(&output_file->f_count); - -set: - mutex_lock(&counter->mmap_mutex); - old_output = counter->output; - rcu_assign_pointer(counter->output, output_counter); - mutex_unlock(&counter->mmap_mutex); - - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this counter. - */ - synchronize_rcu(); - fput(old_output->filp); - } - - ret = 0; -out: - fput_light(output_file, fput_needed); - return ret; -} - -/** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu - * - * @attr_uptr: event type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader counter fd - */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; - struct file *group_file = NULL; - int fput_needed = 0; - int fput_needed2 = 0; - int err; - - /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) - return -EINVAL; - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - /* - * Look up the group leader (we will attach this counter to it): - */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { - err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - - group_leader = group_file->private_data; - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_put_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (group_leader->ctx != ctx) - goto err_put_context; - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); - err = PTR_ERR(counter); - if (IS_ERR(counter)) - goto err_put_context; - - err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); - if (err < 0) - goto err_free_put_context; - - counter_file = fget_light(err, &fput_needed2); - if (!counter_file) - goto err_free_put_context; - - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_counter_set_output(counter, group_fd); - if (err) - goto err_fput_free_put_context; - } - - counter->filp = counter_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); - ++ctx->generation; - mutex_unlock(&ctx->mutex); - - counter->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); - -err_fput_free_put_context: - fput_light(counter_file, fput_needed2); - -err_free_put_context: - if (err < 0) - kfree(counter); - -err_put_context: - if (err < 0) - put_ctx(ctx); - - fput_light(group_file, fput_needed); - - return err; -} - -/* - * inherit a counter from parent task to child task: - */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *child_counter; - - /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; - - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, - GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent counter, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; - - /* - * Link it up in the child's context: - */ - add_counter_to_ctx(child_counter, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_counter->filp->f_count); - - /* - * Link this into the parent counter's child list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - return child_counter; -} - -static int inherit_group(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; - - leader = inherit_counter(parent_counter, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, group_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static void sync_child_counter(struct perf_counter *child_counter, - struct task_struct *child) -{ - struct perf_counter *parent_counter = child_counter->parent; - u64 child_val; - - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); - - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); - - /* - * Remove this counter from the parent's list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - /* - * Release the parent counter, if this was the last - * reference to it. - */ - fput(parent_counter->filp); -} - -static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, - struct task_struct *child) -{ - struct perf_counter *parent_counter; - - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); - - parent_counter = child_counter->parent; - /* - * It can happen that parent exits first, and has counters - * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. - */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); - } -} - -/* - * When a child task exits, feed back counter values to parent counters. - */ -void perf_counter_exit_task(struct task_struct *child) -{ - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. - */ - perf_counter_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); - -again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->group_list, - group_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); - - /* - * If the last counter was a group counter, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->group_list)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. - */ -void perf_counter_free_task(struct task_struct *task) -{ - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; - - if (!ctx) - return; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) { - struct perf_counter *parent = counter->parent; - - if (WARN_ON_ONCE(!parent)) - continue; - - mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - list_del_counter(counter, ctx); - free_counter(counter); - } - - if (!list_empty(&ctx->group_list)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - -/* - * Initialize the perf_counter context in task_struct - */ -int perf_counter_init_task(struct task_struct *child) -{ - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; - struct task_struct *parent = current; - int inherited_all = 1; - int ret = 0; - - child->perf_counter_ctxp = NULL; - - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); - - if (likely(!parent->perf_counter_ctxp)) - return 0; - - /* - * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; - get_task_struct(child); - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) - continue; - - if (!counter->attr.inherit) { - inherited_all = 0; - continue; - } - - ret = inherit_group(counter, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; - break; - } - } - - if (inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of counters and the generation - * count can't have changed since we took the mutex. - */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - - return ret; -} - -static void __cpuinit perf_counter_init_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); - - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - hw_perf_counter_setup(cpu); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; - - list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) - __perf_counter_remove_from_context(counter); -} -static void perf_counter_exit_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); -} -#else -static inline void perf_counter_exit_cpu(int cpu) { } -#endif - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - -void __init perf_counter_init(void) -{ - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_counters) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_counters", -}; - -static int __init perf_counter_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); -} -device_initcall(perf_counter_sysfs_init); diff --git a/kernel/perf_event.c b/kernel/perf_event.c new file mode 100644 index 00000000000..6e8b99a04e1 --- /dev/null +++ b/kernel/perf_event.c @@ -0,0 +1,5000 @@ +/* + * Performance event core code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Each CPU has a list of per CPU events: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_events __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf event sample rate + */ +int sysctl_perf_event_sample_rate __read_mostly = 100000; + +static atomic64_t perf_event_id; + +/* + * Lock for (sysadmin-configurable) event reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + return NULL; +} + +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + return 0; +} + +void __weak perf_event_print_debug(void) { } + +static DEFINE_PER_CPU(int, perf_disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(perf_disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(perf_disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_event_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *group_leader = event->group_leader; + + /* + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group + * leader's sibling list: + */ + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); + else { + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; + + if (list_empty(&event->group_entry)) + return; + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); + + if (event->group_leader != event) + event->group_leader->nr_siblings--; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + + list_move_tail(&sibling->group_entry, &ctx->group_list); + sibling->group_leader = sibling; + } +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_event_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + /* + * Protect the list operation against NMI by disabling the + * events on a global level. + */ + perf_disable(); + + event_sched_out(event, cpuctx, ctx); + + list_del_event(event, ctx); + + if (!ctx->task) { + /* + * Allow more per task events with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); + } + + perf_enable(); + spin_unlock(&ctx->lock); +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * Must be called with ctx->mutex held. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_event_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the event safely, if the call above did not + * succeed. + */ + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); + } + spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + event->total_time_enabled = ctx->time - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +/* + * Cross CPU call to disable a performance event + */ +static void __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock(&ctx->lock); +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +static void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_event_disable, event); + + spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock_irq(&ctx->lock); +} + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + struct perf_event *event, *partial_group; + int ret; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + if (event_sched_in(group_event, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + break; + event_sched_out(event, cpuctx, ctx); + } + event_sched_out(group_event, cpuctx, ctx); + + return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. + */ +static int is_software_only_group(struct perf_event *leader) +{ + struct perf_event *event; + + if (!is_software_event(leader)) + return 0; + + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) + return 0; + + return 1; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (is_software_only_group(event)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int cpu = smp_processor_id(); + int err; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no events. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + /* + * Protect the list operation against NMI by disabling the + * events on a global level. NOP for non NMI based events. + */ + perf_disable(); + + add_event_to_ctx(event, ctx); + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx, cpu); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + if (!err && !ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + unlock: + perf_enable(); + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_install_in_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * we need to retry the smp call. + */ + if (ctx->is_active && list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the event safely, if it the call above did not + * succeed. + */ + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); + spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + +/* + * Cross CPU call to enable a performance event + */ +static void __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int err; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + __perf_event_mark_enabled(event, ctx); + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + perf_disable(); + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, + smp_processor_id()); + else + err = event_sched_in(event, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + unlock: + spin_unlock(&ctx->lock); +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +static void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_event_enable, event); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); + + out: + spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + + perf_disable(); + if (ctx->nr_active) { + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event != event->group_leader) + event_sched_out(event, cpuctx, ctx); + else + group_sched_out(event, cpuctx, ctx); + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_read(void *event); + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); + break; + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct pt_regs *regs; + int do_switch = 1; + + regs = task_pt_regs(task); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } +} + +/* + * Called with IRQs disabled + */ +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ + __perf_event_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_event_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; + + spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) + continue; + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) + event_sched_in(event, cpuctx, ctx, cpu); + else { + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } + + list_for_each_entry(event, &ctx->group_list, group_entry) { + /* + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. + */ + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) + continue; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) { + if (event_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } else { + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void perf_event_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + __perf_event_sched_in(ctx, cpuctx, cpu); + cpuctx->task_ctx = ctx; +} + +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + __perf_event_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static void perf_adjust_period(struct perf_event *event, u64 events) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, event->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, freq; + + spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + perf_adjust_period(event, freq * interrupts); + + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } + } + spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (!ctx->nr_events) + return; + + spin_lock(&ctx->lock); + /* + * Rotate the first entry last (works just fine for group events too): + */ + perf_disable(); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); + break; + } + perf_enable(); + + spin_unlock(&ctx->lock); +} + +void perf_event_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + + if (!atomic_read(&nr_events)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = curr->perf_event_ctxp; + + perf_ctx_adjust_freq(&cpuctx->ctx); + if (ctx) + perf_ctx_adjust_freq(ctx); + + perf_event_cpu_sched_out(cpuctx); + if (ctx) + __perf_event_task_sched_out(ctx); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_cpu_sched_in(cpuctx, cpu); + if (ctx) + perf_event_task_sched_in(curr, cpu); +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) + goto out; + + __perf_event_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) + continue; + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + continue; + __perf_event_mark_enabled(event, ctx); + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + spin_unlock(&ctx->lock); + + perf_event_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + event->pmu->read(event); + update_event_times(event); + local_irq_restore(flags); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); + } + + return atomic64_read(&event->count); +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void +__perf_event_init_context(struct perf_event_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + +static struct perf_event_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + struct task_struct *task; + unsigned long flags; + int err; + + /* + * If cpu is not a wildcard then this is a percpu event: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach events to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + unclone_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + err = -ENOMEM; + if (!ctx) + goto errout; + __perf_event_init_context(ctx, task); + get_ctx(ctx); + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + goto retry; + } + get_task_struct(task); + } + + put_task_struct(task); + return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); +} + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); +} + +static void perf_pending_sync(struct perf_event *event); + +static void free_event(struct perf_event *event) +{ + perf_pending_sync(event); + + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + } + + if (event->output) { + fput(event->output->filp); + event->output = NULL; + } + + if (event->destroy) + event->destroy(event); + + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; + + file->private_data = NULL; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_event_remove_from_context(event); + mutex_unlock(&ctx->mutex); + + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); + + free_event(event); + + return 0; +} + +static int perf_event_read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_event_read_value(struct perf_event *event) +{ + struct perf_event *child; + u64 total = 0; + + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); + + return total; +} + +static int perf_event_read_entry(struct perf_event *event, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_event_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < perf_event_read_size(event)) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!event->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + +int perf_event_set_output(struct perf_event *event, int output_fd); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + userpg = data->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, PAGE_SIZE * nr_pages, + event->attr.wakeup_watermark); + } + if (!data->watermark) + data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); + + rcu_assign_pointer(event->data, data); + + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + int i; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + + perf_mmap_free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + perf_mmap_free_page((unsigned long)data->data_pages[i]); + + kfree(data); +} + +static void perf_mmap_data_free(struct perf_event *event) +{ + struct perf_mmap_data *data = event->data; + + WARN_ON(atomic_read(&event->mmap_count)); + + rcu_assign_pointer(event->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_free(event); + mutex_unlock(&event->mmap_mutex); + } +} + +static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { + ret = -EINVAL; + goto unlock; + } + + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->data); + ret = perf_mmap_data_alloc(event, nr_pages); + if (ret) + goto unlock; + + atomic_set(&event->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + event->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + event->data->writable = 1; + +unlock: + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_event(struct perf_pending_entry *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_event_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_event *event) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return event->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_event *event) +{ + wait_event(event->waitq, perf_not_pending(event)); +} + +void perf_event_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); + } else + perf_event_wakeup(handle->event); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event_id completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + unsigned long head; + int cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_long_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_long_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_event *output_event; + struct perf_mmap_data *data; + unsigned long tail, offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; + + data = rcu_dereference(event->data); + if (!data) + goto out; + + handle->data = data; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!data->nr_pages) + goto fail; + + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + + perf_output_lock(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + offset = head = atomic_long_read(&data->head); + head += size; + if (unlikely(!perf_output_space(data, tail, offset, head))) + goto fail; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + + if (head - tail > data->watermark) + atomic_set(&data->wakeup, 1); + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = event->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + + return 0; + +fail: + atomic_inc(&data->lost); + perf_output_unlock(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&event->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); + else + perf_output_read_one(handle, event); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + if (sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); + } + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + + header->size += sizeof(data->tid_entry); + } + + if (sample_type & PERF_SAMPLE_TIME) { + data->time = perf_clock(); + + header->size += sizeof(data->time); + } + + if (sample_type & PERF_SAMPLE_ADDR) + header->size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_event_id(event); + + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = event->id; + + header->size += sizeof(data->stream_id); + } + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + + header->size += sizeof(data->cpu_entry); + } + + if (sample_type & PERF_SAMPLE_PERIOD) + header->size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + header->size += perf_event_read_size(event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + return; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + perf_event_read_size(event), + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + int size; + struct task_struct *task = task_event->task; + int ret; + + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + task_event->event_id.time = perf_clock(); + + perf_output_put(&handle, task_event->event_id); + + perf_output_end(&handle); +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->attr.comm || event->attr.mmap || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } + rcu_read_unlock(); +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx = task_event->task_ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_event_ctxp); + if (ctx) + perf_event_task_ctx(ctx, task_event); + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } + rcu_read_unlock(); +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + unsigned int size; + char comm[TASK_COMM_LEN]; + + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + if (event->attr.mmap) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + + kfree(buf); +} + +void __perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + throttle = (throttle && event->pmu->unthrottle != NULL); + + if (!throttle) { + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_event_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling events even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the event got enabled again: + */ + ret = 1; + } + } + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); + } else + perf_event_disable(event); + } + + perf_event_output(event, nmi, data, regs); + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + u64 overflow; + + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_unthrottle(struct perf_event *event) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ +} + +static void perf_swevent_add(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + atomic64_add(nr, &event->count); + + if (!hwc->sample_period) + return; + + if (!regs) + return; + + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swevent_overflow(event, nmi, data, regs); +} + +static int perf_swevent_is_counting(struct perf_event *event) +{ + /* + * The event is active, we're good! + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) + return 1; + + /* + * The event is off/error, not counting. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE) + return 0; + + /* + * The event is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. + */ + if (event->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, struct pt_regs *regs) +{ + if (!perf_swevent_is_counting(event)) + return 0; + + if (event->attr.type != type) + return 0; + if (event->attr.config != event_id) + return 0; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 0; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 0; + } + + return 1; +} + +static void perf_swevent_ctx_event(struct perf_event_context *ctx, + enum perf_type_id type, + u32 event_id, u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, regs)) + perf_swevent_add(event, nr, nmi, data, regs); + } + rcu_read_unlock(); +} + +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); + + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, + nr, nmi, data, regs); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); + rcu_read_unlock(); + + barrier(); + (*recursion)--; + +out: + put_cpu_var(perf_cpu_context); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data = { + .addr = addr, + }; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, + &data, regs); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + return 0; +} + +static void perf_swevent_disable(struct perf_event *event) +{ +} + +static const struct pmu perf_ops_generic = { + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, +}; + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + data.addr = 0; + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) + regs = task_pt_regs(current); + + if (regs) { + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_perf_event_update(struct perf_event *event) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); +} + +static int cpu_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void cpu_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); +} + +static void cpu_clock_perf_event_read(struct perf_event *event) +{ + cpu_clock_perf_event_update(event); +} + +static const struct pmu perf_ops_cpu_clock = { + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_perf_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = atomic64_xchg(&event->hw.prev_count, now); + delta = now - prev; + atomic64_add(delta, &event->count); +} + +static int task_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 now; + + now = event->ctx->time; + + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void task_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); + +} + +static void task_clock_perf_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_perf_event_update(event, time); +} + +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, +}; + +#ifdef CONFIG_EVENT_PROFILE +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) +{ + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + struct perf_sample_data data = { + .addr = addr, + .raw = &raw, + }; + + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + ftrace_profile_disable(event->attr.config); +} + +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_paranoid_tracepoint_raw() && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (ftrace_profile_enable(event->attr.config)) + return NULL; + + event->destroy = tp_perf_event_destroy; + + return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + return NULL; +} +#endif + +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + atomic_dec(&perf_swevent_enabled[event_id]); +} + +static const struct pmu *sw_perf_event_init(struct perf_event *event) +{ + const struct pmu *pmu = NULL; + u64 event_id = event->attr.config; + + /* + * Software events (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_TASK_CLOCK: + /* + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. + */ + if (event->ctx->task) + pmu = &perf_ops_task_clock; + else + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + pmu = &perf_ops_generic; + break; + } + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, + int cpu, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, + gfp_t gfpflags) +{ + const struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + event = kzalloc(sizeof(*event), gfpflags); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + atomic64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + switch (attr->type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + pmu = hw_perf_event_init(event); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_event_init(event); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_event_init(event); + break; + + default: + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +int perf_event_set_output(struct perf_event *event, int output_fd) +{ + struct perf_event *output_event = NULL; + struct file *output_file = NULL; + struct perf_event *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_event = output_file->private_data; + + /* Don't chain output fds */ + if (output_event->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (event->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this event. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + int fput_needed = 0; + int fput_needed2 = 0; + int err; + + /* for future expandability... */ + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this event to it): + */ + group_leader = NULL; + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + err = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto err_put_context; + if (group_file->f_op != &perf_fops) + goto err_put_context; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_put_context; + } + + event = perf_event_alloc(&attr, cpu, ctx, group_leader, + NULL, GFP_KERNEL); + err = PTR_ERR(event); + if (IS_ERR(event)) + goto err_put_context; + + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + if (err < 0) + goto err_free_put_context; + + event_file = fget_light(err, &fput_needed2); + if (!event_file) + goto err_free_put_context; + + if (flags & PERF_FLAG_FD_OUTPUT) { + err = perf_event_set_output(event, group_fd); + if (err) + goto err_fput_free_put_context; + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + +err_fput_free_put_context: + fput_light(event_file, fput_needed2); + +err_free_put_context: + if (err < 0) + kfree(event); + +err_put_context: + if (err < 0) + put_ctx(ctx); + + fput_light(group_file, fput_needed); + + return err; +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, + GFP_KERNEL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; + + /* + * Link it up in the child's context: + */ + add_event_to_ctx(child_event, child_ctx); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = atomic64_read(&child_event->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + struct perf_event *parent_event; + + update_event_times(child_event); + perf_event_remove_from_context(child_event); + + parent_event = child_event->parent; + /* + * It can happen that parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped - but otherwise linger. + */ + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); + child->perf_event_ctxp = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->group_list)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); + } + + if (!list_empty(&ctx->group_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + int ret = 0; + + child->perf_event_ctxp = NULL; + + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + if (likely(!parent->perf_event_ctxp)) + return 0; + + /* + * This is executed from the parent task context, so inherit + * events that have been marked for cloning. + * First allocate and initialize a context for the child. + */ + + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { + if (event != event->group_leader) + continue; + + if (!event->attr.inherit) { + inherited_all = 0; + continue; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + if (ret) { + inherited_all = 0; + break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of events and the generation + * count can't have changed since we took the mutex. + */ + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + + return ret; +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + + spin_lock(&perf_resource_lock); + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; + spin_unlock(&perf_resource_lock); + + hw_perf_event_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_event_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); +} +static void perf_event_exit_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_event_init_cpu(cpu); + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_event_setup_online(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, + .priority = 20, +}; + +void __init perf_event_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_events) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + spin_unlock(&perf_resource_lock); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_overcommit = val; + spin_unlock(&perf_resource_lock); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_events", +}; + +static int __init perf_event_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/sched.c b/kernel/sched.c index faf4d463bbf..291c8d213d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include @@ -2059,7 +2059,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2724,7 +2724,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -5199,7 +5199,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5415,7 +5415,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -7692,7 +7692,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -9549,7 +9549,7 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097c76f..ea5c3bcac88 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -1511,11 +1511,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 68320f6b07b..515bc230ac2 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -177,4 +177,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a631ba684a..6ba49c7cb12 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include @@ -964,28 +964,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, diff --git a/kernel/timer.c b/kernel/timer.c index bbb51074680..811e5c39145 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include @@ -1187,7 +1187,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8712ce3c6a0..233f3483ac8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include "trace_output.h" @@ -414,7 +414,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); } while(0); } @@ -476,7 +476,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec.nr = syscall_nr; rec.ret = syscall_get_return_value(current, regs); - perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); + perf_tp_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); } int reg_prof_syscall_exit(char *name) diff --git a/mm/mmap.c b/mm/mmap.c index 26892e346d8..376492ed08f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -1220,7 +1220,7 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - perf_counter_mmap(vma); + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); @@ -2308,7 +2308,7 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; - perf_counter_mmap(vma); + perf_event_mmap(vma); return 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index d80311baeb2..8bc969d8112 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_counter_mmap(vma); + perf_event_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 0aba8b6e9c5..b5f1953b614 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -318,7 +318,7 @@ export PERL_PATH LIB_FILE=libperf.a -LIB_H += ../../include/linux/perf_counter.h +LIB_H += ../../include/linux/perf_event.h LIB_H += ../../include/linux/rbtree.h LIB_H += ../../include/linux/list.h LIB_H += util/include/linux/list.h diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 043d85b7e25..1ec74161581 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -505,7 +505,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - if (event->header.misc & PERF_EVENT_MISC_KERNEL) { + if (event->header.misc & PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -513,7 +513,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (event->header.misc & PERF_EVENT_MISC_USER) { + } else if (event->header.misc & PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -565,7 +565,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->mmap.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n", + dump_printf("%p [%p]: PERF_RECORD_MMAP %d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, @@ -575,7 +575,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) event->mmap.filename); if (thread == NULL || map == NULL) { - dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n"); return 0; } @@ -591,14 +591,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) struct thread *thread; thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -614,7 +614,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->fork.pid, &threads, &last_match); parent = threads__findnew(event->fork.ppid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_FORK: %d:%d\n", + dump_printf("%p [%p]: PERF_RECORD_FORK: %d:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->fork.pid, event->fork.ppid); @@ -627,7 +627,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) return 0; if (!thread || !parent || thread__fork(thread, parent)) { - dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); return -1; } total_fork++; @@ -639,23 +639,23 @@ static int process_event(event_t *event, unsigned long offset, unsigned long head) { switch (event->header.type) { - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return process_mmap_event(event, offset, head); - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_FORK: + case PERF_RECORD_FORK: return process_fork_event(event, offset, head); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 2459e5a22ed..a5a050af8e7 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -77,7 +77,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static unsigned long mmap_read_head(struct mmap_data *md) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; long head; head = pc->data_head; @@ -88,7 +88,7 @@ static unsigned long mmap_read_head(struct mmap_data *md) static void mmap_write_tail(struct mmap_data *md, unsigned long tail) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; /* * ensure all reads are done before we write the tail out. @@ -233,7 +233,7 @@ static pid_t pid_synthesize_comm_event(pid_t pid, int full) } } - comm_ev.header.type = PERF_EVENT_COMM; + comm_ev.header.type = PERF_RECORD_COMM; size = ALIGN(size, sizeof(u64)); comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); @@ -288,7 +288,7 @@ static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid) while (1) { char bf[BUFSIZ], *pbf = bf; struct mmap_event mmap_ev = { - .header = { .type = PERF_EVENT_MMAP }, + .header = { .type = PERF_RECORD_MMAP }, }; int n; size_t size; @@ -355,7 +355,7 @@ static void synthesize_all(void) static int group_fd; -static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr) +static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr) { struct perf_header_attr *h_attr; @@ -371,7 +371,7 @@ static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int static void create_counter(int counter, int cpu, pid_t pid) { - struct perf_counter_attr *attr = attrs + counter; + struct perf_event_attr *attr = attrs + counter; struct perf_header_attr *h_attr; int track = !counter; /* only the first counter needs these */ struct { @@ -417,7 +417,7 @@ static void create_counter(int counter, int cpu, pid_t pid) attr->disabled = 1; try_again: - fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); + fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0); if (fd[nr_cpu][counter] < 0) { int err = errno; @@ -444,7 +444,7 @@ try_again: printf("\n"); error("perfcounter syscall returned with %d (%s)\n", fd[nr_cpu][counter], strerror(err)); - die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } @@ -478,7 +478,7 @@ try_again: if (multiplex && fd[nr_cpu][counter] != multiplex_fd) { int ret; - ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd); + ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd); assert(ret != -1); } else { event_array[nr_poll].fd = fd[nr_cpu][counter]; @@ -496,7 +496,7 @@ try_again: } } - ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE); + ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE); } static void open_counters(int cpu, pid_t pid) @@ -642,7 +642,7 @@ static int __cmd_record(int argc, const char **argv) if (done) { for (i = 0; i < nr_cpu; i++) { for (counter = 0; counter < nr_counters; counter++) - ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE); + ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE); } } } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index cdf9a8d27bb..19669c20088 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1121,7 +1121,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -1158,9 +1158,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) if (comm_list && !strlist__has_entry(comm_list, thread->comm)) return 0; - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -1168,7 +1168,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -1210,7 +1210,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->mmap.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n", + dump_printf("%p [%p]: PERF_RECORD_MMAP %d/%d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, @@ -1221,7 +1221,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) event->mmap.filename); if (thread == NULL || map == NULL) { - dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n"); return 0; } @@ -1238,14 +1238,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm_adjust(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -1262,10 +1262,10 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->fork.pid, &threads, &last_match); parent = threads__findnew(event->fork.ppid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_%s: (%d:%d):(%d:%d)\n", + dump_printf("%p [%p]: PERF_RECORD_%s: (%d:%d):(%d:%d)\n", (void *)(offset + head), (void *)(long)(event->header.size), - event->header.type == PERF_EVENT_FORK ? "FORK" : "EXIT", + event->header.type == PERF_RECORD_FORK ? "FORK" : "EXIT", event->fork.pid, event->fork.tid, event->fork.ppid, event->fork.ptid); @@ -1276,11 +1276,11 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) if (thread == parent) return 0; - if (event->header.type == PERF_EVENT_EXIT) + if (event->header.type == PERF_RECORD_EXIT) return 0; if (!thread || !parent || thread__fork(thread, parent)) { - dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); return -1; } total_fork++; @@ -1291,7 +1291,7 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) static int process_lost_event(event_t *event, unsigned long offset, unsigned long head) { - dump_printf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n", + dump_printf("%p [%p]: PERF_RECORD_LOST: id:%Ld: lost:%Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->lost.id, @@ -1305,7 +1305,7 @@ process_lost_event(event_t *event, unsigned long offset, unsigned long head) static int process_read_event(event_t *event, unsigned long offset, unsigned long head) { - struct perf_counter_attr *attr; + struct perf_event_attr *attr; attr = perf_header__find_attr(event->read.id, header); @@ -1319,7 +1319,7 @@ process_read_event(event_t *event, unsigned long offset, unsigned long head) event->read.value); } - dump_printf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n", + dump_printf("%p [%p]: PERF_RECORD_READ: %d %d %s %Lu\n", (void *)(offset + head), (void *)(long)(event->header.size), event->read.pid, @@ -1337,31 +1337,31 @@ process_event(event_t *event, unsigned long offset, unsigned long head) trace_event(event); switch (event->header.type) { - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return process_mmap_event(event, offset, head); - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_FORK: - case PERF_EVENT_EXIT: + case PERF_RECORD_FORK: + case PERF_RECORD_EXIT: return process_task_event(event, offset, head); - case PERF_EVENT_LOST: + case PERF_RECORD_LOST: return process_lost_event(event, offset, head); - case PERF_EVENT_READ: + case PERF_RECORD_READ: return process_read_event(event, offset, head); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 275d79c6627..ea9c15c0cdf 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1573,7 +1573,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -1589,9 +1589,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -1599,7 +1599,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -1626,23 +1626,23 @@ process_event(event_t *event, unsigned long offset, unsigned long head) nr_events++; switch (event->header.type) { - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return 0; - case PERF_EVENT_LOST: + case PERF_RECORD_LOST: nr_lost_chunks++; nr_lost_events += event->lost.lost; return 0; - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_EXIT ... PERF_EVENT_READ: + case PERF_RECORD_EXIT ... PERF_RECORD_READ: return 0; - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MAX: + case PERF_RECORD_MAX: default: return -1; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 61b828236c1..16af2d82e85 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -48,7 +48,7 @@ #include #include -static struct perf_counter_attr default_attrs[] = { +static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, @@ -130,11 +130,11 @@ struct stats runtime_cycles_stats; attrs[counter].config == PERF_COUNT_##c) #define ERR_PERF_OPEN \ -"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n" +"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n" static void create_perf_stat_counter(int counter, int pid) { - struct perf_counter_attr *attr = attrs + counter; + struct perf_event_attr *attr = attrs + counter; if (scale) attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | @@ -144,7 +144,7 @@ static void create_perf_stat_counter(int counter, int pid) unsigned int cpu; for (cpu = 0; cpu < nr_cpus; cpu++) { - fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); + fd[cpu][counter] = sys_perf_event_open(attr, -1, cpu, -1, 0); if (fd[cpu][counter] < 0 && verbose) fprintf(stderr, ERR_PERF_OPEN, counter, fd[cpu][counter], strerror(errno)); @@ -154,7 +154,7 @@ static void create_perf_stat_counter(int counter, int pid) attr->disabled = 1; attr->enable_on_exec = 1; - fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0); + fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0); if (fd[0][counter] < 0 && verbose) fprintf(stderr, ERR_PERF_OPEN, counter, fd[0][counter], strerror(errno)); diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 60040639627..4405681b313 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -937,21 +937,21 @@ process_event(event_t *event) switch (event->header.type) { - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event); - case PERF_EVENT_FORK: + case PERF_RECORD_FORK: return process_fork_event(event); - case PERF_EVENT_EXIT: + case PERF_RECORD_EXIT: return process_exit_event(event); - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return queue_sample_event(event); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_MMAP: - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_MMAP: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 4002ccb3675..1ca88896eee 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -901,7 +901,7 @@ struct mmap_data { static unsigned int mmap_read_head(struct mmap_data *md) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; int head; head = pc->data_head; @@ -977,9 +977,9 @@ static void mmap_read_counter(struct mmap_data *md) old += size; - if (event->header.type == PERF_EVENT_SAMPLE) { + if (event->header.type == PERF_RECORD_SAMPLE) { int user = - (event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER; + (event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_USER; process_event(event->ip.ip, md->counter, user); } } @@ -1005,7 +1005,7 @@ int group_fd; static void start_counter(int i, int counter) { - struct perf_counter_attr *attr; + struct perf_event_attr *attr; int cpu; cpu = profile_cpu; @@ -1019,7 +1019,7 @@ static void start_counter(int i, int counter) attr->inherit = (cpu < 0) && inherit; try_again: - fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); + fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0); if (fd[i][counter] < 0) { int err = errno; @@ -1044,7 +1044,7 @@ try_again: printf("\n"); error("perfcounter syscall returned with %d (%s)\n", fd[i][counter], strerror(err)); - die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } assert(fd[i][counter] >= 0); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 914ab366e36..e9d256e2f47 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -35,14 +35,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -82,7 +82,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -98,9 +98,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -108,7 +108,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -146,19 +146,19 @@ process_event(event_t *event, unsigned long offset, unsigned long head) trace_event(event); switch (event->header.type) { - case PERF_EVENT_MMAP ... PERF_EVENT_LOST: + case PERF_RECORD_MMAP ... PERF_RECORD_LOST: return 0; - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_EXIT ... PERF_EVENT_READ: + case PERF_RECORD_EXIT ... PERF_RECORD_READ: return 0; - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MAX: + case PERF_RECORD_MAX: default: return -1; } diff --git a/tools/perf/design.txt b/tools/perf/design.txt index f71e0d245cb..f1946d107b1 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -18,10 +18,10 @@ underlying hardware counters. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. -The special file descriptor is opened via the perf_counter_open() +The special file descriptor is opened via the perf_event_open() system call: - int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, + int sys_perf_event_open(struct perf_event_hw_event *hw_event_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); @@ -32,9 +32,9 @@ can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. -When creating a new counter fd, 'perf_counter_hw_event' is: +When creating a new counter fd, 'perf_event_hw_event' is: -struct perf_counter_hw_event { +struct perf_event_hw_event { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -93,7 +93,7 @@ specified by 'event_id': /* * Generalized performance counter event types, used by the hw_event.event_id - * parameter of the sys_perf_counter_open() syscall: + * parameter of the sys_perf_event_open() syscall: */ enum hw_event_ids { /* @@ -159,7 +159,7 @@ in size. * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. */ -enum perf_counter_read_format { +enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1, PERF_FORMAT_TOTAL_TIME_RUNNING = 2, }; @@ -178,7 +178,7 @@ interrupt: * Bits that can be set in hw_event.record_type to request information * in the overflow packets. */ -enum perf_counter_record_format { +enum perf_event_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, @@ -228,7 +228,7 @@ these events are recorded in the ring-buffer (see below). The 'comm' bit allows tracking of process comm data on process creation. This too is recorded in the ring-buffer (see below). -The 'pid' parameter to the perf_counter_open() system call allows the +The 'pid' parameter to the perf_event_open() system call allows the counter to be specific to a task: pid == 0: if the pid parameter is zero, the counter is attached to the @@ -258,7 +258,7 @@ The 'flags' parameter is currently unused and must be zero. The 'group_fd' parameter allows counter "groups" to be set up. A counter group has one counter which is the group "leader". The leader -is created first, with group_fd = -1 in the perf_counter_open call +is created first, with group_fd = -1 in the perf_event_open call that creates it. The rest of the group members are created subsequently, with group_fd giving the fd of the group leader. (A single counter on its own is created with group_fd = -1 and is @@ -277,13 +277,13 @@ tracking are logged into a ring-buffer. This ring-buffer is created and accessed through mmap(). The mmap size should be 1+2^n pages, where the first page is a meta-data page -(struct perf_counter_mmap_page) that contains various bits of information such +(struct perf_event_mmap_page) that contains various bits of information such as where the ring-buffer head is. /* * Structure of the page that can be mapped via mmap */ -struct perf_counter_mmap_page { +struct perf_event_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ @@ -317,7 +317,7 @@ struct perf_counter_mmap_page { * Control data for the mmap() data buffer. * * User-space reading this value should issue an rmb(), on SMP capable - * platforms, after reading this value -- see perf_counter_wakeup(). + * platforms, after reading this value -- see perf_event_wakeup(). */ __u32 data_head; /* head in the data section */ }; @@ -327,9 +327,9 @@ NOTE: the hw-counter userspace bits are arch specific and are currently only The following 2^n pages are the ring-buffer which contains events of the form: -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) -#define PERF_EVENT_MISC_OVERFLOW (1 << 2) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (1 << 1) +#define PERF_RECORD_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -353,8 +353,8 @@ enum perf_event_type { * char filename[]; * }; */ - PERF_EVENT_MMAP = 1, - PERF_EVENT_MUNMAP = 2, + PERF_RECORD_MMAP = 1, + PERF_RECORD_MUNMAP = 2, /* * struct { @@ -364,10 +364,10 @@ enum perf_event_type { * char comm[]; * }; */ - PERF_EVENT_COMM = 3, + PERF_RECORD_COMM = 3, /* - * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * When header.misc & PERF_RECORD_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* * * struct { @@ -397,7 +397,7 @@ Notification of new events is possible through poll()/select()/epoll() and fcntl() managing signals. Normally a notification is generated for every page filled, however one can -additionally set perf_counter_hw_event.wakeup_events to generate one every +additionally set perf_event_hw_event.wakeup_events to generate one every so many counter overflow events. Future work will include a splice() interface to the ring-buffer. @@ -409,11 +409,11 @@ events but does continue to exist and maintain its count value. An individual counter or counter group can be enabled with - ioctl(fd, PERF_COUNTER_IOC_ENABLE); + ioctl(fd, PERF_EVENT_IOC_ENABLE); or disabled with - ioctl(fd, PERF_COUNTER_IOC_DISABLE); + ioctl(fd, PERF_EVENT_IOC_DISABLE); Enabling or disabling the leader of a group enables or disables the whole group; that is, while the group leader is disabled, none of the @@ -424,16 +424,16 @@ other counter. Additionally, non-inherited overflow counters can use - ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr); + ioctl(fd, PERF_EVENT_IOC_REFRESH, nr); to enable a counter for 'nr' events, after which it gets disabled again. A process can enable or disable all the counter groups that are attached to it, using prctl: - prctl(PR_TASK_PERF_COUNTERS_ENABLE); + prctl(PR_TASK_PERF_EVENTS_ENABLE); - prctl(PR_TASK_PERF_COUNTERS_DISABLE); + prctl(PR_TASK_PERF_EVENTS_DISABLE); This applies to all counters on the current process, whether created by this process or by another, and doesn't affect any counters that @@ -447,11 +447,11 @@ Arch requirements If your architecture does not have hardware performance metrics, you can still use the generic software counters based on hrtimers for sampling. -So to start with, in order to add HAVE_PERF_COUNTERS to your Kconfig, you +So to start with, in order to add HAVE_PERF_EVENTS to your Kconfig, you will need at least this: - - asm/perf_counter.h - a basic stub will suffice at first + - asm/perf_event.h - a basic stub will suffice at first - support for atomic64 types (and associated helper functions) - - set_perf_counter_pending() implemented + - set_perf_event_pending() implemented If your architecture does have hardware capabilities, you can override the -weak stub hw_perf_counter_init() to register hardware counters. +weak stub hw_perf_event_init() to register hardware counters. diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 2abeb20d0bf..8cc4623afd6 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -52,15 +52,15 @@ #include #include -#include "../../include/linux/perf_counter.h" +#include "../../include/linux/perf_event.h" #include "util/types.h" /* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all * counters in the current task. */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 #ifndef NSEC_PER_SEC # define NSEC_PER_SEC 1000000000ULL @@ -90,12 +90,12 @@ static inline unsigned long long rdclock(void) _min1 < _min2 ? _min1 : _min2; }) static inline int -sys_perf_counter_open(struct perf_counter_attr *attr, +sys_perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { attr->size = sizeof(*attr); - return syscall(__NR_perf_counter_open, attr, pid, cpu, + return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 018d414a09d..2c9c26d6ded 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -1,5 +1,5 @@ -#ifndef __PERF_EVENT_H -#define __PERF_EVENT_H +#ifndef __PERF_RECORD_H +#define __PERF_RECORD_H #include "../perf.h" #include "util.h" #include diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index bb4fca3efcc..e306857b2c2 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -9,7 +9,7 @@ /* * Create new perf.data header attribute: */ -struct perf_header_attr *perf_header_attr__new(struct perf_counter_attr *attr) +struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr) { struct perf_header_attr *self = malloc(sizeof(*self)); @@ -134,7 +134,7 @@ struct perf_file_section { }; struct perf_file_attr { - struct perf_counter_attr attr; + struct perf_event_attr attr; struct perf_file_section ids; }; @@ -320,7 +320,7 @@ u64 perf_header__sample_type(struct perf_header *header) return type; } -struct perf_counter_attr * +struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header) { int i; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 7b0e84a8717..a0761bc7863 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -1,12 +1,12 @@ #ifndef _PERF_HEADER_H #define _PERF_HEADER_H -#include "../../../include/linux/perf_counter.h" +#include "../../../include/linux/perf_event.h" #include #include "types.h" struct perf_header_attr { - struct perf_counter_attr attr; + struct perf_event_attr attr; int ids, size; u64 *id; off_t id_offset; @@ -34,11 +34,11 @@ char *perf_header__find_event(u64 id); struct perf_header_attr * -perf_header_attr__new(struct perf_counter_attr *attr); +perf_header_attr__new(struct perf_event_attr *attr); void perf_header_attr__add_id(struct perf_header_attr *self, u64 id); u64 perf_header__sample_type(struct perf_header *header); -struct perf_counter_attr * +struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 89172fd0038..13ab4b842d4 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -10,7 +10,7 @@ int nr_counters; -struct perf_counter_attr attrs[MAX_COUNTERS]; +struct perf_event_attr attrs[MAX_COUNTERS]; struct event_symbol { u8 type; @@ -48,13 +48,13 @@ static struct event_symbol event_symbols[] = { { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, }; -#define __PERF_COUNTER_FIELD(config, name) \ - ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) +#define __PERF_EVENT_FIELD(config, name) \ + ((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT) -#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) -#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) -#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) -#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) +#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) +#define PERF_EVENT_CONFIG(config) __PERF_EVENT_FIELD(config, CONFIG) +#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) +#define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) static const char *hw_event_names[] = { "cycles", @@ -352,7 +352,7 @@ static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int } static enum event_result -parse_generic_hw_event(const char **str, struct perf_counter_attr *attr) +parse_generic_hw_event(const char **str, struct perf_event_attr *attr) { const char *s = *str; int cache_type = -1, cache_op = -1, cache_result = -1; @@ -417,7 +417,7 @@ parse_single_tracepoint_event(char *sys_name, const char *evt_name, unsigned int evt_length, char *flags, - struct perf_counter_attr *attr, + struct perf_event_attr *attr, const char **strp) { char evt_path[MAXPATHLEN]; @@ -505,7 +505,7 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags) static enum event_result parse_tracepoint_event(const char **strp, - struct perf_counter_attr *attr) + struct perf_event_attr *attr) { const char *evt_name; char *flags; @@ -563,7 +563,7 @@ static int check_events(const char *str, unsigned int i) } static enum event_result -parse_symbolic_event(const char **strp, struct perf_counter_attr *attr) +parse_symbolic_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; unsigned int i; @@ -582,7 +582,7 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_raw_event(const char **strp, struct perf_counter_attr *attr) +parse_raw_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; u64 config; @@ -601,7 +601,7 @@ parse_raw_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_numeric_event(const char **strp, struct perf_counter_attr *attr) +parse_numeric_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; char *endp; @@ -623,7 +623,7 @@ parse_numeric_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_event_modifier(const char **strp, struct perf_counter_attr *attr) +parse_event_modifier(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; int eu = 1, ek = 1, eh = 1; @@ -656,7 +656,7 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr) * Symbolic names are (almost) exactly matched. */ static enum event_result -parse_event_symbols(const char **str, struct perf_counter_attr *attr) +parse_event_symbols(const char **str, struct perf_event_attr *attr) { enum event_result ret; @@ -711,7 +711,7 @@ static void store_event_type(const char *orgname) int parse_events(const struct option *opt __used, const char *str, int unset __used) { - struct perf_counter_attr attr; + struct perf_event_attr attr; enum event_result ret; if (strchr(str, ':')) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 60704c15961..30c60811284 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -16,7 +16,7 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config); extern int nr_counters; -extern struct perf_counter_attr attrs[MAX_COUNTERS]; +extern struct perf_event_attr attrs[MAX_COUNTERS]; extern const char *event_name(int ctr); extern const char *__event_name(int type, u64 config); diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index 1fd824c1f1c..af4b0573b37 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -480,12 +480,12 @@ out: } static struct tracepoint_path * -get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters) +get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events) { struct tracepoint_path path, *ppath = &path; int i; - for (i = 0; i < nb_counters; i++) { + for (i = 0; i < nb_events; i++) { if (pattrs[i].type != PERF_TYPE_TRACEPOINT) continue; ppath->next = tracepoint_id_to_path(pattrs[i].config); @@ -496,7 +496,7 @@ get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters) return path.next; } -void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters) +void read_tracing_data(struct perf_event_attr *pattrs, int nb_events) { char buf[BUFSIZ]; struct tracepoint_path *tps; @@ -530,7 +530,7 @@ void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters) page_size = getpagesize(); write_or_die(&page_size, 4); - tps = get_tracepoints_path(pattrs, nb_counters); + tps = get_tracepoints_path(pattrs, nb_events); read_header_files(); read_ftrace_files(tps); diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index d35ebf1e29f..693f815c942 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -240,6 +240,6 @@ unsigned long long raw_field_value(struct event *event, const char *name, void *data); void *raw_field_ptr(struct event *event, const char *name, void *data); -void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters); +void read_tracing_data(struct perf_event_attr *pattrs, int nb_events); #endif /* _TRACE_EVENTS_H */ -- cgit v1.2.3-70-g09d2 From 57c0c15b5244320065374ad2c54f4fbec77a6428 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 12:20:38 +0200 Subject: perf: Tidy up after the big rename - provide compatibility Kconfig entry for existing PERF_COUNTERS .config's - provide courtesy copy of old perf_counter.h, for user-space projects - small indentation fixups - fix up MAINTAINERS - fix small x86 printout fallout - fix up small PowerPC comment fallout (use 'counter' as in register) Reviewed-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- MAINTAINERS | 2 +- arch/powerpc/include/asm/paca.h | 2 +- arch/powerpc/kernel/perf_event.c | 12 +- arch/x86/kernel/cpu/perf_event.c | 14 +- include/linux/perf_counter.h | 441 +++++++++++++++++++++++++++++++++++++++ include/linux/perf_event.h | 98 ++++----- init/Kconfig | 37 +++- kernel/perf_event.c | 4 +- 8 files changed, 534 insertions(+), 76 deletions(-) create mode 100644 include/linux/perf_counter.h (limited to 'arch/powerpc/include/asm') diff --git a/MAINTAINERS b/MAINTAINERS index 43761a00e3f..751a307dc44 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4000,7 +4000,7 @@ S: Maintained F: include/linux/delayacct.h F: kernel/delayacct.c -PERFORMANCE COUNTER SUBSYSTEM +PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra M: Paul Mackerras M: Ingo Molnar diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 154f405b642..7d8514cecea 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -122,7 +122,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ - u8 perf_event_pending; /* PM interrupt while soft-disabled */ + u8 perf_event_pending; /* PM interrupt while soft-disabled */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index c98321fcb45..197b7d95879 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -41,7 +41,7 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); struct power_pmu *ppmu; /* - * Normally, to ignore kernel events we set the FCS (freeze events + * Normally, to ignore kernel events we set the FCS (freeze counters * in supervisor mode) bit in MMCR0, but if the kernel runs with the * hypervisor bit set in the MSR, or if we are running on a processor * where the hypervisor bit is forced to 1 (as on Apple G5 processors), @@ -159,7 +159,7 @@ void perf_event_print_debug(void) } /* - * Read one performance monitor event (PMC). + * Read one performance monitor counter (PMC). */ static unsigned long read_pmc(int idx) { @@ -409,7 +409,7 @@ static void power_pmu_read(struct perf_event *event) val = read_pmc(event->hw.idx); } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); - /* The events are only 32 bits wide */ + /* The counters are only 32 bits wide */ delta = (val - prev) & 0xfffffffful; atomic64_add(delta, &event->count); atomic64_sub(delta, &event->hw.period_left); @@ -543,7 +543,7 @@ void hw_perf_disable(void) } /* - * Set the 'freeze events' bit. + * Set the 'freeze counters' bit. * The barrier is to make sure the mtspr has been * executed and the PMU has frozen the events * before we return. @@ -1124,7 +1124,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) } /* - * A event has overflowed; update its count and record + * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled * here so there is no possibility of being interrupted. */ @@ -1271,7 +1271,7 @@ static void perf_event_interrupt(struct pt_regs *regs) /* * Reset MMCR0 to its normal value. This will set PMXE and - * clear FC (freeze events) and PMAO (perf mon alert occurred) + * clear FC (freeze counters) and PMAO (perf mon alert occurred) * and thus allow interrupts to occur again. * XXX might want to use MSR.PM to keep the events frozen until * we get back out of this interrupt. diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0d03629fb1a..a3c7adb06b7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2081,13 +2081,13 @@ void __init init_hw_perf_events(void) perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.event_bits); - pr_info("... generic events: %d\n", x86_pmu.num_events); - pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); - pr_info("... event mask: %016Lx\n", perf_event_mask); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); } static inline void x86_pmu_read(struct perf_event *event) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 00000000000..368bd70f1d2 --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,441 @@ +/* + * NOTE: this file will be removed in a future kernel release, it is + * provided as a courtesy copy of user-space code that relies on the + * old (pre-rename) symbols and constants. + * + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * Generalized performance counter event types, used by the + * attr.event_id parameter of the sys_perf_counter_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache counters: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. + */ +enum perf_counter_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ +}; + +/* + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. + */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + + __reserved_1 : 49; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + __u32 __reserved_2; + + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) + +enum perf_counter_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * u32 seq; + * s64 count; + * + * do { + * seq = pc->lock; + * + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ + __u64 time_enabled; /* time counter active */ + __u64 time_running; /* time counter on cpu */ + + /* + * Hole for extension of the self monitor capabilities + */ + + __u64 __reserved[123]; /* align to 1k */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an rmb(), on + * SMP capable platforms, after reading this value -- see + * perf_counter_wakeup(). + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data. In this case + * the kernel will not over-write unread data. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ +}; + +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +enum perf_event_type { + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ + PERF_EVENT_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * }; + */ + PERF_EVENT_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * }; + */ + PERF_EVENT_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * { u64 time; } && PERF_SAMPLE_TIME + * }; + */ + PERF_EVENT_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * }; + */ + PERF_EVENT_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * }; + */ + PERF_EVENT_SAMPLE = 9, + + PERF_EVENT_MAX, /* non-ABI */ +}; + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) + +/* + * In case some app still references the old symbols: + */ + +#define __NR_perf_counter_open __NR_perf_event_open + +#define PR_TASK_PERF_COUNTERS_DISABLE PR_TASK_PERF_EVENTS_DISABLE +#define PR_TASK_PERF_COUNTERS_ENABLE PR_TASK_PERF_EVENTS_ENABLE + +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ae9d9ed6df2..acefaf71e6d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1,15 +1,15 @@ /* - * Performance events: + * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra * - * Data type definitions, declarations, prototypes. + * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * - * For licencing details see kernel-base/COPYING + * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H @@ -131,19 +131,19 @@ enum perf_event_sample_format { * as specified by attr.read_format: * * struct read_format { - * { u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 id; } && PERF_FORMAT_ID - * } && !PERF_FORMAT_GROUP + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP * - * { u64 nr; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 value; - * { u64 id; } && PERF_FORMAT_ID - * } cntr[nr]; - * } && PERF_FORMAT_GROUP + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP * }; */ enum perf_event_read_format { @@ -152,7 +152,7 @@ enum perf_event_read_format { PERF_FORMAT_ID = 1U << 2, PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -216,8 +216,8 @@ struct perf_event_attr { * Ioctls that can be done on a perf event fd: */ #define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) #define PERF_EVENT_IOC_RESET _IO ('$', 3) #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, u64) #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) @@ -314,9 +314,9 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u64 id; - * u64 lost; + * struct perf_event_header header; + * u64 id; + * u64 lost; * }; */ PERF_RECORD_LOST = 2, @@ -383,23 +383,23 @@ enum perf_event_type { * { u64 id; } && PERF_SAMPLE_ID * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID * { u32 cpu, res; } && PERF_SAMPLE_CPU - * { u64 period; } && PERF_SAMPLE_PERIOD + * { u64 period; } && PERF_SAMPLE_PERIOD * * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN * - * # - * # The RAW record below is opaque data wrt the ABI - * # - * # That is, the ABI doesn't make any promises wrt to - * # the stability of its content, it may vary depending - * # on event_id, hardware, kernel version and phase of - * # the moon. - * # - * # In other words, PERF_SAMPLE_RAW contents are not an ABI. - * # + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW @@ -503,10 +503,10 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { - PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, - PERF_EVENT_STATE_ACTIVE = 1, + PERF_EVENT_STATE_ACTIVE = 1, }; struct file; @@ -529,7 +529,7 @@ struct perf_mmap_data { long watermark; /* wakeup watermark */ - struct perf_event_mmap_page *user_page; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -694,14 +694,14 @@ struct perf_cpu_context { }; struct perf_output_handle { - struct perf_event *event; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; + struct perf_event *event; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int sample; + int locked; + unsigned long flags; }; #ifdef CONFIG_PERF_EVENTS @@ -829,22 +829,22 @@ static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void -perf_event_task_tick(struct task_struct *task, int cpu) { } +perf_event_task_tick(struct task_struct *task, int cpu) { } static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } -static inline void perf_event_do_pending(void) { } -static inline void perf_event_print_debug(void) { } +static inline void perf_event_do_pending(void) { } +static inline void perf_event_print_debug(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } -static inline int perf_event_task_disable(void) { return -EINVAL; } -static inline int perf_event_task_enable(void) { return -EINVAL; } +static inline int perf_event_task_disable(void) { return -EINVAL; } +static inline int perf_event_task_enable(void) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } -static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } diff --git a/init/Kconfig b/init/Kconfig index cfdf5c32280..706728be312 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -920,26 +920,31 @@ config HAVE_PERF_EVENTS help See tools/perf/design.txt for details. -menu "Performance Counters" +menu "Kernel Performance Events And Counters" config PERF_EVENTS - bool "Kernel Performance Counters" - default y if PROFILING + bool "Kernel performance events and counters" + default y if (PROFILING || PERF_COUNTERS) depends on HAVE_PERF_EVENTS select ANON_INODES help - Enable kernel support for performance counter hardware. + Enable kernel support for various performance events provided + by software and hardware. - Performance counters are special hardware registers available - on most modern CPUs. These registers count the number of certain + Software events are supported either build-in or via the + use of generic tracepoints. + + Most modern CPUs support performance events via performance + counter registers. These registers count the number of certain types of hw events: such as instructions executed, cachemisses suffered, or branches mis-predicted - without slowing down the kernel or applications. These registers can also trigger interrupts when a threshold number of events have passed - and can thus be used to profile the code that runs on that CPU. - The Linux Performance Counter subsystem provides an abstraction of - these hardware capabilities, available via a system call. It + The Linux Performance Event subsystem provides an abstraction of + these software and hardware cevent apabilities, available via a + system call and used by the "perf" utility in tools/perf/. It provides per task and per CPU counters, and it provides event capabilities on top of those. @@ -950,14 +955,26 @@ config EVENT_PROFILE depends on PERF_EVENTS && EVENT_TRACING default y help - Allow the use of tracepoints as software performance counters. + Allow the use of tracepoints as software performance events. - When this is enabled, you can create perf counters based on + When this is enabled, you can create perf events based on tracepoints using PERF_TYPE_TRACEPOINT and the tracepoint ID found in debugfs://tracing/events/*/*/id. (The -e/--events option to the perf tool can parse and interpret symbolic tracepoints, in the subsystem:tracepoint_name format.) +config PERF_COUNTERS + bool "Kernel performance counters (old config option)" + depends on HAVE_PERF_EVENTS + help + This config has been obsoleted by the PERF_EVENTS + config option - please see that one for details. + + It has no effect on the kernel whether you enable + it or not, it is a compatibility placeholder. + + Say N if unsure. + endmenu config VM_EVENT_COUNTERS diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6e8b99a04e1..76ac4db405e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1,12 +1,12 @@ /* - * Performance event core code + * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * - * For licensing details see kernel-base/COPYING + * For licensing details see kernel-base/COPYING */ #include -- cgit v1.2.3-70-g09d2 From a8f90e906783f1f815120eefe813b23cb396e9bd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 22 Sep 2009 09:48:08 +1000 Subject: perf_event, powerpc: Fix compilation after big perf_counter rename This fixes two places in the powerpc perf_event (perf_counter) code where 'list_entry' needs to be changed to 'group_entry', but were missed in commit 65abc865 ("perf_counter: Rename list_entry -> group_entry, counter_list -> group_list"). This also changes 'event' back to 'counter' in a couple of contexts: * Field and function names that deal with the limited-function counters: it's really the hardware counters whose function is limited, not the events that they count. Hence: MAX_LIMITED_HWEVENTS -> MAX_LIMITED_HWCOUNTERS limited_event -> limited_counter freeze/thaw_limited_events -> freeze/thaw_limited_counters * The machine-specific PMU description struct (struct power_pmu): this renames 'n_event' back to 'n_counter' since it really describes how many hardware counters the machine has. (Renaming this back avoids a compile error in each of the machine-specific PMU back-ends where they initialize their power_pmu struct.) Signed-off-by: Paul Mackerras Cc: linuxppc-dev@ozlabs.org Cc: Peter Zijlstra LKML-Reference: <19128.4280.813369.589704@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_event.h | 4 ++-- arch/powerpc/kernel/perf_event.c | 38 +++++++++++++++++------------------ 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index 2499aaadaeb..3288ce3997e 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -14,7 +14,7 @@ #define MAX_HWEVENTS 8 #define MAX_EVENT_ALTERNATIVES 8 -#define MAX_LIMITED_HWEVENTS 2 +#define MAX_LIMITED_HWCOUNTERS 2 /* * This struct provides the constants and functions needed to @@ -22,7 +22,7 @@ */ struct power_pmu { const char *name; - int n_event; + int n_counter; int max_alternatives; unsigned long add_fields; unsigned long test_adder; diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 197b7d95879..bbcbae183e9 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -30,8 +30,8 @@ struct cpu_hw_events { u64 events[MAX_HWEVENTS]; unsigned int flags[MAX_HWEVENTS]; unsigned long mmcr[3]; - struct perf_event *limited_event[MAX_LIMITED_HWEVENTS]; - u8 limited_hwidx[MAX_LIMITED_HWEVENTS]; + struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS]; + u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; @@ -253,7 +253,7 @@ static int power_check_constraints(struct cpu_hw_events *cpuhw, unsigned long addf = ppmu->add_fields; unsigned long tadd = ppmu->test_adder; - if (n_ev > ppmu->n_event) + if (n_ev > ppmu->n_counter) return -1; /* First see if the events will go on as-is */ @@ -426,7 +426,7 @@ static int is_limited_pmc(int pmcnum) && (pmcnum == 5 || pmcnum == 6); } -static void freeze_limited_events(struct cpu_hw_events *cpuhw, +static void freeze_limited_counters(struct cpu_hw_events *cpuhw, unsigned long pmc5, unsigned long pmc6) { struct perf_event *event; @@ -434,7 +434,7 @@ static void freeze_limited_events(struct cpu_hw_events *cpuhw, int i; for (i = 0; i < cpuhw->n_limited; ++i) { - event = cpuhw->limited_event[i]; + event = cpuhw->limited_counter[i]; if (!event->hw.idx) continue; val = (event->hw.idx == 5) ? pmc5 : pmc6; @@ -445,7 +445,7 @@ static void freeze_limited_events(struct cpu_hw_events *cpuhw, } } -static void thaw_limited_events(struct cpu_hw_events *cpuhw, +static void thaw_limited_counters(struct cpu_hw_events *cpuhw, unsigned long pmc5, unsigned long pmc6) { struct perf_event *event; @@ -453,7 +453,7 @@ static void thaw_limited_events(struct cpu_hw_events *cpuhw, int i; for (i = 0; i < cpuhw->n_limited; ++i) { - event = cpuhw->limited_event[i]; + event = cpuhw->limited_counter[i]; event->hw.idx = cpuhw->limited_hwidx[i]; val = (event->hw.idx == 5) ? pmc5 : pmc6; atomic64_set(&event->hw.prev_count, val); @@ -495,9 +495,9 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) "i" (SPRN_PMC5), "i" (SPRN_PMC6)); if (mmcr0 & MMCR0_FC) - freeze_limited_events(cpuhw, pmc5, pmc6); + freeze_limited_counters(cpuhw, pmc5, pmc6); else - thaw_limited_events(cpuhw, pmc5, pmc6); + thaw_limited_counters(cpuhw, pmc5, pmc6); /* * Write the full MMCR0 including the event overflow interrupt @@ -653,7 +653,7 @@ void hw_perf_enable(void) continue; idx = hwc_index[i] + 1; if (is_limited_pmc(idx)) { - cpuhw->limited_event[n_lim] = event; + cpuhw->limited_counter[n_lim] = event; cpuhw->limited_hwidx[n_lim] = idx; ++n_lim; continue; @@ -702,7 +702,7 @@ static int collect_events(struct perf_event *group, int max_count, flags[n] = group->hw.event_base; events[n++] = group->hw.config; } - list_for_each_entry(event, &group->sibling_list, list_entry) { + list_for_each_entry(event, &group->sibling_list, group_entry) { if (!is_software_event(event) && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) @@ -742,7 +742,7 @@ int hw_perf_group_sched_in(struct perf_event *group_leader, return 0; cpuhw = &__get_cpu_var(cpu_hw_events); n0 = cpuhw->n_events; - n = collect_events(group_leader, ppmu->n_event - n0, + n = collect_events(group_leader, ppmu->n_counter - n0, &cpuhw->event[n0], &cpuhw->events[n0], &cpuhw->flags[n0]); if (n < 0) @@ -764,7 +764,7 @@ int hw_perf_group_sched_in(struct perf_event *group_leader, cpuctx->active_oncpu += n; n = 1; event_sched_in(group_leader, cpu); - list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { + list_for_each_entry(sub, &group_leader->sibling_list, group_entry) { if (sub->state != PERF_EVENT_STATE_OFF) { event_sched_in(sub, cpu); ++n; @@ -797,7 +797,7 @@ static int power_pmu_enable(struct perf_event *event) */ cpuhw = &__get_cpu_var(cpu_hw_events); n0 = cpuhw->n_events; - if (n0 >= ppmu->n_event) + if (n0 >= ppmu->n_counter) goto out; cpuhw->event[n0] = event; cpuhw->events[n0] = event->hw.config; @@ -848,11 +848,11 @@ static void power_pmu_disable(struct perf_event *event) } } for (i = 0; i < cpuhw->n_limited; ++i) - if (event == cpuhw->limited_event[i]) + if (event == cpuhw->limited_counter[i]) break; if (i < cpuhw->n_limited) { while (++i < cpuhw->n_limited) { - cpuhw->limited_event[i-1] = cpuhw->limited_event[i]; + cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; } --cpuhw->n_limited; @@ -1078,7 +1078,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) */ n = 0; if (event->group_leader != event) { - n = collect_events(event->group_leader, ppmu->n_event - 1, + n = collect_events(event->group_leader, ppmu->n_counter - 1, ctrs, events, cflags); if (n < 0) return ERR_PTR(-EINVAL); @@ -1230,7 +1230,7 @@ static void perf_event_interrupt(struct pt_regs *regs) int nmi; if (cpuhw->n_limited) - freeze_limited_events(cpuhw, mfspr(SPRN_PMC5), + freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), mfspr(SPRN_PMC6)); perf_read_regs(regs); @@ -1260,7 +1260,7 @@ static void perf_event_interrupt(struct pt_regs *regs) * Any that we processed in the previous loop will not be negative. */ if (!found) { - for (i = 0; i < ppmu->n_event; ++i) { + for (i = 0; i < ppmu->n_counter; ++i) { if (is_limited_pmc(i + 1)) continue; val = read_pmc(i + 1); -- cgit v1.2.3-70-g09d2 From 90f72aa58bbf076b68e289fbd71eb829bc505923 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 21 Sep 2009 17:03:45 -0700 Subject: mm: add MAP_HUGETLB for mmaping pseudo-anonymous huge page regions Add a flag for mmap that will be used to request a huge page region that will look like anonymous memory to user space. This is accomplished by using a file on the internal vfsmount. MAP_HUGETLB is a modifier of MAP_ANONYMOUS and so must be specified with it. The region will behave the same as a MAP_ANONYMOUS region using small pages. The patch also adds the MAP_STACK flag, which was previously defined only on some architectures but not on others. Since MAP_STACK is meant to be a hint only, architectures can define it without assigning a specific meaning to it. Signed-off-by: Arnd Bergmann Cc: Eric B Munson Cc: Hugh Dickins Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/mman.h | 2 ++ arch/arm/include/asm/mman.h | 2 ++ arch/avr32/include/asm/mman.h | 2 ++ arch/cris/include/asm/mman.h | 2 ++ arch/frv/include/asm/mman.h | 2 ++ arch/h8300/include/asm/mman.h | 2 ++ arch/ia64/include/asm/mman.h | 2 ++ arch/m32r/include/asm/mman.h | 2 ++ arch/m68k/include/asm/mman.h | 2 ++ arch/mips/include/asm/mman.h | 2 ++ arch/mn10300/include/asm/mman.h | 2 ++ arch/parisc/include/asm/mman.h | 2 ++ arch/powerpc/include/asm/mman.h | 2 ++ arch/s390/include/asm/mman.h | 2 ++ arch/sparc/include/asm/mman.h | 2 ++ arch/xtensa/include/asm/mman.h | 2 ++ include/asm-generic/mman.h | 1 + 17 files changed, 33 insertions(+) (limited to 'arch/powerpc/include/asm') diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h index c77c55756a7..99c56d47879 100644 --- a/arch/alpha/include/asm/mman.h +++ b/arch/alpha/include/asm/mman.h @@ -28,6 +28,8 @@ #define MAP_NORESERVE 0x10000 /* don't check for reservations */ #define MAP_POPULATE 0x20000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x40000 /* do not block on IO */ +#define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x100000 /* create a huge page mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/arm/include/asm/mman.h b/arch/arm/include/asm/mman.h index fc26976d8e3..6464d471bc7 100644 --- a/arch/arm/include/asm/mman.h +++ b/arch/arm/include/asm/mman.h @@ -10,6 +10,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) page tables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/avr32/include/asm/mman.h b/arch/avr32/include/asm/mman.h index 9a92b15f6a6..38cea1b597c 100644 --- a/arch/avr32/include/asm/mman.h +++ b/arch/avr32/include/asm/mman.h @@ -10,6 +10,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) page tables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/cris/include/asm/mman.h b/arch/cris/include/asm/mman.h index b7f0afba3ce..de6b903b22c 100644 --- a/arch/cris/include/asm/mman.h +++ b/arch/cris/include/asm/mman.h @@ -12,6 +12,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/frv/include/asm/mman.h b/arch/frv/include/asm/mman.h index 58c1d11e2ac..1939343322b 100644 --- a/arch/frv/include/asm/mman.h +++ b/arch/frv/include/asm/mman.h @@ -10,6 +10,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/h8300/include/asm/mman.h b/arch/h8300/include/asm/mman.h index cf35f0a6f12..eacacd04032 100644 --- a/arch/h8300/include/asm/mman.h +++ b/arch/h8300/include/asm/mman.h @@ -10,6 +10,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/ia64/include/asm/mman.h b/arch/ia64/include/asm/mman.h index 48cf8b98a0b..cf55884e7f3 100644 --- a/arch/ia64/include/asm/mman.h +++ b/arch/ia64/include/asm/mman.h @@ -18,6 +18,8 @@ #define MAP_NORESERVE 0x04000 /* don't check for reservations */ #define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/m32r/include/asm/mman.h b/arch/m32r/include/asm/mman.h index 04a5f40aa40..d191089808f 100644 --- a/arch/m32r/include/asm/mman.h +++ b/arch/m32r/include/asm/mman.h @@ -10,6 +10,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/m68k/include/asm/mman.h b/arch/m68k/include/asm/mman.h index 9f5c4c4b3c7..c421fef55f5 100644 --- a/arch/m68k/include/asm/mman.h +++ b/arch/m68k/include/asm/mman.h @@ -10,6 +10,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h index f15554d1518..a2250f390a2 100644 --- a/arch/mips/include/asm/mman.h +++ b/arch/mips/include/asm/mman.h @@ -46,6 +46,8 @@ #define MAP_LOCKED 0x8000 /* pages are locked */ #define MAP_POPULATE 0x10000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ +#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x80000 /* create a huge page mapping */ /* * Flags for msync diff --git a/arch/mn10300/include/asm/mman.h b/arch/mn10300/include/asm/mman.h index d04fac1da5a..94611c356bb 100644 --- a/arch/mn10300/include/asm/mman.h +++ b/arch/mn10300/include/asm/mman.h @@ -21,6 +21,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h index a12d9d43f50..9749c8afe83 100644 --- a/arch/parisc/include/asm/mman.h +++ b/arch/parisc/include/asm/mman.h @@ -22,6 +22,8 @@ #define MAP_GROWSDOWN 0x8000 /* stack-like segment */ #define MAP_POPULATE 0x10000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ +#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x80000 /* create a huge page mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index 7b1c49811a2..d4a7f645c5d 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -25,6 +25,8 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #ifdef __KERNEL__ #ifdef CONFIG_PPC64 diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h index f63fe7b431e..22714ca181a 100644 --- a/arch/s390/include/asm/mman.h +++ b/arch/s390/include/asm/mman.h @@ -18,6 +18,8 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/arch/sparc/include/asm/mman.h b/arch/sparc/include/asm/mman.h index 988192e8e95..c3029ad6619 100644 --- a/arch/sparc/include/asm/mman.h +++ b/arch/sparc/include/asm/mman.h @@ -20,6 +20,8 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h index 6e55b4d1f9c..fca4db425f6 100644 --- a/arch/xtensa/include/asm/mman.h +++ b/arch/xtensa/include/asm/mman.h @@ -53,6 +53,8 @@ #define MAP_LOCKED 0x8000 /* pages are locked */ #define MAP_POPULATE 0x10000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ +#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x80000 /* create a huge page mapping */ /* * Flags for msync diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h index 7cab4de2bca..32c8bd6a196 100644 --- a/include/asm-generic/mman.h +++ b/include/asm-generic/mman.h @@ -11,6 +11,7 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ -- cgit v1.2.3-70-g09d2 From b966cd6b285d4cd6feaf8b06b21bc87adb907929 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:25 -0600 Subject: cpumask: remove the now-obsoleted pcibus_to_cpumask(): powerpc cpumask_of_pcibus() is the new version. Signed-off-by: Rusty Russell --- arch/powerpc/include/asm/topology.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 394edcbcce7..9a3300d6a27 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -36,11 +36,6 @@ static inline int pcibus_to_node(struct pci_bus *bus) } #endif -#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ - CPU_MASK_ALL : \ - node_to_cpumask(pcibus_to_node(bus)) \ - ) - #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) -- cgit v1.2.3-70-g09d2 From 29c337a034b5526e80a785409d15d3b7c7edecf4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:26 -0600 Subject: cpumask: remove obsolete node_to_cpumask now everyone uses cpumask_of_node Signed-off-by: Rusty Russell --- arch/alpha/include/asm/topology.h | 17 ----------------- arch/ia64/include/asm/topology.h | 1 - arch/mips/include/asm/mach-ip27/topology.h | 1 - arch/mips/sgi-ip27/ip27-memory.c | 2 +- arch/powerpc/include/asm/topology.h | 5 ----- arch/sh/include/asm/topology.h | 1 - arch/sparc/include/asm/topology_64.h | 14 -------------- include/asm-generic/topology.h | 17 ----------------- 8 files changed, 1 insertion(+), 57 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h index f5bd6cd4b3b..36b3a30ba0e 100644 --- a/arch/alpha/include/asm/topology.h +++ b/arch/alpha/include/asm/topology.h @@ -22,23 +22,6 @@ static inline int cpu_to_node(int cpu) return node; } -static inline cpumask_t node_to_cpumask(int node) -{ - cpumask_t node_cpu_mask = CPU_MASK_NONE; - int cpu; - - for_each_online_cpu(cpu) { - if (cpu_to_node(cpu) == node) - cpu_set(cpu, node_cpu_mask); - } - -#ifdef DEBUG_NUMA - printk("node %d: cpu_mask: %016lx\n", node, node_cpu_mask); -#endif - - return node_cpu_mask; -} - extern struct cpumask node_to_cpumask_map[]; /* FIXME: This is dumb, recalculating every time. But simple. */ static const struct cpumask *cpumask_of_node(int node) diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index d0141fbf51d..e85da7f1db5 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -33,7 +33,6 @@ /* * Returns a bitmask of CPUs on Node 'node'. */ -#define node_to_cpumask(node) (node_to_cpu_mask[node]) #define cpumask_of_node(node) (&node_to_cpu_mask[node]) /* diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index 697244a7d39..f6837422fe6 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -24,7 +24,6 @@ extern struct cpuinfo_ip27 sn_cpu_info[NR_CPUS]; #define cpu_to_node(cpu) (sn_cpu_info[(cpu)].p_nodeid) #define parent_node(node) (node) -#define node_to_cpumask(node) (hub_data(node)->h_cpus) #define cpumask_of_node(node) (&hub_data(node)->h_cpus) struct pci_bus; extern int pcibus_to_node(struct pci_bus *); diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 060d853d7b3..f61c164d1e6 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -421,7 +421,7 @@ static void __init node_mem_init(cnodeid_t node) /* * A node with nothing. We use it to avoid any special casing in - * node_to_cpumask + * cpumask_of_node */ static struct node_data null_node = { .hub = { diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 9a3300d6a27..829bf3c9b68 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -17,11 +17,6 @@ static inline int cpu_to_node(int cpu) #define parent_node(node) (node) -static inline cpumask_t node_to_cpumask(int node) -{ - return numa_cpumask_lookup_table[node]; -} - #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node]) int of_node_to_nid(struct device_node *device); diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index f8c40cc6505..65e7bd2f224 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -31,7 +31,6 @@ #define cpu_to_node(cpu) ((void)(cpu),0) #define parent_node(node) ((void)(node),0) -#define node_to_cpumask(node) ((void)node, cpu_online_map) #define cpumask_of_node(node) ((void)node, cpu_online_mask) #define pcibus_to_node(bus) ((void)(bus), -1) diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 26cd25c0839..75752e106f4 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -12,22 +12,8 @@ static inline int cpu_to_node(int cpu) #define parent_node(node) (node) -static inline cpumask_t node_to_cpumask(int node) -{ - return numa_cpumask_lookup_table[node]; -} #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node]) -/* - * Returns a pointer to the cpumask of CPUs on Node 'node'. - * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" - */ -#define node_to_cpumask_ptr(v, node) \ - cpumask_t *v = &(numa_cpumask_lookup_table[node]) - -#define node_to_cpumask_ptr_next(v, node) \ - v = &(numa_cpumask_lookup_table[node]) - struct pci_bus; #ifdef CONFIG_PCI extern int pcibus_to_node(struct pci_bus *pbus); diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 88bada2ebc4..510df36dd5d 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -37,9 +37,6 @@ #ifndef parent_node #define parent_node(node) ((void)(node),0) #endif -#ifndef node_to_cpumask -#define node_to_cpumask(node) ((void)node, cpu_online_map) -#endif #ifndef cpumask_of_node #define cpumask_of_node(node) ((void)node, cpu_online_mask) #endif @@ -55,18 +52,4 @@ #endif /* CONFIG_NUMA */ -/* - * returns pointer to cpumask for specified node - * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" - */ -#ifndef node_to_cpumask_ptr - -#define node_to_cpumask_ptr(v, node) \ - cpumask_t _##v = node_to_cpumask(node); \ - const cpumask_t *v = &_##v - -#define node_to_cpumask_ptr_next(v, node) \ - _##v = node_to_cpumask(node) -#endif - #endif /* _ASM_GENERIC_TOPOLOGY_H */ -- cgit v1.2.3-70-g09d2 From 399d0682704144ddadb27164343a265774d8b301 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:42 -0600 Subject: cpumask: remove obsolete topology_core_siblings and topology_thread_siblings: powerpc There were replaced by topology_core_cpumask and topology_thread_cpumask. Signed-off-by: Rusty Russell --- arch/powerpc/include/asm/topology.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 829bf3c9b68..22f738d12ad 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -94,8 +94,6 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev, #ifdef CONFIG_PPC64 #include -#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) -#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) #define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) #define topology_core_id(cpu) (cpu_to_core_id(cpu)) -- cgit v1.2.3-70-g09d2 From f063ea02fba5782099b6730d5733ee44638df8f9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:45 -0600 Subject: cpumask: arch_send_call_function_ipi_mask: powerpc We're weaning the core code off handing cpumask's around on-stack. This introduces arch_send_call_function_ipi_mask(), and by defining it, the old arch_send_call_function_ipi is defined by the core code. Signed-off-by: Rusty Russell --- arch/powerpc/include/asm/smp.h | 3 ++- arch/powerpc/kernel/smp.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index c0d3b8af931..1491bfe822d 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -146,7 +146,8 @@ extern void smp_generic_take_timebase(void); extern struct smp_ops_t *smp_ops; extern void arch_send_call_function_single_ipi(int cpu); -extern void arch_send_call_function_ipi(cpumask_t mask); +extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask /* Definitions relative to the secondary CPU spin loop * and entry point. Not all of them exist on both 32 and diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index d387b3937cc..7f68ceb3bdb 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -189,11 +189,11 @@ void arch_send_call_function_single_ipi(int cpu) smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE); } -void arch_send_call_function_ipi(cpumask_t mask) +void arch_send_call_function_ipi_mask(const struct cpumask *mask) { unsigned int cpu; - for_each_cpu_mask(cpu, mask) + for_each_cpu(cpu, mask) smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNCTION); } -- cgit v1.2.3-70-g09d2 From 0748bd01773395003208996c4c0b3f80caf80976 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:46 -0600 Subject: cpumask: remove arch_send_call_function_ipi Now everyone is converted to arch_send_call_function_ipi_mask, remove the shim and the #defines. Signed-off-by: Rusty Russell --- arch/alpha/include/asm/smp.h | 1 - arch/arm/include/asm/smp.h | 1 - arch/ia64/include/asm/smp.h | 1 - arch/m32r/include/asm/smp.h | 1 - arch/mips/include/asm/smp.h | 1 - arch/parisc/include/asm/smp.h | 1 - arch/powerpc/include/asm/smp.h | 1 - arch/s390/include/asm/smp.h | 1 - arch/sh/include/asm/smp.h | 1 - arch/sparc/include/asm/smp_64.h | 1 - arch/x86/include/asm/smp.h | 1 - kernel/smp.c | 7 ------- 12 files changed, 18 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/alpha/include/asm/smp.h b/arch/alpha/include/asm/smp.h index 8818a1bcdc8..3f390e8cc0b 100644 --- a/arch/alpha/include/asm/smp.h +++ b/arch/alpha/include/asm/smp.h @@ -48,7 +48,6 @@ extern int smp_num_cpus; extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #else /* CONFIG_SMP */ diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h index a06e735b262..e0d763be184 100644 --- a/arch/arm/include/asm/smp.h +++ b/arch/arm/include/asm/smp.h @@ -93,7 +93,6 @@ extern void platform_cpu_enable(unsigned int cpu); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask /* * show local interrupt info diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h index d217d1d4e05..0b3b3997dec 100644 --- a/arch/ia64/include/asm/smp.h +++ b/arch/ia64/include/asm/smp.h @@ -127,7 +127,6 @@ extern int is_multithreading_enabled(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #else /* CONFIG_SMP */ diff --git a/arch/m32r/include/asm/smp.h b/arch/m32r/include/asm/smp.h index c2be49d408a..e67ded1aab9 100644 --- a/arch/m32r/include/asm/smp.h +++ b/arch/m32r/include/asm/smp.h @@ -89,7 +89,6 @@ extern unsigned long send_IPI_mask_phys(cpumask_t, int, int); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #endif /* not __ASSEMBLY__ */ diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index 48c1967961a..e15f11a0931 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -79,6 +79,5 @@ extern asmlinkage void smp_call_function_interrupt(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #endif /* __ASM_SMP_H */ diff --git a/arch/parisc/include/asm/smp.h b/arch/parisc/include/asm/smp.h index 21eb45a5262..2e73623feb6 100644 --- a/arch/parisc/include/asm/smp.h +++ b/arch/parisc/include/asm/smp.h @@ -30,7 +30,6 @@ extern void smp_send_all_nop(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #endif /* !ASSEMBLY */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 1491bfe822d..d9ea8d39c34 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -147,7 +147,6 @@ extern struct smp_ops_t *smp_ops; extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask /* Definitions relative to the secondary CPU spin loop * and entry point. Not all of them exist on both 32 and diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 6de62189a48..a868b272c25 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -63,7 +63,6 @@ extern int smp_cpu_polarization[]; extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #endif diff --git a/arch/sh/include/asm/smp.h b/arch/sh/include/asm/smp.h index ca64f43abe6..53ef26ced75 100644 --- a/arch/sh/include/asm/smp.h +++ b/arch/sh/include/asm/smp.h @@ -44,7 +44,6 @@ void plat_send_ipi(unsigned int cpu, unsigned int message); void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #else diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h index becb6bf353a..f49e11cd4de 100644 --- a/arch/sparc/include/asm/smp_64.h +++ b/arch/sparc/include/asm/smp_64.h @@ -36,7 +36,6 @@ extern int sparc64_multi_core; extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask /* * General functions that each host system must provide. diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 6a84ed166ae..1e796782cd7 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu) smp_ops.send_call_func_single_ipi(cpu); } -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) { smp_ops.send_call_func_ipi(mask); diff --git a/kernel/smp.c b/kernel/smp.c index fd47a256a24..c9d1c7835c2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -347,13 +347,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, generic_exec_single(cpu, data, wait); } -/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ - -#ifndef arch_send_call_function_ipi_mask -# define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) -#endif - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). -- cgit v1.2.3-70-g09d2