Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/amd_iommu.c | 666
-rw-r--r-- | arch/x86/kernel/amd_iommu_init.c | 19
-rw-r--r-- | arch/x86/kernel/apic.c | 8
-rw-r--r-- | arch/x86/kernel/bios_uv.c | 2
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/generic.c | 12
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/main.c | 10
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/mtrr.h | 18
-rw-r--r-- | arch/x86/kernel/cpuid.c | 6
-rw-r--r-- | arch/x86/kernel/crash.c | 18
-rw-r--r-- | arch/x86/kernel/early_printk.c | 2
-rw-r--r-- | arch/x86/kernel/genx2apic_phys.c | 4
-rw-r--r-- | arch/x86/kernel/head64.c | 2
-rw-r--r-- | arch/x86/kernel/init_task.c | 1
-rw-r--r-- | arch/x86/kernel/io_apic.c | 20
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 10
-rw-r--r-- | arch/x86/kernel/ldt.c | 4
-rw-r--r-- | arch/x86/kernel/mmconf-fam10h_64.c | 3
-rw-r--r-- | arch/x86/kernel/mpparse.c | 10
-rw-r--r-- | arch/x86/kernel/nmi.c | 3
-rw-r--r-- | arch/x86/kernel/pci-gart_64.c | 2
-rw-r--r-- | arch/x86/kernel/reboot.c | 64
-rw-r--r-- | arch/x86/kernel/tlb_uv.c | 9
-rw-r--r-- | arch/x86/kernel/traps.c | 33
-rw-r--r-- | arch/x86/kernel/xsave.c | 2
24 files changed, 757 insertions, 171 deletions
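
The largest change in this patch is to arch/x86/kernel/amd_iommu.c, which gains a backend for the generic IOMMU API: the driver now fills in a struct iommu_ops and hands it to register_iommu(), so callers such as KVM device assignment can manage protection domains through <linux/iommu.h> rather than the dma_ops path. As orientation only (this sketch is not part of the patch; it assumes the 2.6.29-era iommu_domain_alloc/iommu_map_range entry points and simplifies error handling), a consumer would drive the new backend roughly like this:

	/*
	 * Illustrative sketch, not part of the patch. Assumes the
	 * 2.6.29-era generic IOMMU API declared in <linux/iommu.h>.
	 */
	#include <linux/iommu.h>
	#include <linux/pci.h>

	static int example_assign_device(struct pci_dev *pdev, unsigned long iova,
					 phys_addr_t paddr, size_t size)
	{
		struct iommu_domain *dom;
		int ret;

		dom = iommu_domain_alloc();		/* ends up in amd_iommu_domain_init() */
		if (!dom)
			return -ENOMEM;

		ret = iommu_attach_device(dom, &pdev->dev);	/* amd_iommu_attach_device() */
		if (ret)
			goto out_free;

		ret = iommu_map_range(dom, iova, paddr, size,
				      IOMMU_READ | IOMMU_WRITE);	/* amd_iommu_map_range() */
		if (ret)
			goto out_detach;

		/* ... device DMA uses the iova -> paddr mapping here ... */

		iommu_unmap_range(dom, iova, size);	/* flushes via iommu_flush_domain() */
	out_detach:
		iommu_detach_device(dom, &pdev->dev);
	out_free:
		iommu_domain_free(dom);
		return ret;
	}
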
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 2e2da717b35..5113c080f0c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -20,8 +20,12 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> +#ifdef CONFIG_IOMMU_API +#include <linux/iommu.h> +#endif #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -38,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); +#ifdef CONFIG_IOMMU_API +static struct iommu_ops amd_iommu_ops; +#endif + /* * general struct to manage commands send to an IOMMU */ @@ -47,6 +55,68 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +static struct dma_ops_domain *find_protection_domain(u16 devid); + + +#ifdef CONFIG_AMD_IOMMU_STATS + +/* + * Initialization code for statistics collection + */ + +DECLARE_STATS_COUNTER(compl_wait); +DECLARE_STATS_COUNTER(cnt_map_single); +DECLARE_STATS_COUNTER(cnt_unmap_single); +DECLARE_STATS_COUNTER(cnt_map_sg); +DECLARE_STATS_COUNTER(cnt_unmap_sg); +DECLARE_STATS_COUNTER(cnt_alloc_coherent); +DECLARE_STATS_COUNTER(cnt_free_coherent); +DECLARE_STATS_COUNTER(cross_page); +DECLARE_STATS_COUNTER(domain_flush_single); +DECLARE_STATS_COUNTER(domain_flush_all); +DECLARE_STATS_COUNTER(alloced_io_mem); +DECLARE_STATS_COUNTER(total_map_requests); + +static struct dentry *stats_dir; +static struct dentry *de_isolate; +static struct dentry *de_fflush; + +static void amd_iommu_stats_add(struct __iommu_counter *cnt) +{ + if (stats_dir == NULL) + return; + + cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir, + &cnt->value); +} + +static void amd_iommu_stats_init(void) +{ + stats_dir = debugfs_create_dir("amd-iommu", NULL); + if (stats_dir == NULL) + return; + + de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, + (u32 *)&amd_iommu_isolate); + + de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, + (u32 *)&amd_iommu_unmap_flush); + + amd_iommu_stats_add(&compl_wait); + amd_iommu_stats_add(&cnt_map_single); + amd_iommu_stats_add(&cnt_unmap_single); + amd_iommu_stats_add(&cnt_map_sg); + amd_iommu_stats_add(&cnt_unmap_sg); + amd_iommu_stats_add(&cnt_alloc_coherent); + amd_iommu_stats_add(&cnt_free_coherent); + amd_iommu_stats_add(&cross_page); + amd_iommu_stats_add(&domain_flush_single); + amd_iommu_stats_add(&domain_flush_all); + amd_iommu_stats_add(&alloced_io_mem); + amd_iommu_stats_add(&total_map_requests); +} + +#endif /* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) @@ -189,13 +259,55 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command(iommu, cmd); if (!ret) - iommu->need_sync = 1; + iommu->need_sync = true; spin_unlock_irqrestore(&iommu->lock, flags); return ret; } /* + * This function waits until an IOMMU has completed a completion + * wait command + */ +static void __iommu_wait_for_completion(struct amd_iommu *iommu) +{ + int ready = 0; + unsigned status = 0; + unsigned long i = 0; + + INC_STATS_COUNTER(compl_wait); + + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; + /* wait for the bit to become one */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + } + + 
/* set bit back to zero */ + status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; + writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + + if (unlikely(i == EXIT_LOOP_COUNT)) + panic("AMD IOMMU: Completion wait loop failed\n"); +} + +/* + * This function queues a completion wait command into the command + * buffer of an IOMMU + */ +static int __iommu_completion_wait(struct amd_iommu *iommu) +{ + struct iommu_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; + CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + + return __iommu_queue_command(iommu, &cmd); +} + +/* * This function is called whenever we need to ensure that the IOMMU has * completed execution of all commands we sent. It sends a * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs @@ -204,40 +316,22 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret = 0, ready = 0; - unsigned status = 0; - struct iommu_cmd cmd; - unsigned long flags, i = 0; - - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + int ret = 0; + unsigned long flags; spin_lock_irqsave(&iommu->lock, flags); if (!iommu->need_sync) goto out; - iommu->need_sync = 0; + ret = __iommu_completion_wait(iommu); - ret = __iommu_queue_command(iommu, &cmd); + iommu->need_sync = false; if (ret) goto out; - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; - } - - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - - if (unlikely(i == EXIT_LOOP_COUNT)) - panic("AMD IOMMU: Completion wait loop failed\n"); + __iommu_wait_for_completion(iommu); out: spin_unlock_irqrestore(&iommu->lock, flags); @@ -264,6 +358,21 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return ret; } +static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, + u16 domid, int pde, int s) +{ + memset(cmd, 0, sizeof(*cmd)); + address &= PAGE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); + cmd->data[1] |= domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + if (s) /* size bit - we flush more than one 4kb page */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + /* * Generic command send function for invalidaing TLB entries */ @@ -273,16 +382,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, struct iommu_cmd cmd; int ret; - memset(&cmd, 0, sizeof(cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); - cmd.data[1] |= domid; - cmd.data[2] = lower_32_bits(address); - cmd.data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); ret = iommu_queue_command(iommu, &cmd); @@ -321,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) { u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + INC_STATS_COUNTER(domain_flush_single); + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); } +/* 
+ * This function is used to flush the IO/TLB for a given protection domain + * on every IOMMU in the system + */ +static void iommu_flush_domain(u16 domid) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + INC_STATS_COUNTER(domain_flush_all); + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + list_for_each_entry(iommu, &amd_iommu_list, list) { + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -338,10 +464,10 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) * supporting all features of AMD IOMMU page tables like level skipping * and full 64 bit address spaces. */ -static int iommu_map(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long phys_addr, - int prot) +static int iommu_map_page(struct protection_domain *dom, + unsigned long bus_addr, + unsigned long phys_addr, + int prot) { u64 __pte, *pte, *page; @@ -388,6 +514,28 @@ static int iommu_map(struct protection_domain *dom, return 0; } +static void iommu_unmap_page(struct protection_domain *dom, + unsigned long bus_addr) +{ + u64 *pte; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + *pte = 0; +} + /* * This function checks if a specific unity mapping entry is needed for * this specific IOMMU. @@ -440,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); if (ret) return ret; /* @@ -571,6 +719,16 @@ static u16 domain_id_alloc(void) return id; } +static void domain_id_free(int id) +{ + unsigned long flags; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + if (id > 0 && id < MAX_DOMAIN_ID) + __clear_bit(id, amd_iommu_pd_alloc_bitmap); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + /* * Used to reserve address ranges in the aperture (e.g. for exclusion * ranges. 
@@ -587,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, iommu_area_reserve(dom->bitmap, start_page, pages); } -static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) +static void free_pagetable(struct protection_domain *domain) { int i, j; u64 *p1, *p2, *p3; - p1 = dma_dom->domain.pt_root; + p1 = domain->pt_root; if (!p1) return; @@ -613,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) } free_page((unsigned long)p1); + + domain->pt_root = NULL; } /* @@ -624,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; - dma_ops_free_pagetable(dom); + free_pagetable(&dom->domain); kfree(dom->pte_pages); @@ -663,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, goto free_dma_dom; dma_dom->domain.mode = PAGE_MODE_3_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); + dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; @@ -725,6 +886,15 @@ free_dma_dom: } /* + * little helper function to check whether a given protection domain is a + * dma_ops domain + */ +static bool dma_ops_domain(struct protection_domain *domain) +{ + return domain->flags & PD_DMA_OPS_MASK; +} + +/* * Find out the protection domain structure for a given PCI device. This * will give us the pointer to the page table root for example. */ @@ -744,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid) * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void set_device_domain(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static void attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); + domain->dev_cnt += 1; + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -767,6 +938,116 @@ static void set_device_domain(struct amd_iommu *iommu, iommu_queue_inv_dev_entry(iommu, devid); } +/* + * Removes a device from a protection domain (unlocked) + */ +static void __detach_device(struct protection_domain *domain, u16 devid) +{ + + /* lock domain */ + spin_lock(&domain->lock); + + /* remove domain from the lookup table */ + amd_iommu_pd_table[devid] = NULL; + + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; + + /* decrease reference counter */ + domain->dev_cnt -= 1; + + /* ready */ + spin_unlock(&domain->lock); +} + +/* + * Removes a device from a protection domain (with devtable_lock held) + */ +static void detach_device(struct protection_domain *domain, u16 devid) +{ + unsigned long flags; + + /* lock device table */ + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + __detach_device(domain, devid); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int device_change_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + struct protection_domain *domain; + struct dma_ops_domain *dma_domain; + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + 
unsigned long flags; + + if (devid > amd_iommu_last_bdf) + goto out; + + devid = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + goto out; + + domain = domain_for_device(devid); + + if (domain && !dma_ops_domain(domain)) + WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " + "to a non-dma-ops domain\n", dev_name(dev)); + + switch (action) { + case BUS_NOTIFY_BOUND_DRIVER: + if (domain) + goto out; + dma_domain = find_protection_domain(devid); + if (!dma_domain) + dma_domain = iommu->default_dom; + attach_device(iommu, &dma_domain->domain, devid); + printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " + "device %s\n", dma_domain->domain.id, dev_name(dev)); + break; + case BUS_NOTIFY_UNBIND_DRIVER: + if (!domain) + goto out; + detach_device(domain, devid); + break; + case BUS_NOTIFY_ADD_DEVICE: + /* allocate a protection domain if a device is added */ + dma_domain = find_protection_domain(devid); + if (dma_domain) + goto out; + dma_domain = dma_ops_domain_alloc(iommu, order); + if (!dma_domain) + goto out; + dma_domain->target_dev = devid; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + list_add_tail(&dma_domain->list, &iommu_pd_list); + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + break; + default: + goto out; + } + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); + +out: + return 0; +} + +struct notifier_block device_nb = { + .notifier_call = device_change_notifier, +}; + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -802,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) list_for_each_entry(entry, &iommu_pd_list, list) { if (entry->target_dev == devid) { ret = entry; - list_del(&ret->list); break; } } @@ -853,14 +1133,13 @@ static int get_device_resources(struct device *dev, if (!dma_dom) dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; - set_device_domain(*iommu, *domain, *bdf); + attach_device(*iommu, *domain, *bdf); printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device ", (*domain)->id); - print_devid(_bdf, 1); + "device %s\n", (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) - set_device_domain(*iommu, *domain, _bdf); + attach_device(*iommu, *domain, _bdf); return 1; } @@ -946,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev, pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; + INC_STATS_COUNTER(total_map_requests); + + if (pages > 1) + INC_STATS_COUNTER(cross_page); + if (align) align_mask = (1UL << get_order(size)) - 1; @@ -962,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; + ADD_STATS_COUNTER(alloced_io_mem, size); + if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { iommu_flush_tlb(iommu, dma_dom->domain.id); dma_dom->need_flush = false; @@ -998,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu, start += PAGE_SIZE; } + SUB_STATS_COUNTER(alloced_io_mem, size); + dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { @@ -1019,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, dma_addr_t addr; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_single); + if (!check_device(dev)) return bad_dma_address; @@ -1030,6 +1320,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, /* device not handled by any AMD IOMMU */ return 
(dma_addr_t)paddr; + if (!dma_ops_domain(domain)) + return bad_dma_address; + spin_lock_irqsave(&domain->lock, flags); addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, dma_mask); @@ -1055,11 +1348,16 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_unmap_single); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, dir); @@ -1104,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, int mapped_elems = 0; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_sg); + if (!check_device(dev)) return 0; @@ -1114,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, if (!iommu || !domain) return map_sg_no_iommu(dev, sglist, nelems, dir); + if (!dma_ops_domain(domain)) + return 0; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1163,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, u16 devid; int i; + INC_STATS_COUNTER(cnt_unmap_sg); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1194,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size, phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; + INC_STATS_COUNTER(cnt_alloc_coherent); + if (!check_device(dev)) return NULL; @@ -1212,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size, return virt_addr; } + if (!dma_ops_domain(domain)) + goto out_free; + if (!dma_mask) dma_mask = *dev->dma_mask; @@ -1220,18 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { - free_pages((unsigned long)virt_addr, get_order(size)); - virt_addr = NULL; - goto out; - } + if (*dma_addr == bad_dma_address) + goto out_free; iommu_completion_wait(iommu); -out: spin_unlock_irqrestore(&domain->lock, flags); return virt_addr; + +out_free: + + free_pages((unsigned long)virt_addr, get_order(size)); + + return NULL; } /* @@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_free_coherent); + if (!check_device(dev)) return; @@ -1253,6 +1570,9 @@ static void free_coherent(struct device *dev, size_t size, if (!iommu || !domain) goto free_mem; + if (!dma_ops_domain(domain)) + goto free_mem; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); @@ -1296,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) * we don't need to preallocate the protection domains anymore. * For now we have to. 
*/ -void prealloc_protection_domains(void) +static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; @@ -1305,7 +1625,7 @@ void prealloc_protection_domains(void) u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = (dev->bus->number << 8) | dev->devfn; + devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -1352,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void) iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) return -ENOMEM; + iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; ret = iommu_init_unity_mappings(iommu); if (ret) goto free_domains; @@ -1375,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; + register_iommu(&amd_iommu_ops); + + bus_register_notifier(&pci_bus_type, &device_nb); + + amd_iommu_stats_init(); + return 0; free_domains: @@ -1386,3 +1713,224 @@ free_domains: return ret; } + +/***************************************************************************** + * + * The following functions belong to the exported interface of AMD IOMMU + * + * This interface allows access to lower level functions of the IOMMU + * like protection domain handling and assignement of devices to domains + * which is not possible with the dma_ops interface. + * + *****************************************************************************/ + +static void cleanup_domain(struct protection_domain *domain) +{ + unsigned long flags; + u16 devid; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + + for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) + if (amd_iommu_pd_table[devid] == domain) + __detach_device(domain, devid); + + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + spin_lock_init(&domain->lock); + domain->mode = PAGE_MODE_3_LEVEL; + domain->id = domain_id_alloc(); + if (!domain->id) + goto out_free; + domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); + if (!domain->pt_root) + goto out_free; + + dom->priv = domain; + + return 0; + +out_free: + kfree(domain); + + return -ENOMEM; +} + +static void amd_iommu_domain_destroy(struct iommu_domain *dom) +{ + struct protection_domain *domain = dom->priv; + + if (!domain) + return; + + if (domain->dev_cnt > 0) + cleanup_domain(domain); + + BUG_ON(domain->dev_cnt != 0); + + free_pagetable(domain); + + domain_id_free(domain->id); + + kfree(domain); + + dom->priv = NULL; +} + +static void amd_iommu_detach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != &pci_bus_type) + return; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid > 0) + detach_device(domain, devid); + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return; + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); +} + +static int amd_iommu_attach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct protection_domain *old_domain; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != 
&pci_bus_type) + return -EINVAL; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid >= amd_iommu_last_bdf || + devid != amd_iommu_alias_table[devid]) + return -EINVAL; + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return -EINVAL; + + old_domain = domain_for_device(devid); + if (old_domain) + return -EBUSY; + + attach_device(iommu, domain, devid); + + iommu_completion_wait(iommu); + + return 0; +} + +static int amd_iommu_map_range(struct iommu_domain *dom, + unsigned long iova, phys_addr_t paddr, + size_t size, int iommu_prot) +{ + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE); + int prot = 0; + int ret; + + if (iommu_prot & IOMMU_READ) + prot |= IOMMU_PROT_IR; + if (iommu_prot & IOMMU_WRITE) + prot |= IOMMU_PROT_IW; + + iova &= PAGE_MASK; + paddr &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + ret = iommu_map_page(domain, iova, paddr, prot); + if (ret) + return ret; + + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + } + + return 0; +} + +static void amd_iommu_unmap_range(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE); + + iova &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + iommu_unmap_page(domain, iova); + iova += PAGE_SIZE; + } + + iommu_flush_domain(domain->id); +} + +static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, + unsigned long iova) +{ + struct protection_domain *domain = dom->priv; + unsigned long offset = iova & ~PAGE_MASK; + phys_addr_t paddr; + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + paddr = *pte & IOMMU_PAGE_MASK; + paddr |= offset; + + return paddr; +} + +static struct iommu_ops amd_iommu_ops = { + .domain_init = amd_iommu_domain_init, + .domain_destroy = amd_iommu_domain_destroy, + .attach_dev = amd_iommu_attach_device, + .detach_dev = amd_iommu_detach_device, + .map = amd_iommu_map_range, + .unmap = amd_iommu_unmap_range, + .iova_to_phys = amd_iommu_iova_to_phys, +}; + diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c625800c55c..42c33cebf00 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -122,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ -int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */ +bool amd_iommu_isolate = true; /* if true, device isolation is + enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -243,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -void __init iommu_enable(struct amd_iommu *iommu) +static void __init iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " - "at %02x:%02x.%x cap 0x%hx\n", - iommu->dev->bus->number, - PCI_SLOT(iommu->dev->devfn), - PCI_FUNC(iommu->dev->devfn), - iommu->cap_ptr); 
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } /* Function to enable IOMMU event logging and event interrupts */ -void __init iommu_enable_event_logging(struct amd_iommu *iommu) +static void __init iommu_enable_event_logging(struct amd_iommu *iommu) { iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); @@ -1218,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = 1; + amd_iommu_isolate = true; if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = 0; + amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 99589245fd8..b13d3c4dbd4 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -98,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer); #ifdef HAVE_X2APIC int x2apic; /* x2apic enabled before OS handover */ -int x2apic_preenabled; -int disable_x2apic; +static int x2apic_preenabled; +static int disable_x2apic; static __init int setup_nox2apic(char *str) { disable_x2apic = 1; @@ -226,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id) apic_write(APIC_ICR, low); } -u64 xapic_icr_read(void) +static u64 xapic_icr_read(void) { u32 icr1, icr2; @@ -266,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id) wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } -u64 x2apic_icr_read(void) +static u64 x2apic_icr_read(void) { unsigned long val; diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 2a0a2a3cac2..f63882728d9 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -25,7 +25,7 @@ #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> -struct uv_systab uv_systab; +static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4e8d77f01ee..b59ddcc88cd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -14,14 +14,6 @@ #include <asm/pat.h> #include "mtrr.h" -struct mtrr_state { - struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; - mtrr_type fixed_ranges[NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - struct fixed_range_block { int base_msr; /* start address of an MTRR block */ int ranges; /* number of MTRRs in this block */ @@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = { }; static unsigned long smp_changes_mask; -static struct mtrr_state mtrr_state = {}; static int mtrr_state_set; u64 mtrr_tom2; +struct mtrr_state_type mtrr_state = {}; +EXPORT_SYMBOL_GPL(mtrr_state); + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1159e269e59..d259e5d2e05 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -49,7 +49,7 @@ u32 num_var_ranges = 0; -unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; @@ -574,7 +574,7 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; +static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { @@ -824,16 +824,14 @@ static int enable_mtrr_cleanup __initdata = static int __init disable_mtrr_cleanup_setup(char *str) { - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 0; + enable_mtrr_cleanup = 0; return 0; } early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); static int __init enable_mtrr_cleanup_setup(char *str) { - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 1; + enable_mtrr_cleanup = 1; return 0; } early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2dc4ec656b2..ffd60409cc6 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -8,11 +8,6 @@ #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff -#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) -#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -#define NUM_FIXED_RANGES 88 -#define MAX_VAR_RANGES 256 #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 @@ -29,11 +24,7 @@ #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef u8 mtrr_type; - -extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; @@ -70,13 +61,6 @@ struct set_mtrr_context { u32 ccr3; }; -struct mtrr_var_range { - u32 base_lo; - u32 base_hi; - u32 mask_lo; - u32 mask_hi; -}; - void set_mtrr_done(struct set_mtrr_context *ctxt); void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 62a3c23bd70..2ac1f0c2beb 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -39,10 +39,10 @@ #include <linux/device.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/system.h> static struct class *cpuid_class; @@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) } static ssize_t cpuid_read(struct file *file, char __user *buf, - size_t count, loff_t * ppos) + size_t count, loff_t *ppos) { char __user *tmp = buf; struct cpuid_regs cmd; @@ -117,7 +117,7 @@ static int cpuid_open(struct inode *inode, struct file *file) unsigned int cpu; struct cpuinfo_x86 *c; int ret = 0; - + lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index d84a852e4cd..c689d19e35a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -26,6 +26,7 @@ #include <linux/kdebug.h> #include <asm/smp.h> #include <asm/reboot.h> +#include <asm/virtext.h> #include <mach_ipi.h> 
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) #endif crash_save_cpu(regs, cpu); + /* Disable VMX or SVM if needed. + * + * We need to disable virtualization on all CPUs. + * Having VMX or SVM enabled on any CPU may break rebooting + * after the kdump kernel has finished its task. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + disable_local_APIC(); } @@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs) local_irq_disable(); kdump_nmi_shootdown_cpus(); + + /* Booting kdump kernel with VMX or SVM enabled won't work, + * because (among other limitations) we can't disable paging + * with the virt flags. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 23b138e31e9..504ad198e4a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -886,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...) va_list ap; va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); early_console->write(early_console, buf, n); va_end(ap); } diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 62895cf315f..21bcc0e098b 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -161,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb) return current_cpu_data.initial_apicid >> index_msb; } -void x2apic_send_IPI_self(int vector) +static void x2apic_send_IPI_self(int vector) { apic_write(APIC_SELF_IPI, vector); } -void init_x2apic_ldr(void) +static void init_x2apic_ldr(void) { return; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 388e05a5fc1..b9a4d8c4b93 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,7 +27,7 @@ #include <asm/trampoline.h> /* boot cpu pda */ -static struct x8664_pda _boot_cpu_pda __read_mostly; +static struct x8664_pda _boot_cpu_pda; #ifdef CONFIG_SMP /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index d39918076bb..df3bf269bea 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -10,7 +10,6 @@ #include <asm/pgtable.h> #include <asm/desc.h> -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index a25c3f76b8a..3639442aa7a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -170,7 +170,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; -void __init arch_early_irq_init(void) +int __init arch_early_irq_init(void) { struct irq_cfg *cfg; struct irq_desc *desc; @@ -188,6 +188,8 @@ void __init arch_early_irq_init(void) if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } + + return 0; } #ifdef CONFIG_SPARSE_IRQ @@ -230,7 +232,7 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -void arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int cpu) { struct irq_cfg *cfg; @@ -242,6 +244,8 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu) BUG_ON(1); } } + + return 0; } #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC @@ -702,7 +706,7 @@ static void 
__unmask_IO_APIC_irq(struct irq_cfg *cfg) } #ifdef CONFIG_X86_64 -void io_apic_sync(struct irq_pin_list *entry) +static void io_apic_sync(struct irq_pin_list *entry) { /* * Synchronize the IO-APIC and the CPU by doing @@ -1400,8 +1404,6 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; cfg = desc->chip_data; if (!cpumask_test_cpu(cpu, cfg->domain)) continue; @@ -1783,8 +1785,6 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_irq_desc(irq, desc) { struct irq_pin_list *entry; - if (!desc) - continue; cfg = desc->chip_data; entry = cfg->irq_2_pin; if (!entry) @@ -2425,9 +2425,6 @@ static void ir_irq_migration(struct work_struct *work) struct irq_desc *desc; for_each_irq_desc(irq, desc) { - if (!desc) - continue; - if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -2713,9 +2710,6 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; - cfg = desc->chip_data; if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e169ae9b6a6..652fce6d2cc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void) */ static unsigned long kvm_get_tsc_khz(void) { - return preset_lpj; + struct pvclock_vcpu_time_info *src; + src = &per_cpu(hv_clock, 0); + return pvclock_tsc_khz(src); } static void kvm_get_preset_lpj(void) { - struct pvclock_vcpu_time_info *src; unsigned long khz; u64 lpj; - src = &per_cpu(hv_clock, 0); - khz = pvclock_tsc_khz(src); + khz = kvm_get_tsc_khz(); lpj = ((u64)khz * 1000); do_div(lpj, HZ); @@ -194,5 +194,7 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; } } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index eee32b43fee..71f1d99a635 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,8 +12,8 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/vmalloc.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/system.h> #include <asm/ldt.h> #include <asm/desc.h> @@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) if (err < 0) return err; - for(i = 0; i < old->size; i++) + for (i = 0; i < old->size; i++) write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index efc2f361fe8..666e43df51f 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -13,8 +13,7 @@ #include <asm/msr.h> #include <asm/acpi.h> #include <asm/mmconfig.h> - -#include "../pci/pci.h" +#include <asm/pci_x86.h> struct pci_hostbridge_probe { u32 bus; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 45e3b69808b..c5c5b8df1db 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -16,14 +16,14 @@ #include <linux/bitops.h> #include <linux/acpi.h> #include <linux/module.h> +#include <linux/smp.h> +#include <linux/acpi.h> -#include <asm/smp.h> #include <asm/mtrr.h> #include <asm/mpspec.h> #include <asm/pgalloc.h> #include <asm/io_apic.h> #include <asm/proto.h> -#include <asm/acpi.h> #include <asm/bios_ebda.h> #include <asm/e820.h> #include <asm/trampoline.h> @@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m) 
#endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { - set_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) + set_bit(m->mpc_busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { @@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) x86_quirks->mpc_oem_pci_bus(m); clear_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8bd1bf9622a..45a09ccdc21 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -26,11 +26,10 @@ #include <linux/kernel_stat.h> #include <linux/kdebug.h> #include <linux/smp.h> +#include <linux/nmi.h> #include <asm/i8259.h> #include <asm/io_apic.h> -#include <asm/smp.h> -#include <asm/nmi.h> #include <asm/proto.h> #include <asm/timer.h> diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a35eaa379ff..00c2bcd4146 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */ * to trigger bugs with some popular PCI cards, in particular 3ware (but * has been also also seen with Qlogic at least). */ -int iommu_fullflush = 1; +static int iommu_fullflush = 1; /* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index de4a9d643be..2b46eb41643 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -12,6 +12,8 @@ #include <asm/proto.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> +#include <asm/pci_x86.h> +#include <asm/virtext.h> #ifdef CONFIG_X86_32 # include <linux/dmi.h> @@ -23,7 +25,6 @@ #include <mach_ipi.h> - /* * Power off function, if any */ @@ -39,6 +40,12 @@ int reboot_force; static int reboot_cpu = -1; #endif +/* This is set if we need to go through the 'emergency' path. + * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; @@ -368,6 +375,48 @@ static inline void kb_wait(void) } } +static void vmxoff_nmi(int cpu, struct die_args *args) +{ + cpu_emergency_vmxoff(); +} + +/* Use NMIs as IPIs to tell all CPUs to disable virtualization + */ +static void emergency_vmx_disable_all(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* We need to disable VMX on all CPUs before rebooting, otherwise + * we risk hanging up the machine, because the CPU ignore INIT + * signals when VMX is enabled. + * + * We can't take any locks and we may be on an inconsistent + * state, so we use NMIs as IPIs to tell the other CPUs to disable + * VMX and halt. + * + * For safety, we will avoid running the nmi_shootdown_cpus() + * stuff unnecessarily, but we don't have a way to check + * if other CPUs have VMX enabled. So we will call it only if the + * CPU we are running on has VMX enabled. + * + * We will miss cases where VMX is not enabled on all CPUs. 
This + * shouldn't do much harm because KVM always enable VMX on all + * CPUs anyway. But we can miss it on the small window where KVM + * is still enabling VMX. + */ + if (cpu_has_vmx() && cpu_vmx_enabled()) { + /* Disable VMX on this CPU. + */ + cpu_vmxoff(); + + /* Halt and disable VMX on the other CPUs */ + nmi_shootdown_cpus(vmxoff_nmi); + + } +} + + void __attribute__((weak)) mach_reboot_fixups(void) { } @@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void) { int i; + if (reboot_emergency) + emergency_vmx_disable_all(); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -482,13 +534,19 @@ void native_machine_shutdown(void) #endif } +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + static void native_machine_restart(char *__unused) { printk("machine restart\n"); if (!reboot_force) machine_shutdown(); - machine_emergency_restart(); + __machine_emergency_restart(0); } static void native_machine_halt(void) @@ -532,7 +590,7 @@ void machine_shutdown(void) void machine_emergency_restart(void) { - machine_ops.emergency_restart(); + __machine_emergency_restart(1); } void machine_restart(char *cmd) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 6a00e5faaa7..f885023167e 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -582,7 +582,6 @@ static int __init uv_ptc_init(void) static struct bau_control * __init uv_table_bases_init(int blade, int node) { int i; - int *ip; struct bau_msg_status *msp; struct bau_control *bau_tabp; @@ -599,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node) bau_cpubits_clear(&msp->seen_by, (int) uv_blade_nr_possible_cpus(blade)); - bau_tabp->watching = - kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node); - BUG_ON(!bau_tabp->watching); - - for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) - *ip = 0; - uv_bau_table_bases[blade] = bau_tabp; return bau_tabp; @@ -628,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu, bcp->bau_msg_head = bau_tablesp->va_queue_first; bcp->va_queue_first = bau_tablesp->va_queue_first; bcp->va_queue_last = bau_tablesp->va_queue_last; - bcp->watching = bau_tablesp->watching; bcp->msg_statuses = bau_tablesp->msg_statuses; bcp->descriptor_base = adp; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2d1f4c7e405..ce6650eb64e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_no = 8; - /* This is always a kernel trap and never fixable (and thus must - never return). */ + /* + * This is always a kernel trap and never fixable (and thus must + * never return). + */ for (;;) die(str, regs, error_code); } @@ -520,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) } #ifdef CONFIG_X86_64 -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ +/* + * Help handler running on IST stack to switch back to user stack + * for scheduling or signal handling. 
The actual stack switch is done in + * entry.S + */ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; @@ -532,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ + /* + * Exception from kernel and interrupts are enabled. Move to + * kernel process stack. + */ else if (eregs->flags & X86_EFLAGS_IF) regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) @@ -685,12 +691,7 @@ void math_error(void __user *ip) cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - err = swd & ~cwd & 0x3f; - -#ifdef CONFIG_X86_32 - if (!err) - return; -#endif + err = swd & ~cwd; if (err & 0x001) { /* Invalid op */ /* @@ -708,7 +709,11 @@ void math_error(void __user *ip) } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; } else { - info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */ + /* + * If we're using IRQ 13, or supposedly even some trap 16 + * implementations, it's possible we get a spurious trap... + */ + return; /* Spurious trap, no error */ } force_sig_info(SIGFPE, &info, task); } diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 15c3e699918..2b54fe002e9 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf) * Restore the extended state if present. Otherwise, restore the FP/SSE * state. */ -int restore_user_xstate(void __user *buf) +static int restore_user_xstate(void __user *buf) { struct _fpx_sw_bytes fx_sw_user; u64 mask; |