summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/amd_iommu.c666
-rw-r--r--arch/x86/kernel/amd_iommu_init.c19
-rw-r--r--arch/x86/kernel/apic.c8
-rw-r--r--arch/x86/kernel/bios_uv.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h18
-rw-r--r--arch/x86/kernel/cpuid.c6
-rw-r--r--arch/x86/kernel/crash.c18
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/genx2apic_phys.c4
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/io_apic.c20
-rw-r--r--arch/x86/kernel/kvmclock.c10
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c3
-rw-r--r--arch/x86/kernel/mpparse.c10
-rw-r--r--arch/x86/kernel/nmi.c3
-rw-r--r--arch/x86/kernel/pci-gart_64.c2
-rw-r--r--arch/x86/kernel/reboot.c64
-rw-r--r--arch/x86/kernel/tlb_uv.c9
-rw-r--r--arch/x86/kernel/traps.c33
-rw-r--r--arch/x86/kernel/xsave.c2
24 files changed, 757 insertions, 171 deletions
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 2e2da717b35..5113c080f0c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -20,8 +20,12 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
+#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
+#ifdef CONFIG_IOMMU_API
+#include <linux/iommu.h>
+#endif
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
@@ -38,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
static LIST_HEAD(iommu_pd_list);
static DEFINE_SPINLOCK(iommu_pd_list_lock);
+#ifdef CONFIG_IOMMU_API
+static struct iommu_ops amd_iommu_ops;
+#endif
+
/*
* general struct to manage commands send to an IOMMU
*/
@@ -47,6 +55,68 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
+static struct dma_ops_domain *find_protection_domain(u16 devid);
+
+
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+/*
+ * Initialization code for statistics collection
+ */
+
+DECLARE_STATS_COUNTER(compl_wait);
+DECLARE_STATS_COUNTER(cnt_map_single);
+DECLARE_STATS_COUNTER(cnt_unmap_single);
+DECLARE_STATS_COUNTER(cnt_map_sg);
+DECLARE_STATS_COUNTER(cnt_unmap_sg);
+DECLARE_STATS_COUNTER(cnt_alloc_coherent);
+DECLARE_STATS_COUNTER(cnt_free_coherent);
+DECLARE_STATS_COUNTER(cross_page);
+DECLARE_STATS_COUNTER(domain_flush_single);
+DECLARE_STATS_COUNTER(domain_flush_all);
+DECLARE_STATS_COUNTER(alloced_io_mem);
+DECLARE_STATS_COUNTER(total_map_requests);
+
+static struct dentry *stats_dir;
+static struct dentry *de_isolate;
+static struct dentry *de_fflush;
+
+/*
+ * Publish one statistics counter as a read-only (0444) u64 file below
+ * stats_dir and remember the resulting dentry in the counter. Does nothing
+ * as long as stats_dir has not been created.
+ */
+static void amd_iommu_stats_add(struct __iommu_counter *cnt)
+{
+	if (stats_dir != NULL)
+		cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
+					       &cnt->value);
+}
+
+/*
+ * Create the "amd-iommu" debugfs directory and fill it with two read-only
+ * boolean files ("isolation", "fullflush") plus one file per statistics
+ * counter. Nothing else is created when the directory cannot be set up.
+ *
+ * NOTE(review): both flags are handed over through a (u32 *) cast while
+ * amd_iommu_isolate and amd_iommu_unmap_flush are declared bool in
+ * amd_iommu_init.c - confirm the storage width debugfs_create_bool()
+ * expects.
+ */
+static void amd_iommu_stats_init(void)
+{
+	/* counters in the order in which their files get created */
+	struct __iommu_counter *counters[] = {
+		&compl_wait,
+		&cnt_map_single,
+		&cnt_unmap_single,
+		&cnt_map_sg,
+		&cnt_unmap_sg,
+		&cnt_alloc_coherent,
+		&cnt_free_coherent,
+		&cross_page,
+		&domain_flush_single,
+		&domain_flush_all,
+		&alloced_io_mem,
+		&total_map_requests,
+	};
+	unsigned int idx;
+
+	stats_dir = debugfs_create_dir("amd-iommu", NULL);
+	if (!stats_dir)
+		return;
+
+	de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
+					 (u32 *)&amd_iommu_isolate);
+
+	de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
+					(u32 *)&amd_iommu_unmap_flush);
+
+	for (idx = 0; idx < sizeof(counters) / sizeof(counters[0]); idx++)
+		amd_iommu_stats_add(counters[idx]);
+}
+
+#endif
/* returns !0 if the IOMMU is caching non-present entries in its TLB */
static int iommu_has_npcache(struct amd_iommu *iommu)
@@ -189,13 +259,55 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
spin_lock_irqsave(&iommu->lock, flags);
ret = __iommu_queue_command(iommu, cmd);
if (!ret)
- iommu->need_sync = 1;
+ iommu->need_sync = true;
spin_unlock_irqrestore(&iommu->lock, flags);
return ret;
}
/*
+ * This function waits until an IOMMU has completed a completion
+ * wait command
+ */
+static void __iommu_wait_for_completion(struct amd_iommu *iommu)
+{
+	int ready = 0;
+	unsigned status = 0;
+	unsigned long i = 0;
+
+	INC_STATS_COUNTER(compl_wait);
+
+	/* poll the status register at most EXIT_LOOP_COUNT times */
+	while (!ready && (i < EXIT_LOOP_COUNT)) {
+		++i;
+		/* wait for the bit to become one */
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+		ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
+	}
+
+	/* set bit back to zero */
+	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
+	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+	/*
+	 * Decide on the bit that was actually observed, not on the iteration
+	 * count: the bit may show up on the very last permitted read, in
+	 * which case i == EXIT_LOOP_COUNT although the wait succeeded.
+	 */
+	if (unlikely(!ready))
+		panic("AMD IOMMU: Completion wait loop failed\n");
+}
+
+/*
+ * Build a CMD_COMPL_WAIT command with CMD_COMPL_WAIT_INT_MASK set in its
+ * first data word and put it into the command buffer of the given IOMMU.
+ * Returns whatever __iommu_queue_command() returns. The iommu lock is not
+ * taken here.
+ */
+static int __iommu_completion_wait(struct amd_iommu *iommu)
+{
+	struct iommu_cmd wait_cmd;
+	int ret;
+
+	memset(&wait_cmd, 0, sizeof(wait_cmd));
+	wait_cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
+	CMD_SET_TYPE(&wait_cmd, CMD_COMPL_WAIT);
+
+	ret = __iommu_queue_command(iommu, &wait_cmd);
+
+	return ret;
+}
+
+/*
* This function is called whenever we need to ensure that the IOMMU has
* completed execution of all commands we sent. It sends a
* COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
@@ -204,40 +316,22 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
*/
static int iommu_completion_wait(struct amd_iommu *iommu)
{
- int ret = 0, ready = 0;
- unsigned status = 0;
- struct iommu_cmd cmd;
- unsigned long flags, i = 0;
-
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+ int ret = 0;
+ unsigned long flags;
spin_lock_irqsave(&iommu->lock, flags);
if (!iommu->need_sync)
goto out;
- iommu->need_sync = 0;
+ ret = __iommu_completion_wait(iommu);
- ret = __iommu_queue_command(iommu, &cmd);
+ iommu->need_sync = false;
if (ret)
goto out;
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
- }
-
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
- if (unlikely(i == EXIT_LOOP_COUNT))
- panic("AMD IOMMU: Completion wait loop failed\n");
+ __iommu_wait_for_completion(iommu);
out:
spin_unlock_irqrestore(&iommu->lock, flags);
@@ -264,6 +358,21 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
return ret;
}
+/*
+ * Fill *cmd with a CMD_INV_IOMMU_PAGES command for domain domid. The
+ * address is truncated to a page boundary; s requests the size bit and
+ * pde the PDE bit in the low address word. Nothing is queued here.
+ */
+static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+					  u16 domid, int pde, int s)
+{
+	u64 page_addr = address & PAGE_MASK;
+
+	memset(cmd, 0, sizeof(*cmd));
+	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+	cmd->data[1] |= domid;
+	cmd->data[2] = lower_32_bits(page_addr);
+	cmd->data[3] = upper_32_bits(page_addr);
+
+	/* size bit - we flush more than one 4kb page */
+	if (s)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+
+	/* PDE bit - we want to flush everything, not only the PTEs */
+	if (pde)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
/*
* Generic command send function for invalidating TLB entries
*/
@@ -273,16 +382,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
struct iommu_cmd cmd;
int ret;
- memset(&cmd, 0, sizeof(cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
- cmd.data[1] |= domid;
- cmd.data[2] = lower_32_bits(address);
- cmd.data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
- cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
ret = iommu_queue_command(iommu, &cmd);
@@ -321,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
{
u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ INC_STATS_COUNTER(domain_flush_single);
+
iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
}
+/*
+ * Flush the IO/TLB of one protection domain on every IOMMU on
+ * amd_iommu_list. For each IOMMU the invalidation command and a completion
+ * wait are queued and waited for while that IOMMU's lock is held. The
+ * return values of the queueing helpers are not evaluated.
+ */
+static void iommu_flush_domain(u16 domid)
+{
+	struct iommu_cmd inv_cmd;
+	struct amd_iommu *cur;
+	unsigned long irq_flags;
+
+	INC_STATS_COUNTER(domain_flush_all);
+
+	/* one command for the whole address space, size and PDE bit set */
+	__iommu_build_inv_iommu_pages(&inv_cmd,
+				      CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+				      domid, 1, 1);
+
+	list_for_each_entry(cur, &amd_iommu_list, list) {
+		spin_lock_irqsave(&cur->lock, irq_flags);
+
+		__iommu_queue_command(cur, &inv_cmd);
+		__iommu_completion_wait(cur);
+		__iommu_wait_for_completion(cur);
+
+		spin_unlock_irqrestore(&cur->lock, irq_flags);
+	}
+}
+
/****************************************************************************
*
* The functions below are used to create the page table mappings for
@@ -338,10 +464,10 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
* supporting all features of AMD IOMMU page tables like level skipping
* and full 64 bit address spaces.
*/
-static int iommu_map(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long phys_addr,
- int prot)
+static int iommu_map_page(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long phys_addr,
+ int prot)
{
u64 __pte, *pte, *page;
@@ -388,6 +514,28 @@ static int iommu_map(struct protection_domain *dom,
return 0;
}
+/*
+ * Clear the page table entry that maps bus_addr in the given domain.
+ * Walks the three table levels (L2 -> L1 -> L0) and returns silently when
+ * an intermediate entry is not present. No IO/TLB flush is issued here;
+ * that is left to the caller.
+ */
+static void iommu_unmap_page(struct protection_domain *dom,
+			     unsigned long bus_addr)
+{
+	u64 *pte;
+
+	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return;
+
+	/*
+	 * The last level has to be indexed with the L0 bits of the address,
+	 * exactly as amd_iommu_iova_to_phys() does; indexing with the L1
+	 * bits again would clear an unrelated entry.
+	 */
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+
+	*pte = 0;
+}
+
/*
* This function checks if a specific unity mapping entry is needed for
* this specific IOMMU.
@@ -440,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
- ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+ ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
if (ret)
return ret;
/*
@@ -571,6 +719,16 @@ static u16 domain_id_alloc(void)
return id;
}
+/*
+ * Mark a protection domain id as unused again in
+ * amd_iommu_pd_alloc_bitmap. Ids that are not strictly between 0 and
+ * MAX_DOMAIN_ID are ignored. amd_iommu_devtable_lock is held for writing
+ * around the bitmap update.
+ */
+static void domain_id_free(int id)
+{
+	unsigned long irq_flags;
+	bool in_range = (id > 0) && (id < MAX_DOMAIN_ID);
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, irq_flags);
+
+	if (in_range)
+		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
+
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, irq_flags);
+}
+
/*
* Used to reserve address ranges in the aperture (e.g. for exclusion
* ranges.
@@ -587,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
iommu_area_reserve(dom->bitmap, start_page, pages);
}
-static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+static void free_pagetable(struct protection_domain *domain)
{
int i, j;
u64 *p1, *p2, *p3;
- p1 = dma_dom->domain.pt_root;
+ p1 = domain->pt_root;
if (!p1)
return;
@@ -613,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
}
free_page((unsigned long)p1);
+
+ domain->pt_root = NULL;
}
/*
@@ -624,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
if (!dom)
return;
- dma_ops_free_pagetable(dom);
+ free_pagetable(&dom->domain);
kfree(dom->pte_pages);
@@ -663,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
goto free_dma_dom;
dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ dma_dom->domain.flags = PD_DMA_OPS_MASK;
dma_dom->domain.priv = dma_dom;
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
@@ -725,6 +886,15 @@ free_dma_dom:
}
/*
+ * little helper function to check whether a given protection domain is a
+ * dma_ops domain
+ */
+static bool dma_ops_domain(struct protection_domain *domain)
+{
+	/* a domain counts as dma_ops domain when PD_DMA_OPS_MASK is set */
+	return (domain->flags & PD_DMA_OPS_MASK) != 0;
+}
+
+/*
* Find out the protection domain structure for a given PCI device. This
* will give us the pointer to the page table root for example.
*/
@@ -744,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid)
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
*/
-static void set_device_domain(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
+static void attach_device(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
{
unsigned long flags;
-
u64 pte_root = virt_to_phys(domain->pt_root);
+ domain->dev_cnt += 1;
+
pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
@@ -767,6 +938,116 @@ static void set_device_domain(struct amd_iommu *iommu,
iommu_queue_inv_dev_entry(iommu, devid);
}
+/*
+ * Removes a device from a protection domain. This variant does not take
+ * amd_iommu_devtable_lock itself: detach_device() and cleanup_domain()
+ * call it with that lock held for writing. The domain's own lock is taken
+ * here. There is no check that devid is within the tables or that it is
+ * currently attached to this domain, and no command is sent to the IOMMU.
+ */
+static void __detach_device(struct protection_domain *domain, u16 devid)
+{
+
+ /* lock domain (plain spin_lock, the interrupt state is not touched) */
+ spin_lock(&domain->lock);
+
+ /* remove domain from the lookup table */
+ amd_iommu_pd_table[devid] = NULL;
+
+ /*
+ * remove entry from the device table seen by the hardware: only
+ * IOMMU_PTE_P and IOMMU_PTE_TV stay set in the first word, the other
+ * two words written here become zero
+ */
+ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
+ amd_iommu_dev_table[devid].data[1] = 0;
+ amd_iommu_dev_table[devid].data[2] = 0;
+
+ /* decrease reference counter (incremented in attach_device) */
+ domain->dev_cnt -= 1;
+
+ /* ready */
+ spin_unlock(&domain->lock);
+}
+
+/*
+ * Removes a device from a protection domain. Takes
+ * amd_iommu_devtable_lock for writing around the actual removal, which is
+ * done by __detach_device().
+ */
+static void detach_device(struct protection_domain *domain, u16 devid)
+{
+	unsigned long irq_flags;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, irq_flags);
+	__detach_device(domain, devid);
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, irq_flags);
+}
+
+static int device_change_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{ /*
+ * Bus notifier callback (hooked up through device_nb). Handles
+ * BUS_NOTIFY_ADD_DEVICE, BUS_NOTIFY_BOUND_DRIVER and
+ * BUS_NOTIFY_UNBIND_DRIVER; every other action is ignored. nb is not
+ * used, data is taken as the struct device of a PCI device. Always
+ * returns 0.
+ */
+ struct device *dev = data;
+ struct pci_dev *pdev = to_pci_dev(dev);
+ u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
+ struct protection_domain *domain;
+ struct dma_ops_domain *dma_domain;
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ unsigned long flags;
+
+ if (devid > amd_iommu_last_bdf) /* beyond the largest known device id */
+ goto out;
+
+ devid = amd_iommu_alias_table[devid]; /* continue with the alias id */
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (iommu == NULL) /* no IOMMU recorded for this device id */
+ goto out;
+
+ domain = domain_for_device(devid);
+
+ if (domain && !dma_ops_domain(domain)) /* warn only, handling continues */
+ WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
+ "to a non-dma-ops domain\n", dev_name(dev));
+
+ switch (action) {
+ case BUS_NOTIFY_BOUND_DRIVER:
+ if (domain) /* already attached to a domain: nothing to do */
+ goto out;
+ dma_domain = find_protection_domain(devid);
+ if (!dma_domain) /* none on iommu_pd_list: use the IOMMU default */
+ dma_domain = iommu->default_dom;
+ attach_device(iommu, &dma_domain->domain, devid);
+ printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+ "device %s\n", dma_domain->domain.id, dev_name(dev));
+ break;
+ case BUS_NOTIFY_UNBIND_DRIVER:
+ if (!domain) /* not attached: nothing to detach */
+ goto out;
+ detach_device(domain, devid);
+ break;
+ case BUS_NOTIFY_ADD_DEVICE:
+ /* allocate a protection domain if a device is added */
+ dma_domain = find_protection_domain(devid);
+ if (dma_domain) /* one is already queued for this device id */
+ goto out;
+ dma_domain = dma_ops_domain_alloc(iommu, order);
+ if (!dma_domain) /* allocation failure is silently ignored */
+ goto out;
+ dma_domain->target_dev = devid;
+
+ spin_lock_irqsave(&iommu_pd_list_lock, flags);
+ list_add_tail(&dma_domain->list, &iommu_pd_list);
+ spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+ break;
+ default:
+ goto out;
+ }
+
+ iommu_queue_inv_dev_entry(iommu, devid); /* only reached via break above */
+ iommu_completion_wait(iommu);
+
+out:
+ return 0;
+}
+
+struct notifier_block device_nb = { /*
+ * notifier block that amd_iommu_init_dma_ops() registers on
+ * pci_bus_type; all events go to device_change_notifier()
+ */
+ .notifier_call = device_change_notifier,
+};
+
/*****************************************************************************
*
* The next functions belong to the dma_ops mapping/unmapping code.
@@ -802,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid)
list_for_each_entry(entry, &iommu_pd_list, list) {
if (entry->target_dev == devid) {
ret = entry;
- list_del(&ret->list);
break;
}
}
@@ -853,14 +1133,13 @@ static int get_device_resources(struct device *dev,
if (!dma_dom)
dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
- set_device_domain(*iommu, *domain, *bdf);
+ attach_device(*iommu, *domain, *bdf);
printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device ", (*domain)->id);
- print_devid(_bdf, 1);
+ "device %s\n", (*domain)->id, dev_name(dev));
}
if (domain_for_device(_bdf) == NULL)
- set_device_domain(*iommu, *domain, _bdf);
+ attach_device(*iommu, *domain, _bdf);
return 1;
}
@@ -946,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev,
pages = iommu_num_pages(paddr, size, PAGE_SIZE);
paddr &= PAGE_MASK;
+ INC_STATS_COUNTER(total_map_requests);
+
+ if (pages > 1)
+ INC_STATS_COUNTER(cross_page);
+
if (align)
align_mask = (1UL << get_order(size)) - 1;
@@ -962,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev,
}
address += offset;
+ ADD_STATS_COUNTER(alloced_io_mem, size);
+
if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
iommu_flush_tlb(iommu, dma_dom->domain.id);
dma_dom->need_flush = false;
@@ -998,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu,
start += PAGE_SIZE;
}
+ SUB_STATS_COUNTER(alloced_io_mem, size);
+
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
@@ -1019,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
dma_addr_t addr;
u64 dma_mask;
+ INC_STATS_COUNTER(cnt_map_single);
+
if (!check_device(dev))
return bad_dma_address;
@@ -1030,6 +1320,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
/* device not handled by any AMD IOMMU */
return (dma_addr_t)paddr;
+ if (!dma_ops_domain(domain))
+ return bad_dma_address;
+
spin_lock_irqsave(&domain->lock, flags);
addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
dma_mask);
@@ -1055,11 +1348,16 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
struct protection_domain *domain;
u16 devid;
+ INC_STATS_COUNTER(cnt_unmap_single);
+
if (!check_device(dev) ||
!get_device_resources(dev, &iommu, &domain, &devid))
/* device not handled by any AMD IOMMU */
return;
+ if (!dma_ops_domain(domain))
+ return;
+
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, dir);
@@ -1104,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
int mapped_elems = 0;
u64 dma_mask;
+ INC_STATS_COUNTER(cnt_map_sg);
+
if (!check_device(dev))
return 0;
@@ -1114,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
if (!iommu || !domain)
return map_sg_no_iommu(dev, sglist, nelems, dir);
+ if (!dma_ops_domain(domain))
+ return 0;
+
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
@@ -1163,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
u16 devid;
int i;
+ INC_STATS_COUNTER(cnt_unmap_sg);
+
if (!check_device(dev) ||
!get_device_resources(dev, &iommu, &domain, &devid))
return;
+ if (!dma_ops_domain(domain))
+ return;
+
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
@@ -1194,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size,
phys_addr_t paddr;
u64 dma_mask = dev->coherent_dma_mask;
+ INC_STATS_COUNTER(cnt_alloc_coherent);
+
if (!check_device(dev))
return NULL;
@@ -1212,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size,
return virt_addr;
}
+ if (!dma_ops_domain(domain))
+ goto out_free;
+
if (!dma_mask)
dma_mask = *dev->dma_mask;
@@ -1220,18 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size,
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address) {
- free_pages((unsigned long)virt_addr, get_order(size));
- virt_addr = NULL;
- goto out;
- }
+ if (*dma_addr == bad_dma_address)
+ goto out_free;
iommu_completion_wait(iommu);
-out:
spin_unlock_irqrestore(&domain->lock, flags);
return virt_addr;
+
+out_free:
+
+ free_pages((unsigned long)virt_addr, get_order(size));
+
+ return NULL;
}
/*
@@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size,
struct protection_domain *domain;
u16 devid;
+ INC_STATS_COUNTER(cnt_free_coherent);
+
if (!check_device(dev))
return;
@@ -1253,6 +1570,9 @@ static void free_coherent(struct device *dev, size_t size,
if (!iommu || !domain)
goto free_mem;
+ if (!dma_ops_domain(domain))
+ goto free_mem;
+
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
@@ -1296,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
* we don't need to preallocate the protection domains anymore.
* For now we have to.
*/
-void prealloc_protection_domains(void)
+static void prealloc_protection_domains(void)
{
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
@@ -1305,7 +1625,7 @@ void prealloc_protection_domains(void)
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- devid = (dev->bus->number << 8) | dev->devfn;
+ devid = calc_devid(dev->bus->number, dev->devfn);
if (devid > amd_iommu_last_bdf)
continue;
devid = amd_iommu_alias_table[devid];
@@ -1352,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
iommu->default_dom = dma_ops_domain_alloc(iommu, order);
if (iommu->default_dom == NULL)
return -ENOMEM;
+ iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
ret = iommu_init_unity_mappings(iommu);
if (ret)
goto free_domains;
@@ -1375,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void)
/* Make the driver finally visible to the drivers */
dma_ops = &amd_iommu_dma_ops;
+ register_iommu(&amd_iommu_ops);
+
+ bus_register_notifier(&pci_bus_type, &device_nb);
+
+ amd_iommu_stats_init();
+
return 0;
free_domains:
@@ -1386,3 +1713,224 @@ free_domains:
return ret;
}
+
+/*****************************************************************************
+ *
+ * The following functions belong to the exported interface of AMD IOMMU
+ *
+ * This interface allows access to lower level functions of the IOMMU
+ * like protection domain handling and assignment of devices to domains
+ * which is not possible with the dma_ops interface.
+ *
+ *****************************************************************************/
+
+/*
+ * Detach every device that is currently attached to the given domain.
+ * All device ids from 0 up to and including amd_iommu_last_bdf are checked
+ * against amd_iommu_pd_table while amd_iommu_devtable_lock is held for
+ * writing.
+ */
+static void cleanup_domain(struct protection_domain *domain)
+{
+	unsigned long flags;
+	/*
+	 * Wider than u16 on purpose: amd_iommu_last_bdf is a u16, so with a
+	 * u16 counter "devid <= amd_iommu_last_bdf" could never become false
+	 * for a last_bdf of 0xffff and the loop would spin forever with the
+	 * lock held.
+	 */
+	unsigned int devid;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
+		if (amd_iommu_pd_table[devid] == domain)
+			__detach_device(domain, devid);
+
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+/*
+ * iommu_ops callback: allocate and initialize a protection domain (3 level
+ * page table, fresh domain id, zeroed page table root) and store it in
+ * dom->priv. Returns 0 on success and -ENOMEM when any of the allocations
+ * fails; in that case everything obtained so far is released again.
+ */
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+	struct protection_domain *domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return -ENOMEM;
+
+	spin_lock_init(&domain->lock);
+	domain->mode = PAGE_MODE_3_LEVEL;
+	domain->id = domain_id_alloc();
+	if (!domain->id)
+		goto out_free;
+	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!domain->pt_root)
+		goto out_free_id;
+
+	dom->priv = domain;
+
+	return 0;
+
+out_free_id:
+	/* give the id back, otherwise it stays allocated forever */
+	domain_id_free(domain->id);
+out_free:
+	kfree(domain);
+
+	return -ENOMEM;
+}
+
+/*
+ * iommu_ops callback: tear down the protection domain stored in dom->priv.
+ * Devices still attached are detached first, then the page table, the
+ * domain id and the domain structure are released and dom->priv is reset.
+ * A domain without private data is left alone.
+ */
+static void amd_iommu_domain_destroy(struct iommu_domain *dom)
+{
+	struct protection_domain *pdom = dom->priv;
+
+	if (pdom == NULL)
+		return;
+
+	if (pdom->dev_cnt > 0)
+		cleanup_domain(pdom);
+
+	/* every device must be gone before the page table is freed */
+	BUG_ON(pdom->dev_cnt != 0);
+
+	free_pagetable(pdom);
+	domain_id_free(pdom->id);
+	kfree(pdom);
+
+	dom->priv = NULL;
+}
+
+/*
+ * iommu_ops callback: detach a PCI device from the given domain, then
+ * queue a device table entry invalidation on the IOMMU recorded for the
+ * device and wait for its completion. Non-PCI devices and device ids
+ * beyond amd_iommu_last_bdf are ignored.
+ */
+static void amd_iommu_detach_device(struct iommu_domain *dom,
+				    struct device *dev)
+{
+	struct protection_domain *domain = dom->priv;
+	struct amd_iommu *iommu;
+	struct pci_dev *pdev;
+	u16 devid;
+
+	if (dev->bus != &pci_bus_type)
+		return;
+
+	pdev = to_pci_dev(dev);
+
+	devid = calc_devid(pdev->bus->number, pdev->devfn);
+
+	/*
+	 * The driver tables are indexed by device id. The other users in
+	 * this file only index them after checking the id against
+	 * amd_iommu_last_bdf, so do the same before touching them here.
+	 */
+	if (devid > amd_iommu_last_bdf)
+		return;
+
+	if (devid > 0)
+		detach_device(domain, devid);
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return;
+
+	iommu_queue_inv_dev_entry(iommu, devid);
+	iommu_completion_wait(iommu);
+}
+
+/*
+ * iommu_ops callback: attach a PCI device to the given domain.
+ *
+ * Returns 0 on success, -EINVAL for non-PCI devices, for device ids beyond
+ * amd_iommu_last_bdf, for ids that have a different alias or for which no
+ * IOMMU is recorded, and -EBUSY when the device already belongs to a
+ * domain.
+ */
+static int amd_iommu_attach_device(struct iommu_domain *dom,
+				   struct device *dev)
+{
+	struct protection_domain *domain = dom->priv;
+	struct protection_domain *old_domain;
+	struct amd_iommu *iommu;
+	struct pci_dev *pdev;
+	u16 devid;
+
+	if (dev->bus != &pci_bus_type)
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+
+	devid = calc_devid(pdev->bus->number, pdev->devfn);
+
+	/*
+	 * amd_iommu_last_bdf is itself a valid id (the largest one), so only
+	 * ids above it are out of range - same check as the rest of the file.
+	 */
+	if (devid > amd_iommu_last_bdf ||
+	    devid != amd_iommu_alias_table[devid])
+		return -EINVAL;
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return -EINVAL;
+
+	old_domain = domain_for_device(devid);
+	if (old_domain)
+		return -EBUSY;
+
+	attach_device(iommu, domain, devid);
+
+	iommu_completion_wait(iommu);
+
+	return 0;
+}
+
+/*
+ * iommu_ops callback: map the physical range starting at paddr to iova in
+ * the domain's page table, one PAGE_SIZE page at a time. IOMMU_READ and
+ * IOMMU_WRITE are translated to IOMMU_PROT_IR and IOMMU_PROT_IW. Returns 0
+ * on success or the first error from iommu_map_page(); pages mapped before
+ * a failure are left in place.
+ */
+static int amd_iommu_map_range(struct iommu_domain *dom,
+			       unsigned long iova, phys_addr_t paddr,
+			       size_t size, int iommu_prot)
+{
+	struct protection_domain *pdom = dom->priv;
+	unsigned long remaining = iommu_num_pages(paddr, size, PAGE_SIZE);
+	int prot = 0;
+	int err;
+
+	prot |= (iommu_prot & IOMMU_READ) ? IOMMU_PROT_IR : 0;
+	prot |= (iommu_prot & IOMMU_WRITE) ? IOMMU_PROT_IW : 0;
+
+	/* the page count was taken from the unaligned address above */
+	iova &= PAGE_MASK;
+	paddr &= PAGE_MASK;
+
+	for (; remaining > 0; --remaining) {
+		err = iommu_map_page(pdom, iova, paddr, prot);
+		if (err)
+			return err;
+
+		iova += PAGE_SIZE;
+		paddr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/*
+ * iommu_ops callback: remove the mappings of every page touched by the
+ * range [iova, iova + size) from the domain's page table and flush the
+ * domain on all IOMMUs afterwards.
+ */
+static void amd_iommu_unmap_range(struct iommu_domain *dom,
+				  unsigned long iova, size_t size)
+{
+	struct protection_domain *pdom = dom->priv;
+	unsigned long remaining = iommu_num_pages(iova, size, PAGE_SIZE);
+
+	/* the page count was taken from the unaligned address above */
+	iova &= PAGE_MASK;
+
+	for (; remaining > 0; --remaining) {
+		iommu_unmap_page(pdom, iova);
+		iova += PAGE_SIZE;
+	}
+
+	iommu_flush_domain(pdom->id);
+}
+
+/*
+ * iommu_ops callback: translate an IO virtual address by walking the
+ * domain's page table (L2, L1, L0). Returns the masked value of the final
+ * entry combined with the in-page offset of iova, or 0 when an entry on
+ * the way is not present.
+ */
+static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
+					  unsigned long iova)
+{
+	struct protection_domain *pdom = dom->priv;
+	phys_addr_t phys;
+	u64 *entry;
+
+	entry = &pdom->pt_root[IOMMU_PTE_L2_INDEX(iova)];
+	if (!IOMMU_PTE_PRESENT(*entry))
+		return 0;
+
+	entry = IOMMU_PTE_PAGE(*entry);
+	entry = &entry[IOMMU_PTE_L1_INDEX(iova)];
+	if (!IOMMU_PTE_PRESENT(*entry))
+		return 0;
+
+	entry = IOMMU_PTE_PAGE(*entry);
+	entry = &entry[IOMMU_PTE_L0_INDEX(iova)];
+	if (!IOMMU_PTE_PRESENT(*entry))
+		return 0;
+
+	phys = *entry & IOMMU_PAGE_MASK;
+	phys |= iova & ~PAGE_MASK;
+
+	return phys;
+}
+
+static struct iommu_ops amd_iommu_ops = { /*
+ * callback table that amd_iommu_init_dma_ops() passes to
+ * register_iommu(); each member points to the function of this file
+ * implementing that operation
+ */
+ .domain_init = amd_iommu_domain_init,
+ .domain_destroy = amd_iommu_domain_destroy,
+ .attach_dev = amd_iommu_attach_device,
+ .detach_dev = amd_iommu_detach_device,
+ .map = amd_iommu_map_range,
+ .unmap = amd_iommu_unmap_range,
+ .iova_to_phys = amd_iommu_iova_to_phys,
+};
+
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c625800c55c..42c33cebf00 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -122,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
-int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */
+bool amd_iommu_isolate = true; /* if true, device isolation is
+ enabled */
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -243,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
}
/* Function to enable the hardware */
-void __init iommu_enable(struct amd_iommu *iommu)
+static void __init iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
- "at %02x:%02x.%x cap 0x%hx\n",
- iommu->dev->bus->number,
- PCI_SLOT(iommu->dev->devfn),
- PCI_FUNC(iommu->dev->devfn),
- iommu->cap_ptr);
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+ dev_name(&iommu->dev->dev), iommu->cap_ptr);
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
/* Function to enable IOMMU event logging and event interrupts */
-void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
{
iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
@@ -1218,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
if (strncmp(str, "isolate", 7) == 0)
- amd_iommu_isolate = 1;
+ amd_iommu_isolate = true;
if (strncmp(str, "share", 5) == 0)
- amd_iommu_isolate = 0;
+ amd_iommu_isolate = false;
if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
}
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 99589245fd8..b13d3c4dbd4 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -98,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer);
#ifdef HAVE_X2APIC
int x2apic;
/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-int disable_x2apic;
+static int x2apic_preenabled;
+static int disable_x2apic;
static __init int setup_nox2apic(char *str)
{
disable_x2apic = 1;
@@ -226,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id)
apic_write(APIC_ICR, low);
}
-u64 xapic_icr_read(void)
+static u64 xapic_icr_read(void)
{
u32 icr1, icr2;
@@ -266,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id)
wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
}
-u64 x2apic_icr_read(void)
+static u64 x2apic_icr_read(void)
{
unsigned long val;
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 2a0a2a3cac2..f63882728d9 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,7 +25,7 @@
#include <asm/uv/bios.h>
#include <asm/uv/uv_hub.h>
-struct uv_systab uv_systab;
+static struct uv_systab uv_systab;
s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01ee..b59ddcc88cd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
#include <asm/pat.h>
#include "mtrr.h"
-struct mtrr_state {
- struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
- mtrr_type fixed_ranges[NUM_FIXED_RANGES];
- unsigned char enabled;
- unsigned char have_fixed;
- mtrr_type def_type;
-};
-
struct fixed_range_block {
int base_msr; /* start address of an MTRR block */
int ranges; /* number of MTRRs in this block */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
};
static unsigned long smp_changes_mask;
-static struct mtrr_state mtrr_state = {};
static int mtrr_state_set;
u64 mtrr_tom2;
+struct mtrr_state_type mtrr_state = {};
+EXPORT_SYMBOL_GPL(mtrr_state);
+
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "mtrr."
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 1159e269e59..d259e5d2e05 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
u32 num_var_ranges = 0;
-unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
unsigned long lsize;
};
-static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
+static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
{
@@ -824,16 +824,14 @@ static int enable_mtrr_cleanup __initdata =
static int __init disable_mtrr_cleanup_setup(char *str)
{
- if (enable_mtrr_cleanup != -1)
- enable_mtrr_cleanup = 0;
+ enable_mtrr_cleanup = 0;
return 0;
}
early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
static int __init enable_mtrr_cleanup_setup(char *str)
{
- if (enable_mtrr_cleanup != -1)
- enable_mtrr_cleanup = 1;
+ enable_mtrr_cleanup = 1;
return 0;
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b2..ffd60409cc6 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
#define MTRRcap_MSR 0x0fe
#define MTRRdefType_MSR 0x2ff
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-
-#define NUM_FIXED_RANGES 88
-#define MAX_VAR_RANGES 256
#define MTRRfix64K_00000_MSR 0x250
#define MTRRfix16K_80000_MSR 0x258
#define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
- an 8 bit field: */
-typedef u8 mtrr_type;
-
-extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
u32 vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
u32 ccr3;
};
-struct mtrr_var_range {
- u32 base_lo;
- u32 base_hi;
- u32 mask_lo;
- u32 mask_hi;
-};
-
void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 62a3c23bd70..2ac1f0c2beb 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -39,10 +39,10 @@
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
static struct class *cpuid_class;
@@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
}
static ssize_t cpuid_read(struct file *file, char __user *buf,
- size_t count, loff_t * ppos)
+ size_t count, loff_t *ppos)
{
char __user *tmp = buf;
struct cpuid_regs cmd;
@@ -117,7 +117,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
unsigned int cpu;
struct cpuinfo_x86 *c;
int ret = 0;
-
+
lock_kernel();
cpu = iminor(file->f_path.dentry->d_inode);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd..c689d19e35a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
#include <linux/kdebug.h>
#include <asm/smp.h>
#include <asm/reboot.h>
+#include <asm/virtext.h>
#include <mach_ipi.h>
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
#endif
crash_save_cpu(regs, cpu);
+ /* Disable VMX or SVM if needed.
+ *
+ * We need to disable virtualization on all CPUs.
+ * Having VMX or SVM enabled on any CPU may break rebooting
+ * after the kdump kernel has finished its task.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
disable_local_APIC();
}
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
local_irq_disable();
kdump_nmi_shootdown_cpus();
+
+ /* Booting kdump kernel with VMX or SVM enabled won't work,
+ * because (among other limitations) we can't disable paging
+ * with the virt flags.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
lapic_shutdown();
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 23b138e31e9..504ad198e4a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -886,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...)
va_list ap;
va_start(ap, fmt);
- n = vscnprintf(buf, 512, fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
early_console->write(early_console, buf, n);
va_end(ap);
}
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index 62895cf315f..21bcc0e098b 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -161,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb)
return current_cpu_data.initial_apicid >> index_msb;
}
-void x2apic_send_IPI_self(int vector)
+static void x2apic_send_IPI_self(int vector)
{
apic_write(APIC_SELF_IPI, vector);
}
-void init_x2apic_ldr(void)
+static void init_x2apic_ldr(void)
{
return;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 388e05a5fc1..b9a4d8c4b93 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,7 +27,7 @@
#include <asm/trampoline.h>
/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
+static struct x8664_pda _boot_cpu_pda;
#ifdef CONFIG_SMP
/*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index d39918076bb..df3bf269bea 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,7 +10,6 @@
#include <asm/pgtable.h>
#include <asm/desc.h>
-static struct fs_struct init_fs = INIT_FS;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index a25c3f76b8a..3639442aa7a 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -170,7 +170,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
[15] = { .vector = IRQ15_VECTOR, },
};
-void __init arch_early_irq_init(void)
+int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
struct irq_desc *desc;
@@ -188,6 +188,8 @@ void __init arch_early_irq_init(void)
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
+
+ return 0;
}
#ifdef CONFIG_SPARSE_IRQ
@@ -230,7 +232,7 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
return cfg;
}
-void arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
{
struct irq_cfg *cfg;
@@ -242,6 +244,8 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu)
BUG_ON(1);
}
}
+
+ return 0;
}
#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
@@ -702,7 +706,7 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
}
#ifdef CONFIG_X86_64
-void io_apic_sync(struct irq_pin_list *entry)
+static void io_apic_sync(struct irq_pin_list *entry)
{
/*
* Synchronize the IO-APIC and the CPU by doing
@@ -1400,8 +1404,6 @@ void __setup_vector_irq(int cpu)
/* Mark the inuse vectors */
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
cfg = desc->chip_data;
if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
@@ -1783,8 +1785,6 @@ __apicdebuginit(void) print_IO_APIC(void)
for_each_irq_desc(irq, desc) {
struct irq_pin_list *entry;
- if (!desc)
- continue;
cfg = desc->chip_data;
entry = cfg->irq_2_pin;
if (!entry)
@@ -2425,9 +2425,6 @@ static void ir_irq_migration(struct work_struct *work)
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
if (desc->status & IRQ_MOVE_PENDING) {
unsigned long flags;
@@ -2713,9 +2710,6 @@ static inline void init_IO_APIC_traps(void)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
cfg = desc->chip_data;
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e169ae9b6a6..652fce6d2cc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- return preset_lpj;
+ struct pvclock_vcpu_time_info *src;
+ src = &per_cpu(hv_clock, 0);
+ return pvclock_tsc_khz(src);
}
static void kvm_get_preset_lpj(void)
{
- struct pvclock_vcpu_time_info *src;
unsigned long khz;
u64 lpj;
- src = &per_cpu(hv_clock, 0);
- khz = pvclock_tsc_khz(src);
+ khz = kvm_get_tsc_khz();
lpj = ((u64)khz * 1000);
do_div(lpj, HZ);
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register(&kvm_clock);
+ pv_info.paravirt_enabled = 1;
+ pv_info.name = "KVM";
}
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index eee32b43fee..71f1d99a635 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,8 +12,8 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
@@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
if (err < 0)
return err;
- for(i = 0; i < old->size; i++)
+ for (i = 0; i < old->size; i++)
write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
return 0;
}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index efc2f361fe8..666e43df51f 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -13,8 +13,7 @@
#include <asm/msr.h>
#include <asm/acpi.h>
#include <asm/mmconfig.h>
-
-#include "../pci/pci.h"
+#include <asm/pci_x86.h>
struct pci_hostbridge_probe {
u32 bus;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 45e3b69808b..c5c5b8df1db 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -16,14 +16,14 @@
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/acpi.h>
-#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
-#include <asm/acpi.h>
#include <asm/bios_ebda.h>
#include <asm/e820.h>
#include <asm/trampoline.h>
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
#endif
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
- set_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+ set_bit(m->mpc_busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
x86_quirks->mpc_oem_pci_bus(m);
clear_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 8bd1bf9622a..45a09ccdc21 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -26,11 +26,10 @@
#include <linux/kernel_stat.h>
#include <linux/kdebug.h>
#include <linux/smp.h>
+#include <linux/nmi.h>
#include <asm/i8259.h>
#include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/nmi.h>
#include <asm/proto.h>
#include <asm/timer.h>
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a35eaa379ff..00c2bcd4146 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */
* to trigger bugs with some popular PCI cards, in particular 3ware (but
* has also been seen with Qlogic at least).
*/
-int iommu_fullflush = 1;
+static int iommu_fullflush = 1;
/* Allocation bitmap for the remapping area: */
static DEFINE_SPINLOCK(iommu_bitmap_lock);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index de4a9d643be..2b46eb41643 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,8 @@
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
+#include <asm/pci_x86.h>
+#include <asm/virtext.h>
#ifdef CONFIG_X86_32
# include <linux/dmi.h>
@@ -23,7 +25,6 @@
#include <mach_ipi.h>
-
/*
* Power off function, if any
*/
@@ -39,6 +40,12 @@ int reboot_force;
static int reboot_cpu = -1;
#endif
+/* This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be in
+ * an inconsistent state and won't be able to clean up properly
+ */
+static int reboot_emergency;
+
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
@@ -368,6 +375,48 @@ static inline void kb_wait(void)
}
}
+static void vmxoff_nmi(int cpu, struct die_args *args)
+{
+ cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+ */
+static void emergency_vmx_disable_all(void)
+{
+ /* Just make sure we won't change CPUs while doing this */
+ local_irq_disable();
+
+ /* We need to disable VMX on all CPUs before rebooting, otherwise
+ * we risk hanging up the machine, because the CPU ignores INIT
+ * signals when VMX is enabled.
+ *
+ * We can't take any locks and we may be in an inconsistent
+ * state, so we use NMIs as IPIs to tell the other CPUs to disable
+ * VMX and halt.
+ *
+ * For safety, we will avoid running the nmi_shootdown_cpus()
+ * stuff unnecessarily, but we don't have a way to check
+ * if other CPUs have VMX enabled. So we will call it only if the
+ * CPU we are running on has VMX enabled.
+ *
+ * We will miss cases where VMX is not enabled on all CPUs. This
+ * shouldn't do much harm because KVM always enables VMX on all
+ * CPUs anyway. But we can miss it in the small window where KVM
+ * is still enabling VMX.
+ */
+ if (cpu_has_vmx() && cpu_vmx_enabled()) {
+ /* Disable VMX on this CPU.
+ */
+ cpu_vmxoff();
+
+ /* Halt and disable VMX on the other CPUs */
+ nmi_shootdown_cpus(vmxoff_nmi);
+
+ }
+}
+
+
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void)
{
int i;
+ if (reboot_emergency)
+ emergency_vmx_disable_all();
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -482,13 +534,19 @@ void native_machine_shutdown(void)
#endif
}
+static void __machine_emergency_restart(int emergency)
+{
+ reboot_emergency = emergency;
+ machine_ops.emergency_restart();
+}
+
static void native_machine_restart(char *__unused)
{
printk("machine restart\n");
if (!reboot_force)
machine_shutdown();
- machine_emergency_restart();
+ __machine_emergency_restart(0);
}
static void native_machine_halt(void)
@@ -532,7 +590,7 @@ void machine_shutdown(void)
void machine_emergency_restart(void)
{
- machine_ops.emergency_restart();
+ __machine_emergency_restart(1);
}
void machine_restart(char *cmd)
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6a00e5faaa7..f885023167e 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -582,7 +582,6 @@ static int __init uv_ptc_init(void)
static struct bau_control * __init uv_table_bases_init(int blade, int node)
{
int i;
- int *ip;
struct bau_msg_status *msp;
struct bau_control *bau_tabp;
@@ -599,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
bau_cpubits_clear(&msp->seen_by, (int)
uv_blade_nr_possible_cpus(blade));
- bau_tabp->watching =
- kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
- BUG_ON(!bau_tabp->watching);
-
- for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
- *ip = 0;
-
uv_bau_table_bases[blade] = bau_tabp;
return bau_tabp;
@@ -628,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu,
bcp->bau_msg_head = bau_tablesp->va_queue_first;
bcp->va_queue_first = bau_tablesp->va_queue_first;
bcp->va_queue_last = bau_tablesp->va_queue_last;
- bcp->watching = bau_tablesp->watching;
bcp->msg_statuses = bau_tablesp->msg_statuses;
bcp->descriptor_base = adp;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2d1f4c7e405..ce6650eb64e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 8;
- /* This is always a kernel trap and never fixable (and thus must
- never return). */
+ /*
+ * This is always a kernel trap and never fixable (and thus must
+ * never return).
+ */
for (;;)
die(str, regs, error_code);
}
@@ -520,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
}
#ifdef CONFIG_X86_64
-/* Help handler running on IST stack to switch back to user stack
- for scheduling or signal handling. The actual stack switch is done in
- entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack
+ * for scheduling or signal handling. The actual stack switch is done in
+ * entry.S
+ */
asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
@@ -532,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
/* Exception from user space */
else if (user_mode(eregs))
regs = task_pt_regs(current);
- /* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ /*
+ * Exception from kernel and interrupts are enabled. Move to
+ * kernel process stack.
+ */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -685,12 +691,7 @@ void math_error(void __user *ip)
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
- err = swd & ~cwd & 0x3f;
-
-#ifdef CONFIG_X86_32
- if (!err)
- return;
-#endif
+ err = swd & ~cwd;
if (err & 0x001) { /* Invalid op */
/*
@@ -708,7 +709,11 @@ void math_error(void __user *ip)
} else if (err & 0x020) { /* Precision */
info.si_code = FPE_FLTRES;
} else {
- info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
+ /*
+ * If we're using IRQ 13, or supposedly even some trap 16
+ * implementations, it's possible we get a spurious trap...
+ */
+ return; /* Spurious trap, no error */
}
force_sig_info(SIGFPE, &info, task);
}
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 15c3e699918..2b54fe002e9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf)
* Restore the extended state if present. Otherwise, restore the FP/SSE
* state.
*/
-int restore_user_xstate(void __user *buf)
+static int restore_user_xstate(void __user *buf)
{
struct _fpx_sw_bytes fx_sw_user;
u64 mask;