summaryrefslogtreecommitdiffstats
path: root/drivers/iommu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/iommu')
-rw-r--r--drivers/iommu/Kconfig13
-rw-r--r--drivers/iommu/Makefile1
-rw-r--r--drivers/iommu/amd_iommu.c862
-rw-r--r--drivers/iommu/amd_iommu_init.c133
-rw-r--r--drivers/iommu/amd_iommu_proto.h24
-rw-r--r--drivers/iommu/amd_iommu_types.h118
-rw-r--r--drivers/iommu/amd_iommu_v2.c994
-rw-r--r--drivers/iommu/intel-iommu.c37
-rw-r--r--drivers/iommu/intr_remapping.c2
-rw-r--r--drivers/iommu/iommu.c121
-rw-r--r--drivers/iommu/msm_iommu.c25
-rw-r--r--drivers/iommu/omap-iommu.c18
-rw-r--r--drivers/iommu/omap-iovmm.c17
13 files changed, 2255 insertions, 110 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 5414253b185..6bea6962f8e 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -34,7 +34,9 @@ config AMD_IOMMU
bool "AMD IOMMU support"
select SWIOTLB
select PCI_MSI
- select PCI_IOV
+ select PCI_ATS
+ select PCI_PRI
+ select PCI_PASID
select IOMMU_API
depends on X86_64 && PCI && ACPI
---help---
@@ -58,6 +60,15 @@ config AMD_IOMMU_STATS
information to userspace via debugfs.
If unsure, say N.
+config AMD_IOMMU_V2
+ tristate "AMD IOMMU Version 2 driver (EXPERIMENTAL)"
+ depends on AMD_IOMMU && PROFILING && EXPERIMENTAL
+ select MMU_NOTIFIER
+ ---help---
+ This option enables support for the AMD IOMMUv2 features of the IOMMU
+ hardware. Select this option if you want to use devices that support
+ the the PCI PRI and PASID interface.
+
# Intel IOMMU support
config DMAR_TABLE
bool
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 2f4448794bc..0e36b4934af 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_IOMMU_API) += iommu.o
obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
+obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
obj-$(CONFIG_DMAR_TABLE) += dmar.o
obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o
obj-$(CONFIG_IRQ_REMAP) += intr_remapping.o
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 4ee277a8521..edd291070b0 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -17,6 +17,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
@@ -28,6 +29,8 @@
#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
#include <asm/msidef.h>
#include <asm/proto.h>
#include <asm/iommu.h>
@@ -41,6 +44,24 @@
#define LOOP_TIMEOUT 100000
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size is an order of a 4KiB page and that the
+ * mapping has natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are an order of 4KiB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+#define AMD_IOMMU_PGSIZES (~0xFFFUL)
+
static DEFINE_RWLOCK(amd_iommu_devtable_lock);
/* A list of preallocated protection domains */
@@ -59,6 +80,9 @@ static struct protection_domain *pt_domain;
static struct iommu_ops amd_iommu_ops;
+static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
+int amd_iommu_max_glx_val = -1;
+
/*
* general struct to manage commands send to an IOMMU
*/
@@ -67,6 +91,7 @@ struct iommu_cmd {
};
static void update_domain(struct protection_domain *domain);
+static int __init alloc_passthrough_domain(void);
/****************************************************************************
*
@@ -147,6 +172,33 @@ static struct iommu_dev_data *get_dev_data(struct device *dev)
return dev->archdata.iommu;
}
+static bool pci_iommuv2_capable(struct pci_dev *pdev)
+{
+ static const int caps[] = {
+ PCI_EXT_CAP_ID_ATS,
+ PCI_EXT_CAP_ID_PRI,
+ PCI_EXT_CAP_ID_PASID,
+ };
+ int i, pos;
+
+ for (i = 0; i < 3; ++i) {
+ pos = pci_find_ext_capability(pdev, caps[i]);
+ if (pos == 0)
+ return false;
+ }
+
+ return true;
+}
+
+static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
+{
+ struct iommu_dev_data *dev_data;
+
+ dev_data = get_dev_data(&pdev->dev);
+
+ return dev_data->errata & (1 << erratum) ? true : false;
+}
+
/*
* In this function the list of preallocated protection domains is traversed to
* find the domain for a specific device
@@ -204,6 +256,7 @@ static bool check_device(struct device *dev)
static int iommu_init_device(struct device *dev)
{
+ struct pci_dev *pdev = to_pci_dev(dev);
struct iommu_dev_data *dev_data;
u16 alias;
@@ -228,6 +281,13 @@ static int iommu_init_device(struct device *dev)
dev_data->alias_data = alias_data;
}
+ if (pci_iommuv2_capable(pdev)) {
+ struct amd_iommu *iommu;
+
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+ dev_data->iommu_v2 = iommu->is_iommu_v2;
+ }
+
dev->archdata.iommu = dev_data;
return 0;
@@ -317,6 +377,11 @@ DECLARE_STATS_COUNTER(domain_flush_single);
DECLARE_STATS_COUNTER(domain_flush_all);
DECLARE_STATS_COUNTER(alloced_io_mem);
DECLARE_STATS_COUNTER(total_map_requests);
+DECLARE_STATS_COUNTER(complete_ppr);
+DECLARE_STATS_COUNTER(invalidate_iotlb);
+DECLARE_STATS_COUNTER(invalidate_iotlb_all);
+DECLARE_STATS_COUNTER(pri_requests);
+
static struct dentry *stats_dir;
static struct dentry *de_fflush;
@@ -351,6 +416,10 @@ static void amd_iommu_stats_init(void)
amd_iommu_stats_add(&domain_flush_all);
amd_iommu_stats_add(&alloced_io_mem);
amd_iommu_stats_add(&total_map_requests);
+ amd_iommu_stats_add(&complete_ppr);
+ amd_iommu_stats_add(&invalidate_iotlb);
+ amd_iommu_stats_add(&invalidate_iotlb_all);
+ amd_iommu_stats_add(&pri_requests);
}
#endif
@@ -365,8 +434,8 @@ static void dump_dte_entry(u16 devid)
{
int i;
- for (i = 0; i < 8; ++i)
- pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
+ for (i = 0; i < 4; ++i)
+ pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
amd_iommu_dev_table[devid].data[i]);
}
@@ -461,12 +530,84 @@ static void iommu_poll_events(struct amd_iommu *iommu)
spin_unlock_irqrestore(&iommu->lock, flags);
}
+static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head)
+{
+ struct amd_iommu_fault fault;
+ volatile u64 *raw;
+ int i;
+
+ INC_STATS_COUNTER(pri_requests);
+
+ raw = (u64 *)(iommu->ppr_log + head);
+
+ /*
+ * Hardware bug: Interrupt may arrive before the entry is written to
+ * memory. If this happens we need to wait for the entry to arrive.
+ */
+ for (i = 0; i < LOOP_TIMEOUT; ++i) {
+ if (PPR_REQ_TYPE(raw[0]) != 0)
+ break;
+ udelay(1);
+ }
+
+ if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
+ pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
+ return;
+ }
+
+ fault.address = raw[1];
+ fault.pasid = PPR_PASID(raw[0]);
+ fault.device_id = PPR_DEVID(raw[0]);
+ fault.tag = PPR_TAG(raw[0]);
+ fault.flags = PPR_FLAGS(raw[0]);
+
+ /*
+ * To detect the hardware bug we need to clear the entry
+ * to back to zero.
+ */
+ raw[0] = raw[1] = 0;
+
+ atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
+}
+
+static void iommu_poll_ppr_log(struct amd_iommu *iommu)
+{
+ unsigned long flags;
+ u32 head, tail;
+
+ if (iommu->ppr_log == NULL)
+ return;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+
+ while (head != tail) {
+
+ /* Handle PPR entry */
+ iommu_handle_ppr_entry(iommu, head);
+
+ /* Update and refresh ring-buffer state*/
+ head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
+ writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+ }
+
+ /* enable ppr interrupts again */
+ writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
struct amd_iommu *iommu;
- for_each_iommu(iommu)
+ for_each_iommu(iommu) {
iommu_poll_events(iommu);
+ iommu_poll_ppr_log(iommu);
+ }
return IRQ_HANDLED;
}
@@ -595,6 +736,60 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
}
+static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
+ u64 address, bool size)
+{
+ memset(cmd, 0, sizeof(*cmd));
+
+ address &= ~(0xfffULL);
+
+ cmd->data[0] = pasid & PASID_MASK;
+ cmd->data[1] = domid;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[3] = upper_32_bits(address);
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+ if (size)
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+}
+
+static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid,
+ int qdep, u64 address, bool size)
+{
+ memset(cmd, 0, sizeof(*cmd));
+
+ address &= ~(0xfffULL);
+
+ cmd->data[0] = devid;
+ cmd->data[0] |= (pasid & 0xff) << 16;
+ cmd->data[0] |= (qdep & 0xff) << 24;
+ cmd->data[1] = devid;
+ cmd->data[1] |= ((pasid >> 8) & 0xfff) << 16;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+ cmd->data[3] = upper_32_bits(address);
+ if (size)
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+}
+
+static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
+ int status, int tag, bool gn)
+{
+ memset(cmd, 0, sizeof(*cmd));
+
+ cmd->data[0] = devid;
+ if (gn) {
+ cmd->data[1] = pasid & PASID_MASK;
+ cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK;
+ }
+ cmd->data[3] = tag & 0x1ff;
+ cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
+
+ CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
+}
+
static void build_inv_all(struct iommu_cmd *cmd)
{
memset(cmd, 0, sizeof(*cmd));
@@ -1496,6 +1691,48 @@ static void free_pagetable(struct protection_domain *domain)
domain->pt_root = NULL;
}
+static void free_gcr3_tbl_level1(u64 *tbl)
+{
+ u64 *ptr;
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (!(tbl[i] & GCR3_VALID))
+ continue;
+
+ ptr = __va(tbl[i] & PAGE_MASK);
+
+ free_page((unsigned long)ptr);
+ }
+}
+
+static void free_gcr3_tbl_level2(u64 *tbl)
+{
+ u64 *ptr;
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (!(tbl[i] & GCR3_VALID))
+ continue;
+
+ ptr = __va(tbl[i] & PAGE_MASK);
+
+ free_gcr3_tbl_level1(ptr);
+ }
+}
+
+static void free_gcr3_table(struct protection_domain *domain)
+{
+ if (domain->glx == 2)
+ free_gcr3_tbl_level2(domain->gcr3_tbl);
+ else if (domain->glx == 1)
+ free_gcr3_tbl_level1(domain->gcr3_tbl);
+ else if (domain->glx != 0)
+ BUG();
+
+ free_page((unsigned long)domain->gcr3_tbl);
+}
+
/*
* Free a domain, only used if something went wrong in the
* allocation path and we need to free an already allocated page table
@@ -1582,20 +1819,52 @@ static bool dma_ops_domain(struct protection_domain *domain)
static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
{
- u64 pte_root = virt_to_phys(domain->pt_root);
- u32 flags = 0;
+ u64 pte_root = 0;
+ u64 flags = 0;
+
+ if (domain->mode != PAGE_MODE_NONE)
+ pte_root = virt_to_phys(domain->pt_root);
pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
+ flags = amd_iommu_dev_table[devid].data[1];
+
if (ats)
flags |= DTE_FLAG_IOTLB;
- amd_iommu_dev_table[devid].data[3] |= flags;
- amd_iommu_dev_table[devid].data[2] = domain->id;
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+ if (domain->flags & PD_IOMMUV2_MASK) {
+ u64 gcr3 = __pa(domain->gcr3_tbl);
+ u64 glx = domain->glx;
+ u64 tmp;
+
+ pte_root |= DTE_FLAG_GV;
+ pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+
+ /* First mask out possible old values for GCR3 table */
+ tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+ flags &= ~tmp;
+
+ tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+ flags &= ~tmp;
+
+ /* Encode GCR3 table into DTE */
+ tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
+ pte_root |= tmp;
+
+ tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
+ flags |= tmp;
+
+ tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
+ flags |= tmp;
+ }
+
+ flags &= ~(0xffffUL);
+ flags |= domain->id;
+
+ amd_iommu_dev_table[devid].data[1] = flags;
+ amd_iommu_dev_table[devid].data[0] = pte_root;
}
static void clear_dte_entry(u16 devid)
@@ -1603,7 +1872,6 @@ static void clear_dte_entry(u16 devid)
/* remove entry from the device table seen by the hardware */
amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
amd_iommu_dev_table[devid].data[1] = 0;
- amd_iommu_dev_table[devid].data[2] = 0;
amd_iommu_apply_erratum_63(devid);
}
@@ -1696,6 +1964,93 @@ out_unlock:
return ret;
}
+
+static void pdev_iommuv2_disable(struct pci_dev *pdev)
+{
+ pci_disable_ats(pdev);
+ pci_disable_pri(pdev);
+ pci_disable_pasid(pdev);
+}
+
+/* FIXME: Change generic reset-function to do the same */
+static int pri_reset_while_enabled(struct pci_dev *pdev)
+{
+ u16 control;
+ int pos;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+ if (!pos)
+ return -EINVAL;
+
+ pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
+ control |= PCI_PRI_CTRL_RESET;
+ pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
+
+ return 0;
+}
+
+static int pdev_iommuv2_enable(struct pci_dev *pdev)
+{
+ bool reset_enable;
+ int reqs, ret;
+
+ /* FIXME: Hardcode number of outstanding requests for now */
+ reqs = 32;
+ if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
+ reqs = 1;
+ reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
+
+ /* Only allow access to user-accessible pages */
+ ret = pci_enable_pasid(pdev, 0);
+ if (ret)
+ goto out_err;
+
+ /* First reset the PRI state of the device */
+ ret = pci_reset_pri(pdev);
+ if (ret)
+ goto out_err;
+
+ /* Enable PRI */
+ ret = pci_enable_pri(pdev, reqs);
+ if (ret)
+ goto out_err;
+
+ if (reset_enable) {
+ ret = pri_reset_while_enabled(pdev);
+ if (ret)
+ goto out_err;
+ }
+
+ ret = pci_enable_ats(pdev, PAGE_SHIFT);
+ if (ret)
+ goto out_err;
+
+ return 0;
+
+out_err:
+ pci_disable_pri(pdev);
+ pci_disable_pasid(pdev);
+
+ return ret;
+}
+
+/* FIXME: Move this to PCI code */
+#define PCI_PRI_TLP_OFF (1 << 2)
+
+bool pci_pri_tlp_required(struct pci_dev *pdev)
+{
+ u16 control;
+ int pos;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+ if (!pos)
+ return false;
+
+ pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
+
+ return (control & PCI_PRI_TLP_OFF) ? true : false;
+}
+
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
@@ -1710,7 +2065,18 @@ static int attach_device(struct device *dev,
dev_data = get_dev_data(dev);
- if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
+ if (domain->flags & PD_IOMMUV2_MASK) {
+ if (!dev_data->iommu_v2 || !dev_data->passthrough)
+ return -EINVAL;
+
+ if (pdev_iommuv2_enable(pdev) != 0)
+ return -EINVAL;
+
+ dev_data->ats.enabled = true;
+ dev_data->ats.qdep = pci_ats_queue_depth(pdev);
+ dev_data->pri_tlp = pci_pri_tlp_required(pdev);
+ } else if (amd_iommu_iotlb_sup &&
+ pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
dev_data->ats.enabled = true;
dev_data->ats.qdep = pci_ats_queue_depth(pdev);
}
@@ -1760,7 +2126,7 @@ static void __detach_device(struct iommu_dev_data *dev_data)
* passthrough domain if it is detached from any other domain.
* Make sure we can deassign from the pt_domain itself.
*/
- if (iommu_pass_through &&
+ if (dev_data->passthrough &&
(dev_data->domain == NULL && domain != pt_domain))
__attach_device(dev_data, pt_domain);
}
@@ -1770,20 +2136,24 @@ static void __detach_device(struct iommu_dev_data *dev_data)
*/
static void detach_device(struct device *dev)
{
+ struct protection_domain *domain;
struct iommu_dev_data *dev_data;
unsigned long flags;
dev_data = get_dev_data(dev);
+ domain = dev_data->domain;
/* lock device table */
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
__detach_device(dev_data);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
- if (dev_data->ats.enabled) {
+ if (domain->flags & PD_IOMMUV2_MASK)
+ pdev_iommuv2_disable(to_pci_dev(dev));
+ else if (dev_data->ats.enabled)
pci_disable_ats(to_pci_dev(dev));
- dev_data->ats.enabled = false;
- }
+
+ dev_data->ats.enabled = false;
}
/*
@@ -1818,18 +2188,20 @@ static struct protection_domain *domain_for_device(struct device *dev)
static int device_change_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
- struct device *dev = data;
- u16 devid;
- struct protection_domain *domain;
struct dma_ops_domain *dma_domain;
+ struct protection_domain *domain;
+ struct iommu_dev_data *dev_data;
+ struct device *dev = data;
struct amd_iommu *iommu;
unsigned long flags;
+ u16 devid;
if (!check_device(dev))
return 0;
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
+ dev_data = get_dev_data(dev);
switch (action) {
case BUS_NOTIFY_UNBOUND_DRIVER:
@@ -1838,7 +2210,7 @@ static int device_change_notifier(struct notifier_block *nb,
if (!domain)
goto out;
- if (iommu_pass_through)
+ if (dev_data->passthrough)
break;
detach_device(dev);
break;
@@ -2434,8 +2806,9 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
*/
static void prealloc_protection_domains(void)
{
- struct pci_dev *dev = NULL;
+ struct iommu_dev_data *dev_data;
struct dma_ops_domain *dma_dom;
+ struct pci_dev *dev = NULL;
u16 devid;
for_each_pci_dev(dev) {
@@ -2444,6 +2817,16 @@ static void prealloc_protection_domains(void)
if (!check_device(&dev->dev))
continue;
+ dev_data = get_dev_data(&dev->dev);
+ if (!amd_iommu_force_isolation && dev_data->iommu_v2) {
+ /* Make sure passthrough domain is allocated */
+ alloc_passthrough_domain();
+ dev_data->passthrough = true;
+ attach_device(&dev->dev, pt_domain);
+ pr_info("AMD-Vi: Using passthough domain for device %s\n",
+ dev_name(&dev->dev));
+ }
+
/* Is there already any domain for it? */
if (domain_for_device(&dev->dev))
continue;
@@ -2474,6 +2857,7 @@ static struct dma_map_ops amd_iommu_dma_ops = {
static unsigned device_dma_ops_init(void)
{
+ struct iommu_dev_data *dev_data;
struct pci_dev *pdev = NULL;
unsigned unhandled = 0;
@@ -2483,7 +2867,12 @@ static unsigned device_dma_ops_init(void)
continue;
}
- pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
+ dev_data = get_dev_data(&pdev->dev);
+
+ if (!dev_data->passthrough)
+ pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
+ else
+ pdev->dev.archdata.dma_ops = &nommu_dma_ops;
}
return unhandled;
@@ -2610,6 +2999,20 @@ out_err:
return NULL;
}
+static int __init alloc_passthrough_domain(void)
+{
+ if (pt_domain != NULL)
+ return 0;
+
+ /* allocate passthrough domain */
+ pt_domain = protection_domain_alloc();
+ if (!pt_domain)
+ return -ENOMEM;
+
+ pt_domain->mode = PAGE_MODE_NONE;
+
+ return 0;
+}
static int amd_iommu_domain_init(struct iommu_domain *dom)
{
struct protection_domain *domain;
@@ -2623,6 +3026,8 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
if (!domain->pt_root)
goto out_free;
+ domain->iommu_domain = dom;
+
dom->priv = domain;
return 0;
@@ -2645,7 +3050,11 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
BUG_ON(domain->dev_cnt != 0);
- free_pagetable(domain);
+ if (domain->mode != PAGE_MODE_NONE)
+ free_pagetable(domain);
+
+ if (domain->flags & PD_IOMMUV2_MASK)
+ free_gcr3_table(domain);
protection_domain_free(domain);
@@ -2702,13 +3111,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
}
static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, int gfp_order, int iommu_prot)
+ phys_addr_t paddr, size_t page_size, int iommu_prot)
{
- unsigned long page_size = 0x1000UL << gfp_order;
struct protection_domain *domain = dom->priv;
int prot = 0;
int ret;
+ if (domain->mode == PAGE_MODE_NONE)
+ return -EINVAL;
+
if (iommu_prot & IOMMU_READ)
prot |= IOMMU_PROT_IR;
if (iommu_prot & IOMMU_WRITE)
@@ -2721,13 +3132,14 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
return ret;
}
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
- int gfp_order)
+static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+ size_t page_size)
{
struct protection_domain *domain = dom->priv;
- unsigned long page_size, unmap_size;
+ size_t unmap_size;
- page_size = 0x1000UL << gfp_order;
+ if (domain->mode == PAGE_MODE_NONE)
+ return -EINVAL;
mutex_lock(&domain->api_lock);
unmap_size = iommu_unmap_page(domain, iova, page_size);
@@ -2735,7 +3147,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
domain_flush_tlb_pde(domain);
- return get_order(unmap_size);
+ return unmap_size;
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2746,6 +3158,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
phys_addr_t paddr;
u64 *pte, __pte;
+ if (domain->mode == PAGE_MODE_NONE)
+ return iova;
+
pte = fetch_pte(domain, iova);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
@@ -2782,6 +3197,7 @@ static struct iommu_ops amd_iommu_ops = {
.unmap = amd_iommu_unmap,
.iova_to_phys = amd_iommu_iova_to_phys,
.domain_has_cap = amd_iommu_domain_has_cap,
+ .pgsize_bitmap = AMD_IOMMU_PGSIZES,
};
/*****************************************************************************
@@ -2796,21 +3212,23 @@ static struct iommu_ops amd_iommu_ops = {
int __init amd_iommu_init_passthrough(void)
{
- struct amd_iommu *iommu;
+ struct iommu_dev_data *dev_data;
struct pci_dev *dev = NULL;
+ struct amd_iommu *iommu;
u16 devid;
+ int ret;
- /* allocate passthrough domain */
- pt_domain = protection_domain_alloc();
- if (!pt_domain)
- return -ENOMEM;
-
- pt_domain->mode |= PAGE_MODE_NONE;
+ ret = alloc_passthrough_domain();
+ if (ret)
+ return ret;
for_each_pci_dev(dev) {
if (!check_device(&dev->dev))
continue;
+ dev_data = get_dev_data(&dev->dev);
+ dev_data->passthrough = true;
+
devid = get_device_id(&dev->dev);
iommu = amd_iommu_rlookup_table[devid];
@@ -2820,7 +3238,375 @@ int __init amd_iommu_init_passthrough(void)
attach_device(&dev->dev, pt_domain);
}
+ amd_iommu_stats_init();
+
pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
return 0;
}
+
+/* IOMMUv2 specific functions */
+int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
+
+int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
+
+void amd_iommu_domain_direct_map(struct iommu_domain *dom)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ /* Update data structure */
+ domain->mode = PAGE_MODE_NONE;
+ domain->updated = true;
+
+ /* Make changes visible to IOMMUs */
+ update_domain(domain);
+
+ /* Page-table is not visible to IOMMU anymore, so free it */
+ free_pagetable(domain);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+EXPORT_SYMBOL(amd_iommu_domain_direct_map);
+
+int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long flags;
+ int levels, ret;
+
+ if (pasids <= 0 || pasids > (PASID_MASK + 1))
+ return -EINVAL;
+
+ /* Number of GCR3 table levels required */
+ for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
+ levels += 1;
+
+ if (levels > amd_iommu_max_glx_val)
+ return -EINVAL;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ /*
+ * Save us all sanity checks whether devices already in the
+ * domain support IOMMUv2. Just force that the domain has no
+ * devices attached when it is switched into IOMMUv2 mode.
+ */
+ ret = -EBUSY;
+ if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
+ goto out;
+
+ ret = -ENOMEM;
+ domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (domain->gcr3_tbl == NULL)
+ goto out;
+
+ domain->glx = levels;
+ domain->flags |= PD_IOMMUV2_MASK;
+ domain->updated = true;
+
+ update_domain(domain);
+
+ ret = 0;
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
+
+static int __flush_pasid(struct protection_domain *domain, int pasid,
+ u64 address, bool size)
+{
+ struct iommu_dev_data *dev_data;
+ struct iommu_cmd cmd;
+ int i, ret;
+
+ if (!(domain->flags & PD_IOMMUV2_MASK))
+ return -EINVAL;
+
+ build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
+
+ /*
+ * IOMMU TLB needs to be flushed before Device TLB to
+ * prevent device TLB refill from IOMMU TLB
+ */
+ for (i = 0; i < amd_iommus_present; ++i) {
+ if (domain->dev_iommu[i] == 0)
+ continue;
+
+ ret = iommu_queue_command(amd_iommus[i], &cmd);
+ if (ret != 0)
+ goto out;
+ }
+
+ /* Wait until IOMMU TLB flushes are complete */
+ domain_flush_complete(domain);
+
+ /* Now flush device TLBs */
+ list_for_each_entry(dev_data, &domain->dev_list, list) {
+ struct amd_iommu *iommu;
+ int qdep;
+
+ BUG_ON(!dev_data->ats.enabled);
+
+ qdep = dev_data->ats.qdep;
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+ build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
+ qdep, address, size);
+
+ ret = iommu_queue_command(iommu, &cmd);
+ if (ret != 0)
+ goto out;
+ }
+
+ /* Wait until all device TLBs are flushed */
+ domain_flush_complete(domain);
+
+ ret = 0;
+
+out:
+
+ return ret;
+}
+
+static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
+ u64 address)
+{
+ INC_STATS_COUNTER(invalidate_iotlb);
+
+ return __flush_pasid(domain, pasid, address, false);
+}
+
+int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
+ u64 address)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __amd_iommu_flush_page(domain, pasid, address);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_page);
+
+static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
+{
+ INC_STATS_COUNTER(invalidate_iotlb_all);
+
+ return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ true);
+}
+
+int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __amd_iommu_flush_tlb(domain, pasid);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_tlb);
+
+static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
+{
+ int index;
+ u64 *pte;
+
+ while (true) {
+
+ index = (pasid >> (9 * level)) & 0x1ff;
+ pte = &root[index];
+
+ if (level == 0)
+ break;
+
+ if (!(*pte & GCR3_VALID)) {
+ if (!alloc)
+ return NULL;
+
+ root = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (root == NULL)
+ return NULL;
+
+ *pte = __pa(root) | GCR3_VALID;
+ }
+
+ root = __va(*pte & PAGE_MASK);
+
+ level -= 1;
+ }
+
+ return pte;
+}
+
+static int __set_gcr3(struct protection_domain *domain, int pasid,
+ unsigned long cr3)
+{
+ u64 *pte;
+
+ if (domain->mode != PAGE_MODE_NONE)
+ return -EINVAL;
+
+ pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
+ if (pte == NULL)
+ return -ENOMEM;
+
+ *pte = (cr3 & PAGE_MASK) | GCR3_VALID;
+
+ return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+static int __clear_gcr3(struct protection_domain *domain, int pasid)
+{
+ u64 *pte;
+
+ if (domain->mode != PAGE_MODE_NONE)
+ return -EINVAL;
+
+ pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
+ if (pte == NULL)
+ return 0;
+
+ *pte = 0;
+
+ return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
+ unsigned long cr3)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __set_gcr3(domain, pasid, cr3);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
+
+int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __clear_gcr3(domain, pasid);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
+
+int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
+ int status, int tag)
+{
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu;
+ struct iommu_cmd cmd;
+
+ INC_STATS_COUNTER(complete_ppr);
+
+ dev_data = get_dev_data(&pdev->dev);
+ iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+ build_complete_ppr(&cmd, dev_data->devid, pasid, status,
+ tag, dev_data->pri_tlp);
+
+ return iommu_queue_command(iommu, &cmd);
+}
+EXPORT_SYMBOL(amd_iommu_complete_ppr);
+
+struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
+{
+ struct protection_domain *domain;
+
+ domain = get_domain(&pdev->dev);
+ if (IS_ERR(domain))
+ return NULL;
+
+ /* Only return IOMMUv2 domains */
+ if (!(domain->flags & PD_IOMMUV2_MASK))
+ return NULL;
+
+ return domain->iommu_domain;
+}
+EXPORT_SYMBOL(amd_iommu_get_v2_domain);
+
+void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
+{
+ struct iommu_dev_data *dev_data;
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ dev_data = get_dev_data(&pdev->dev);
+ dev_data->errata |= (1 << erratum);
+}
+EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
+
+int amd_iommu_device_info(struct pci_dev *pdev,
+ struct amd_iommu_device_info *info)
+{
+ int max_pasids;
+ int pos;
+
+ if (pdev == NULL || info == NULL)
+ return -EINVAL;
+
+ if (!amd_iommu_v2_supported())
+ return -EINVAL;
+
+ memset(info, 0, sizeof(*info));
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS);
+ if (pos)
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+ if (pos)
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
+ if (pos) {
+ int features;
+
+ max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
+ max_pasids = min(max_pasids, (1 << 20));
+
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+ info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
+
+ features = pci_pasid_features(pdev);
+ if (features & PCI_PASID_CAP_EXEC)
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
+ if (features & PCI_PASID_CAP_PRIV)
+ info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(amd_iommu_device_info);
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 82d2410f420..bdea288dc18 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -25,6 +25,7 @@
#include <linux/interrupt.h>
#include <linux/msi.h>
#include <linux/amd-iommu.h>
+#include <linux/export.h>
#include <asm/pci-direct.h>
#include <asm/iommu.h>
#include <asm/gart.h>
@@ -141,6 +142,12 @@ int amd_iommus_present;
bool amd_iommu_np_cache __read_mostly;
bool amd_iommu_iotlb_sup __read_mostly = true;
+u32 amd_iommu_max_pasids __read_mostly = ~0;
+
+bool amd_iommu_v2_present __read_mostly;
+
+bool amd_iommu_force_isolation __read_mostly;
+
/*
* The ACPI table parsing functions set this variable on an error
*/
@@ -299,6 +306,16 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
}
+static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout)
+{
+ u32 ctrl;
+
+ ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl &= ~CTRL_INV_TO_MASK;
+ ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK;
+ writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
/* Function to enable the hardware */
static void iommu_enable(struct amd_iommu *iommu)
{
@@ -581,21 +598,69 @@ static void __init free_event_buffer(struct amd_iommu *iommu)
free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
}
+/* allocates the memory where the IOMMU will log its events to */
+static u8 * __init alloc_ppr_log(struct amd_iommu *iommu)
+{
+ iommu->ppr_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(PPR_LOG_SIZE));
+
+ if (iommu->ppr_log == NULL)
+ return NULL;
+
+ return iommu->ppr_log;
+}
+
+static void iommu_enable_ppr_log(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ if (iommu->ppr_log == NULL)
+ return;
+
+ entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
+
+ memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET,
+ &entry, sizeof(entry));
+
+ /* set head and tail to zero manually */
+ writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+
+ iommu_feature_enable(iommu, CONTROL_PPFLOG_EN);
+ iommu_feature_enable(iommu, CONTROL_PPR_EN);
+}
+
+static void __init free_ppr_log(struct amd_iommu *iommu)
+{
+ if (iommu->ppr_log == NULL)
+ return;
+
+ free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE));
+}
+
+static void iommu_enable_gt(struct amd_iommu *iommu)
+{
+ if (!iommu_feature(iommu, FEATURE_GT))
+ return;
+
+ iommu_feature_enable(iommu, CONTROL_GT_EN);
+}
+
/* sets a specific bit in the device table entry. */
static void set_dev_entry_bit(u16 devid, u8 bit)
{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
+ int i = (bit >> 6) & 0x03;
+ int _bit = bit & 0x3f;
- amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
+ amd_iommu_dev_table[devid].data[i] |= (1UL << _bit);
}
static int get_dev_entry_bit(u16 devid, u8 bit)
{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
+ int i = (bit >> 6) & 0x03;
+ int _bit = bit & 0x3f;
- return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
+ return (amd_iommu_dev_table[devid].data[i] & (1UL << _bit)) >> _bit;
}
@@ -699,6 +764,32 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
iommu->features = ((u64)high << 32) | low;
+ if (iommu_feature(iommu, FEATURE_GT)) {
+ int glxval;
+ u32 pasids;
+ u64 shift;
+
+ shift = iommu->features & FEATURE_PASID_MASK;
+ shift >>= FEATURE_PASID_SHIFT;
+ pasids = (1 << shift);
+
+ amd_iommu_max_pasids = min(amd_iommu_max_pasids, pasids);
+
+ glxval = iommu->features & FEATURE_GLXVAL_MASK;
+ glxval >>= FEATURE_GLXVAL_SHIFT;
+
+ if (amd_iommu_max_glx_val == -1)
+ amd_iommu_max_glx_val = glxval;
+ else
+ amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval);
+ }
+
+ if (iommu_feature(iommu, FEATURE_GT) &&
+ iommu_feature(iommu, FEATURE_PPR)) {
+ iommu->is_iommu_v2 = true;
+ amd_iommu_v2_present = true;
+ }
+
if (!is_rd890_iommu(iommu->dev))
return;
@@ -901,6 +992,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu)
{
free_command_buffer(iommu);
free_event_buffer(iommu);
+ free_ppr_log(iommu);
iommu_unmap_mmio_space(iommu);
}
@@ -964,6 +1056,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
init_iommu_from_acpi(iommu, h);
init_iommu_devices(iommu);
+ if (iommu_feature(iommu, FEATURE_PPR)) {
+ iommu->ppr_log = alloc_ppr_log(iommu);
+ if (!iommu->ppr_log)
+ return -ENOMEM;
+ }
+
if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
amd_iommu_np_cache = true;
@@ -1050,6 +1148,9 @@ static int iommu_setup_msi(struct amd_iommu *iommu)
iommu->int_enabled = true;
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+ if (iommu->ppr_log != NULL)
+ iommu_feature_enable(iommu, CONTROL_PPFINT_EN);
+
return 0;
}
@@ -1209,6 +1310,9 @@ static void iommu_init_flags(struct amd_iommu *iommu)
* make IOMMU memory accesses cache coherent
*/
iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+ /* Set IOTLB invalidation timeout to 1s */
+ iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S);
}
static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
@@ -1274,6 +1378,8 @@ static void enable_iommus(void)
iommu_set_device_table(iommu);
iommu_enable_command_buffer(iommu);
iommu_enable_event_buffer(iommu);
+ iommu_enable_ppr_log(iommu);
+ iommu_enable_gt(iommu);
iommu_set_exclusion_range(iommu);
iommu_init_msi(iommu);
iommu_enable(iommu);
@@ -1303,13 +1409,6 @@ static void amd_iommu_resume(void)
/* re-load the hardware */
enable_iommus();
-
- /*
- * we have to flush after the IOMMUs are enabled because a
- * disabled IOMMU will never execute the commands we send
- */
- for_each_iommu(iommu)
- iommu_flush_all_caches(iommu);
}
static int amd_iommu_suspend(void)
@@ -1560,6 +1659,8 @@ static int __init parse_amd_iommu_options(char *str)
amd_iommu_unmap_flush = true;
if (strncmp(str, "off", 3) == 0)
amd_iommu_disabled = true;
+ if (strncmp(str, "force_isolation", 15) == 0)
+ amd_iommu_force_isolation = true;
}
return 1;
@@ -1572,3 +1673,9 @@ IOMMU_INIT_FINISH(amd_iommu_detect,
gart_iommu_hole_init,
0,
0);
+
+bool amd_iommu_v2_supported(void)
+{
+ return amd_iommu_v2_present;
+}
+EXPORT_SYMBOL(amd_iommu_v2_supported);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 7ffaa64410b..1a7f41c6cc6 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -31,6 +31,30 @@ extern int amd_iommu_init_devices(void);
extern void amd_iommu_uninit_devices(void);
extern void amd_iommu_init_notifier(void);
extern void amd_iommu_init_api(void);
+
+/* IOMMUv2 specific functions */
+struct iommu_domain;
+
+extern bool amd_iommu_v2_supported(void);
+extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb);
+extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb);
+extern void amd_iommu_domain_direct_map(struct iommu_domain *dom);
+extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids);
+extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
+ u64 address);
+extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid);
+extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
+ unsigned long cr3);
+extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid);
+extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev);
+
+#define PPR_SUCCESS 0x0
+#define PPR_INVALID 0x1
+#define PPR_FAILURE 0xf
+
+extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
+ int status, int tag);
+
#ifndef CONFIG_AMD_IOMMU_STATS
static inline void amd_iommu_stats_init(void) { }
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 5b9c5075e81..2452f3b7173 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -69,11 +69,14 @@
#define MMIO_EXCL_BASE_OFFSET 0x0020
#define MMIO_EXCL_LIMIT_OFFSET 0x0028
#define MMIO_EXT_FEATURES 0x0030
+#define MMIO_PPR_LOG_OFFSET 0x0038
#define MMIO_CMD_HEAD_OFFSET 0x2000
#define MMIO_CMD_TAIL_OFFSET 0x2008
#define MMIO_EVT_HEAD_OFFSET 0x2010
#define MMIO_EVT_TAIL_OFFSET 0x2018
#define MMIO_STATUS_OFFSET 0x2020
+#define MMIO_PPR_HEAD_OFFSET 0x2030
+#define MMIO_PPR_TAIL_OFFSET 0x2038
/* Extended Feature Bits */
@@ -87,8 +90,17 @@
#define FEATURE_HE (1ULL<<8)
#define FEATURE_PC (1ULL<<9)
+#define FEATURE_PASID_SHIFT 32
+#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT)
+
+#define FEATURE_GLXVAL_SHIFT 14
+#define FEATURE_GLXVAL_MASK (0x03ULL << FEATURE_GLXVAL_SHIFT)
+
+#define PASID_MASK 0x000fffff
+
/* MMIO status bits */
-#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
+#define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2)
+#define MMIO_STATUS_PPR_INT_MASK (1 << 6)
/* event logging constants */
#define EVENT_ENTRY_SIZE 0x10
@@ -115,6 +127,7 @@
#define CONTROL_EVT_LOG_EN 0x02ULL
#define CONTROL_EVT_INT_EN 0x03ULL
#define CONTROL_COMWAIT_EN 0x04ULL
+#define CONTROL_INV_TIMEOUT 0x05ULL
#define CONTROL_PASSPW_EN 0x08ULL
#define CONTROL_RESPASSPW_EN 0x09ULL
#define CONTROL_COHERENT_EN 0x0aULL
@@ -122,18 +135,34 @@
#define CONTROL_CMDBUF_EN 0x0cULL
#define CONTROL_PPFLOG_EN 0x0dULL
#define CONTROL_PPFINT_EN 0x0eULL
+#define CONTROL_PPR_EN 0x0fULL
+#define CONTROL_GT_EN 0x10ULL
+
+#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT)
+#define CTRL_INV_TO_NONE 0
+#define CTRL_INV_TO_1MS 1
+#define CTRL_INV_TO_10MS 2
+#define CTRL_INV_TO_100MS 3
+#define CTRL_INV_TO_1S 4
+#define CTRL_INV_TO_10S 5
+#define CTRL_INV_TO_100S 6
/* command specific defines */
#define CMD_COMPL_WAIT 0x01
#define CMD_INV_DEV_ENTRY 0x02
#define CMD_INV_IOMMU_PAGES 0x03
#define CMD_INV_IOTLB_PAGES 0x04
+#define CMD_COMPLETE_PPR 0x07
#define CMD_INV_ALL 0x08
#define CMD_COMPL_WAIT_STORE_MASK 0x01
#define CMD_COMPL_WAIT_INT_MASK 0x02
#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
+#define CMD_INV_IOMMU_PAGES_GN_MASK 0x04
+
+#define PPR_STATUS_MASK 0xf
+#define PPR_STATUS_SHIFT 12
#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
@@ -165,6 +194,23 @@
#define EVT_BUFFER_SIZE 8192 /* 512 entries */
#define EVT_LEN_MASK (0x9ULL << 56)
+/* Constants for PPR Log handling */
+#define PPR_LOG_ENTRIES 512
+#define PPR_LOG_SIZE_SHIFT 56
+#define PPR_LOG_SIZE_512 (0x9ULL << PPR_LOG_SIZE_SHIFT)
+#define PPR_ENTRY_SIZE 16
+#define PPR_LOG_SIZE (PPR_ENTRY_SIZE * PPR_LOG_ENTRIES)
+
+#define PPR_REQ_TYPE(x) (((x) >> 60) & 0xfULL)
+#define PPR_FLAGS(x) (((x) >> 48) & 0xfffULL)
+#define PPR_DEVID(x) ((x) & 0xffffULL)
+#define PPR_TAG(x) (((x) >> 32) & 0x3ffULL)
+#define PPR_PASID1(x) (((x) >> 16) & 0xffffULL)
+#define PPR_PASID2(x) (((x) >> 42) & 0xfULL)
+#define PPR_PASID(x) ((PPR_PASID2(x) << 16) | PPR_PASID1(x))
+
+#define PPR_REQ_FAULT 0x01
+
#define PAGE_MODE_NONE 0x00
#define PAGE_MODE_1_LEVEL 0x01
#define PAGE_MODE_2_LEVEL 0x02
@@ -230,7 +276,24 @@
#define IOMMU_PTE_IR (1ULL << 61)
#define IOMMU_PTE_IW (1ULL << 62)
-#define DTE_FLAG_IOTLB 0x01
+#define DTE_FLAG_IOTLB (0x01UL << 32)
+#define DTE_FLAG_GV (0x01ULL << 55)
+#define DTE_GLX_SHIFT (56)
+#define DTE_GLX_MASK (3)
+
+#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL)
+#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL)
+#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0xfffffULL)
+
+#define DTE_GCR3_INDEX_A 0
+#define DTE_GCR3_INDEX_B 1
+#define DTE_GCR3_INDEX_C 1
+
+#define DTE_GCR3_SHIFT_A 58
+#define DTE_GCR3_SHIFT_B 16
+#define DTE_GCR3_SHIFT_C 43
+
+#define GCR3_VALID 0x01ULL
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
@@ -257,6 +320,7 @@
domain for an IOMMU */
#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
translation */
+#define PD_IOMMUV2_MASK (1UL << 3) /* domain has gcr3 table */
extern bool amd_iommu_dump;
#define DUMP_printk(format, arg...) \
@@ -285,6 +349,29 @@ extern bool amd_iommu_iotlb_sup;
#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
+
+/*
+ * This struct is used to pass information about
+ * incoming PPR faults around.
+ */
+struct amd_iommu_fault {
+ u64 address; /* IO virtual address of the fault*/
+ u32 pasid; /* Address space identifier */
+ u16 device_id; /* Originating PCI device id */
+ u16 tag; /* PPR tag */
+ u16 flags; /* Fault flags */
+
+};
+
+#define PPR_FAULT_EXEC (1 << 1)
+#define PPR_FAULT_READ (1 << 2)
+#define PPR_FAULT_WRITE (1 << 5)
+#define PPR_FAULT_USER (1 << 6)
+#define PPR_FAULT_RSVD (1 << 7)
+#define PPR_FAULT_GN (1 << 8)
+
+struct iommu_domain;
+
/*
* This structure contains generic data for IOMMU protection domains
* independent of their use.
@@ -297,11 +384,15 @@ struct protection_domain {
u16 id; /* the domain id written to the device table */
int mode; /* paging mode (0-6 levels) */
u64 *pt_root; /* page table root pointer */
+ int glx; /* Number of levels for GCR3 table */
+ u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
bool updated; /* complete domain flush required */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
void *priv; /* private data */
+ struct iommu_domain *iommu_domain; /* Pointer to generic
+ domain structure */
};
@@ -315,10 +406,15 @@ struct iommu_dev_data {
struct protection_domain *domain; /* Domain the device is bound to */
atomic_t bind; /* Domain attach reverent count */
u16 devid; /* PCI Device ID */
+ bool iommu_v2; /* Device can make use of IOMMUv2 */
+ bool passthrough; /* Default for device is pt_domain */
struct {
bool enabled;
int qdep;
} ats; /* ATS state */
+ bool pri_tlp; /* PASID TLB required for
+ PPR completions */
+ u32 errata; /* Bitmap for errata to apply */
};
/*
@@ -399,6 +495,9 @@ struct amd_iommu {
/* Extended features */
u64 features;
+ /* IOMMUv2 */
+ bool is_iommu_v2;
+
/*
* Capability pointer. There could be more than one IOMMU per PCI
* device function if there are more than one AMD IOMMU capability
@@ -431,6 +530,9 @@ struct amd_iommu {
/* MSI number for event interrupt */
u16 evt_msi_num;
+ /* Base of the PPR log, if present */
+ u8 *ppr_log;
+
/* true if interrupts for this IOMMU are already enabled */
bool int_enabled;
@@ -484,7 +586,7 @@ extern struct list_head amd_iommu_pd_list;
* Structure defining one entry in the device table
*/
struct dev_table_entry {
- u32 data[8];
+ u64 data[4];
};
/*
@@ -549,6 +651,16 @@ extern unsigned long *amd_iommu_pd_alloc_bitmap;
*/
extern bool amd_iommu_unmap_flush;
+/* Smallest number of PASIDs supported by any IOMMU in the system */
+extern u32 amd_iommu_max_pasids;
+
+extern bool amd_iommu_v2_present;
+
+extern bool amd_iommu_force_isolation;
+
+/* Max levels of glxval supported */
+extern int amd_iommu_max_glx_val;
+
/* takes bus and device/function and returns the device id
* FIXME: should that be in generic PCI code? */
static inline u16 calc_devid(u8 bus, u8 devfn)
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
new file mode 100644
index 00000000000..8add9f125d3
--- /dev/null
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -0,0 +1,994 @@
+/*
+ * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/amd-iommu.h>
+#include <linux/mm_types.h>
+#include <linux/profile.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/iommu.h>
+#include <linux/wait.h>
+#include <linux/pci.h>
+#include <linux/gfp.h>
+
+#include "amd_iommu_types.h"
+#include "amd_iommu_proto.h"
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joerg Roedel <joerg.roedel@amd.com>");
+
+#define MAX_DEVICES 0x10000
+#define PRI_QUEUE_SIZE 512
+
+struct pri_queue {
+ atomic_t inflight;
+ bool finish;
+ int status;
+};
+
+struct pasid_state {
+ struct list_head list; /* For global state-list */
+ atomic_t count; /* Reference count */
+ struct task_struct *task; /* Task bound to this PASID */
+ struct mm_struct *mm; /* mm_struct for the faults */
+ struct mmu_notifier mn; /* mmu_otifier handle */
+ struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */
+ struct device_state *device_state; /* Link to our device_state */
+ int pasid; /* PASID index */
+ spinlock_t lock; /* Protect pri_queues */
+ wait_queue_head_t wq; /* To wait for count == 0 */
+};
+
+struct device_state {
+ atomic_t count;
+ struct pci_dev *pdev;
+ struct pasid_state **states;
+ struct iommu_domain *domain;
+ int pasid_levels;
+ int max_pasids;
+ amd_iommu_invalid_ppr_cb inv_ppr_cb;
+ amd_iommu_invalidate_ctx inv_ctx_cb;
+ spinlock_t lock;
+ wait_queue_head_t wq;
+};
+
+struct fault {
+ struct work_struct work;
+ struct device_state *dev_state;
+ struct pasid_state *state;
+ struct mm_struct *mm;
+ u64 address;
+ u16 devid;
+ u16 pasid;
+ u16 tag;
+ u16 finish;
+ u16 flags;
+};
+
+struct device_state **state_table;
+static spinlock_t state_lock;
+
+/* List and lock for all pasid_states */
+static LIST_HEAD(pasid_state_list);
+static DEFINE_SPINLOCK(ps_lock);
+
+static struct workqueue_struct *iommu_wq;
+
+/*
+ * Empty page table - Used between
+ * mmu_notifier_invalidate_range_start and
+ * mmu_notifier_invalidate_range_end
+ */
+static u64 *empty_page_table;
+
+static void free_pasid_states(struct device_state *dev_state);
+static void unbind_pasid(struct device_state *dev_state, int pasid);
+static int task_exit(struct notifier_block *nb, unsigned long e, void *data);
+
+static u16 device_id(struct pci_dev *pdev)
+{
+ u16 devid;
+
+ devid = pdev->bus->number;
+ devid = (devid << 8) | pdev->devfn;
+
+ return devid;
+}
+
+static struct device_state *get_device_state(u16 devid)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ dev_state = state_table[devid];
+ if (dev_state != NULL)
+ atomic_inc(&dev_state->count);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return dev_state;
+}
+
+static void free_device_state(struct device_state *dev_state)
+{
+ /*
+ * First detach device from domain - No more PRI requests will arrive
+ * from that device after it is unbound from the IOMMUv2 domain.
+ */
+ iommu_detach_device(dev_state->domain, &dev_state->pdev->dev);
+
+ /* Everything is down now, free the IOMMUv2 domain */
+ iommu_domain_free(dev_state->domain);
+
+ /* Finally get rid of the device-state */
+ kfree(dev_state);
+}
+
+static void put_device_state(struct device_state *dev_state)
+{
+ if (atomic_dec_and_test(&dev_state->count))
+ wake_up(&dev_state->wq);
+}
+
+static void put_device_state_wait(struct device_state *dev_state)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&dev_state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (!atomic_dec_and_test(&dev_state->count))
+ schedule();
+ finish_wait(&dev_state->wq, &wait);
+
+ free_device_state(dev_state);
+}
+
+static struct notifier_block profile_nb = {
+ .notifier_call = task_exit,
+};
+
+static void link_pasid_state(struct pasid_state *pasid_state)
+{
+ spin_lock(&ps_lock);
+ list_add_tail(&pasid_state->list, &pasid_state_list);
+ spin_unlock(&ps_lock);
+}
+
+static void __unlink_pasid_state(struct pasid_state *pasid_state)
+{
+ list_del(&pasid_state->list);
+}
+
+static void unlink_pasid_state(struct pasid_state *pasid_state)
+{
+ spin_lock(&ps_lock);
+ __unlink_pasid_state(pasid_state);
+ spin_unlock(&ps_lock);
+}
+
+/* Must be called under dev_state->lock */
+static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
+ int pasid, bool alloc)
+{
+ struct pasid_state **root, **ptr;
+ int level, index;
+
+ level = dev_state->pasid_levels;
+ root = dev_state->states;
+
+ while (true) {
+
+ index = (pasid >> (9 * level)) & 0x1ff;
+ ptr = &root[index];
+
+ if (level == 0)
+ break;
+
+ if (*ptr == NULL) {
+ if (!alloc)
+ return NULL;
+
+ *ptr = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (*ptr == NULL)
+ return NULL;
+ }
+
+ root = (struct pasid_state **)*ptr;
+ level -= 1;
+ }
+
+ return ptr;
+}
+
+static int set_pasid_state(struct device_state *dev_state,
+ struct pasid_state *pasid_state,
+ int pasid)
+{
+ struct pasid_state **ptr;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+ ret = -ENOMEM;
+ if (ptr == NULL)
+ goto out_unlock;
+
+ ret = -ENOMEM;
+ if (*ptr != NULL)
+ goto out_unlock;
+
+ *ptr = pasid_state;
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+
+ return ret;
+}
+
+static void clear_pasid_state(struct device_state *dev_state, int pasid)
+{
+ struct pasid_state **ptr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+
+ if (ptr == NULL)
+ goto out_unlock;
+
+ *ptr = NULL;
+
+out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+}
+
+static struct pasid_state *get_pasid_state(struct device_state *dev_state,
+ int pasid)
+{
+ struct pasid_state **ptr, *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_state->lock, flags);
+ ptr = __get_pasid_state_ptr(dev_state, pasid, false);
+
+ if (ptr == NULL)
+ goto out_unlock;
+
+ ret = *ptr;
+ if (ret)
+ atomic_inc(&ret->count);
+
+out_unlock:
+ spin_unlock_irqrestore(&dev_state->lock, flags);
+
+ return ret;
+}
+
+static void free_pasid_state(struct pasid_state *pasid_state)
+{
+ kfree(pasid_state);
+}
+
+static void put_pasid_state(struct pasid_state *pasid_state)
+{
+ if (atomic_dec_and_test(&pasid_state->count)) {
+ put_device_state(pasid_state->device_state);
+ wake_up(&pasid_state->wq);
+ }
+}
+
+static void put_pasid_state_wait(struct pasid_state *pasid_state)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE);
+
+ if (atomic_dec_and_test(&pasid_state->count))
+ put_device_state(pasid_state->device_state);
+ else
+ schedule();
+
+ finish_wait(&pasid_state->wq, &wait);
+ mmput(pasid_state->mm);
+ free_pasid_state(pasid_state);
+}
+
+static void __unbind_pasid(struct pasid_state *pasid_state)
+{
+ struct iommu_domain *domain;
+
+ domain = pasid_state->device_state->domain;
+
+ amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);
+ clear_pasid_state(pasid_state->device_state, pasid_state->pasid);
+
+ /* Make sure no more pending faults are in the queue */
+ flush_workqueue(iommu_wq);
+
+ mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+ put_pasid_state(pasid_state); /* Reference taken in bind() function */
+}
+
+static void unbind_pasid(struct device_state *dev_state, int pasid)
+{
+ struct pasid_state *pasid_state;
+
+ pasid_state = get_pasid_state(dev_state, pasid);
+ if (pasid_state == NULL)
+ return;
+
+ unlink_pasid_state(pasid_state);
+ __unbind_pasid(pasid_state);
+ put_pasid_state_wait(pasid_state); /* Reference taken in this function */
+}
+
+static void free_pasid_states_level1(struct pasid_state **tbl)
+{
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (tbl[i] == NULL)
+ continue;
+
+ free_page((unsigned long)tbl[i]);
+ }
+}
+
+static void free_pasid_states_level2(struct pasid_state **tbl)
+{
+ struct pasid_state **ptr;
+ int i;
+
+ for (i = 0; i < 512; ++i) {
+ if (tbl[i] == NULL)
+ continue;
+
+ ptr = (struct pasid_state **)tbl[i];
+ free_pasid_states_level1(ptr);
+ }
+}
+
+static void free_pasid_states(struct device_state *dev_state)
+{
+ struct pasid_state *pasid_state;
+ int i;
+
+ for (i = 0; i < dev_state->max_pasids; ++i) {
+ pasid_state = get_pasid_state(dev_state, i);
+ if (pasid_state == NULL)
+ continue;
+
+ put_pasid_state(pasid_state);
+ unbind_pasid(dev_state, i);
+ }
+
+ if (dev_state->pasid_levels == 2)
+ free_pasid_states_level2(dev_state->states);
+ else if (dev_state->pasid_levels == 1)
+ free_pasid_states_level1(dev_state->states);
+ else if (dev_state->pasid_levels != 0)
+ BUG();
+
+ free_page((unsigned long)dev_state->states);
+}
+
+static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
+{
+ return container_of(mn, struct pasid_state, mn);
+}
+
+static void __mn_flush_page(struct mmu_notifier *mn,
+ unsigned long address)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+
+ pasid_state = mn_to_state(mn);
+ dev_state = pasid_state->device_state;
+
+ amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address);
+}
+
+static int mn_clear_flush_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ __mn_flush_page(mn, address);
+
+ return 0;
+}
+
+static void mn_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ __mn_flush_page(mn, address);
+}
+
+static void mn_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ __mn_flush_page(mn, address);
+}
+
+static void mn_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+
+ pasid_state = mn_to_state(mn);
+ dev_state = pasid_state->device_state;
+
+ amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid,
+ __pa(empty_page_table));
+}
+
+static void mn_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+
+ pasid_state = mn_to_state(mn);
+ dev_state = pasid_state->device_state;
+
+ amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid,
+ __pa(pasid_state->mm->pgd));
+}
+
+static struct mmu_notifier_ops iommu_mn = {
+ .clear_flush_young = mn_clear_flush_young,
+ .change_pte = mn_change_pte,
+ .invalidate_page = mn_invalidate_page,
+ .invalidate_range_start = mn_invalidate_range_start,
+ .invalidate_range_end = mn_invalidate_range_end,
+};
+
+static void set_pri_tag_status(struct pasid_state *pasid_state,
+ u16 tag, int status)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ pasid_state->pri[tag].status = status;
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+}
+
+static void finish_pri_tag(struct device_state *dev_state,
+ struct pasid_state *pasid_state,
+ u16 tag)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
+ pasid_state->pri[tag].finish) {
+ amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
+ pasid_state->pri[tag].status, tag);
+ pasid_state->pri[tag].finish = false;
+ pasid_state->pri[tag].status = PPR_SUCCESS;
+ }
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+}
+
+static void do_fault(struct work_struct *work)
+{
+ struct fault *fault = container_of(work, struct fault, work);
+ int npages, write;
+ struct page *page;
+
+ write = !!(fault->flags & PPR_FAULT_WRITE);
+
+ npages = get_user_pages(fault->state->task, fault->state->mm,
+ fault->address, 1, write, 0, &page, NULL);
+
+ if (npages == 1) {
+ put_page(page);
+ } else if (fault->dev_state->inv_ppr_cb) {
+ int status;
+
+ status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
+ fault->pasid,
+ fault->address,
+ fault->flags);
+ switch (status) {
+ case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
+ set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
+ break;
+ case AMD_IOMMU_INV_PRI_RSP_INVALID:
+ set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+ break;
+ case AMD_IOMMU_INV_PRI_RSP_FAIL:
+ set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
+ break;
+ default:
+ BUG();
+ }
+ } else {
+ set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+ }
+
+ finish_pri_tag(fault->dev_state, fault->state, fault->tag);
+
+ put_pasid_state(fault->state);
+
+ kfree(fault);
+}
+
+static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
+{
+ struct amd_iommu_fault *iommu_fault;
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ unsigned long flags;
+ struct fault *fault;
+ bool finish;
+ u16 tag;
+ int ret;
+
+ iommu_fault = data;
+ tag = iommu_fault->tag & 0x1ff;
+ finish = (iommu_fault->tag >> 9) & 1;
+
+ ret = NOTIFY_DONE;
+ dev_state = get_device_state(iommu_fault->device_id);
+ if (dev_state == NULL)
+ goto out;
+
+ pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
+ if (pasid_state == NULL) {
+ /* We know the device but not the PASID -> send INVALID */
+ amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
+ PPR_INVALID, tag);
+ goto out_drop_state;
+ }
+
+ spin_lock_irqsave(&pasid_state->lock, flags);
+ atomic_inc(&pasid_state->pri[tag].inflight);
+ if (finish)
+ pasid_state->pri[tag].finish = true;
+ spin_unlock_irqrestore(&pasid_state->lock, flags);
+
+ fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
+ if (fault == NULL) {
+ /* We are OOM - send success and let the device re-fault */
+ finish_pri_tag(dev_state, pasid_state, tag);
+ goto out_drop_state;
+ }
+
+ fault->dev_state = dev_state;
+ fault->address = iommu_fault->address;
+ fault->state = pasid_state;
+ fault->tag = tag;
+ fault->finish = finish;
+ fault->flags = iommu_fault->flags;
+ INIT_WORK(&fault->work, do_fault);
+
+ queue_work(iommu_wq, &fault->work);
+
+ ret = NOTIFY_OK;
+
+out_drop_state:
+ put_device_state(dev_state);
+
+out:
+ return ret;
+}
+
+static struct notifier_block ppr_nb = {
+ .notifier_call = ppr_notifier,
+};
+
+static int task_exit(struct notifier_block *nb, unsigned long e, void *data)
+{
+ struct pasid_state *pasid_state;
+ struct task_struct *task;
+
+ task = data;
+
+ /*
+ * Using this notifier is a hack - but there is no other choice
+ * at the moment. What I really want is a sleeping notifier that
+ * is called when an MM goes down. But such a notifier doesn't
+ * exist yet. The notifier needs to sleep because it has to make
+ * sure that the device does not use the PASID and the address
+ * space anymore before it is destroyed. This includes waiting
+ * for pending PRI requests to pass the workqueue. The
+ * MMU-Notifiers would be a good fit, but they use RCU and so
+ * they are not allowed to sleep. Lets see how we can solve this
+ * in a more intelligent way in the future.
+ */
+again:
+ spin_lock(&ps_lock);
+ list_for_each_entry(pasid_state, &pasid_state_list, list) {
+ struct device_state *dev_state;
+ int pasid;
+
+ if (pasid_state->task != task)
+ continue;
+
+ /* Drop Lock and unbind */
+ spin_unlock(&ps_lock);
+
+ dev_state = pasid_state->device_state;
+ pasid = pasid_state->pasid;
+
+ if (pasid_state->device_state->inv_ctx_cb)
+ dev_state->inv_ctx_cb(dev_state->pdev, pasid);
+
+ unbind_pasid(dev_state, pasid);
+
+ /* Task may be in the list multiple times */
+ goto again;
+ }
+ spin_unlock(&ps_lock);
+
+ return NOTIFY_OK;
+}
+
+int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
+ struct task_struct *task)
+{
+ struct pasid_state *pasid_state;
+ struct device_state *dev_state;
+ u16 devid;
+ int ret;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+ dev_state = get_device_state(devid);
+
+ if (dev_state == NULL)
+ return -EINVAL;
+
+ ret = -EINVAL;
+ if (pasid < 0 || pasid >= dev_state->max_pasids)
+ goto out;
+
+ ret = -ENOMEM;
+ pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
+ if (pasid_state == NULL)
+ goto out;
+
+ atomic_set(&pasid_state->count, 1);
+ init_waitqueue_head(&pasid_state->wq);
+ pasid_state->task = task;
+ pasid_state->mm = get_task_mm(task);
+ pasid_state->device_state = dev_state;
+ pasid_state->pasid = pasid;
+ pasid_state->mn.ops = &iommu_mn;
+
+ if (pasid_state->mm == NULL)
+ goto out_free;
+
+ mmu_notifier_register(&pasid_state->mn, pasid_state->mm);
+
+ ret = set_pasid_state(dev_state, pasid_state, pasid);
+ if (ret)
+ goto out_unregister;
+
+ ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
+ __pa(pasid_state->mm->pgd));
+ if (ret)
+ goto out_clear_state;
+
+ link_pasid_state(pasid_state);
+
+ return 0;
+
+out_clear_state:
+ clear_pasid_state(dev_state, pasid);
+
+out_unregister:
+ mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+out_free:
+ free_pasid_state(pasid_state);
+
+out:
+ put_device_state(dev_state);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_bind_pasid);
+
+void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
+{
+ struct device_state *dev_state;
+ u16 devid;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ devid = device_id(pdev);
+ dev_state = get_device_state(devid);
+ if (dev_state == NULL)
+ return;
+
+ if (pasid < 0 || pasid >= dev_state->max_pasids)
+ goto out;
+
+ unbind_pasid(dev_state, pasid);
+
+out:
+ put_device_state(dev_state);
+}
+EXPORT_SYMBOL(amd_iommu_unbind_pasid);
+
+int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+ int ret, tmp;
+ u16 devid;
+
+ might_sleep();
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ if (pasids <= 0 || pasids > (PASID_MASK + 1))
+ return -EINVAL;
+
+ devid = device_id(pdev);
+
+ dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
+ if (dev_state == NULL)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_state->lock);
+ init_waitqueue_head(&dev_state->wq);
+ dev_state->pdev = pdev;
+
+ tmp = pasids;
+ for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
+ dev_state->pasid_levels += 1;
+
+ atomic_set(&dev_state->count, 1);
+ dev_state->max_pasids = pasids;
+
+ ret = -ENOMEM;
+ dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
+ if (dev_state->states == NULL)
+ goto out_free_dev_state;
+
+ dev_state->domain = iommu_domain_alloc(&pci_bus_type);
+ if (dev_state->domain == NULL)
+ goto out_free_states;
+
+ amd_iommu_domain_direct_map(dev_state->domain);
+
+ ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
+ if (ret)
+ goto out_free_domain;
+
+ ret = iommu_attach_device(dev_state->domain, &pdev->dev);
+ if (ret != 0)
+ goto out_free_domain;
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ if (state_table[devid] != NULL) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ ret = -EBUSY;
+ goto out_free_domain;
+ }
+
+ state_table[devid] = dev_state;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return 0;
+
+out_free_domain:
+ iommu_domain_free(dev_state->domain);
+
+out_free_states:
+ free_page((unsigned long)dev_state->states);
+
+out_free_dev_state:
+ kfree(dev_state);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_init_device);
+
+void amd_iommu_free_device(struct pci_dev *pdev)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+
+ if (!amd_iommu_v2_supported())
+ return;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ dev_state = state_table[devid];
+ if (dev_state == NULL) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ return;
+ }
+
+ state_table[devid] = NULL;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ /* Get rid of any remaining pasid states */
+ free_pasid_states(dev_state);
+
+ put_device_state_wait(dev_state);
+}
+EXPORT_SYMBOL(amd_iommu_free_device);
+
+int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
+ amd_iommu_invalid_ppr_cb cb)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+ int ret;
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ ret = -EINVAL;
+ dev_state = state_table[devid];
+ if (dev_state == NULL)
+ goto out_unlock;
+
+ dev_state->inv_ppr_cb = cb;
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
+
+int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
+ amd_iommu_invalidate_ctx cb)
+{
+ struct device_state *dev_state;
+ unsigned long flags;
+ u16 devid;
+ int ret;
+
+ if (!amd_iommu_v2_supported())
+ return -ENODEV;
+
+ devid = device_id(pdev);
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ ret = -EINVAL;
+ dev_state = state_table[devid];
+ if (dev_state == NULL)
+ goto out_unlock;
+
+ dev_state->inv_ctx_cb = cb;
+
+ ret = 0;
+
+out_unlock:
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
+
+static int __init amd_iommu_v2_init(void)
+{
+ size_t state_table_size;
+ int ret;
+
+ pr_info("AMD IOMMUv2 driver by Joerg Roedel <joerg.roedel@amd.com>");
+
+ spin_lock_init(&state_lock);
+
+ state_table_size = MAX_DEVICES * sizeof(struct device_state *);
+ state_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(state_table_size));
+ if (state_table == NULL)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ iommu_wq = create_workqueue("amd_iommu_v2");
+ if (iommu_wq == NULL)
+ goto out_free;
+
+ ret = -ENOMEM;
+ empty_page_table = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (empty_page_table == NULL)
+ goto out_destroy_wq;
+
+ amd_iommu_register_ppr_notifier(&ppr_nb);
+ profile_event_register(PROFILE_TASK_EXIT, &profile_nb);
+
+ return 0;
+
+out_destroy_wq:
+ destroy_workqueue(iommu_wq);
+
+out_free:
+ free_pages((unsigned long)state_table, get_order(state_table_size));
+
+ return ret;
+}
+
+static void __exit amd_iommu_v2_exit(void)
+{
+ struct device_state *dev_state;
+ size_t state_table_size;
+ int i;
+
+ profile_event_unregister(PROFILE_TASK_EXIT, &profile_nb);
+ amd_iommu_unregister_ppr_notifier(&ppr_nb);
+
+ flush_workqueue(iommu_wq);
+
+ /*
+ * The loop below might call flush_workqueue(), so call
+ * destroy_workqueue() after it
+ */
+ for (i = 0; i < MAX_DEVICES; ++i) {
+ dev_state = get_device_state(i);
+
+ if (dev_state == NULL)
+ continue;
+
+ WARN_ON_ONCE(1);
+
+ put_device_state(dev_state);
+ amd_iommu_free_device(dev_state->pdev);
+ }
+
+ destroy_workqueue(iommu_wq);
+
+ state_table_size = MAX_DEVICES * sizeof(struct device_state *);
+ free_pages((unsigned long)state_table, get_order(state_table_size));
+
+ free_page((unsigned long)empty_page_table);
+}
+
+module_init(amd_iommu_v2_init);
+module_exit(amd_iommu_v2_exit);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c0c7820d4c4..aea72e11e16 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -78,6 +78,24 @@
#define LEVEL_STRIDE (9)
#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size is an order of a 4KiB page and that the
+ * mapping has natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are an order of 4KiB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
+
static inline int agaw_to_level(int agaw)
{
return agaw + 2;
@@ -405,6 +423,9 @@ int dmar_disabled = 0;
int dmar_disabled = 1;
#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
+int intel_iommu_enabled = 0;
+EXPORT_SYMBOL_GPL(intel_iommu_enabled);
+
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
@@ -3524,7 +3545,7 @@ found:
return 0;
}
-int dmar_parse_rmrr_atsr_dev(void)
+int __init dmar_parse_rmrr_atsr_dev(void)
{
struct dmar_rmrr_unit *rmrr, *rmrr_n;
struct dmar_atsr_unit *atsr, *atsr_n;
@@ -3647,6 +3668,8 @@ int __init intel_iommu_init(void)
bus_register_notifier(&pci_bus_type, &device_nb);
+ intel_iommu_enabled = 1;
+
return 0;
}
@@ -3979,12 +4002,11 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
static int intel_iommu_map(struct iommu_domain *domain,
unsigned long iova, phys_addr_t hpa,
- int gfp_order, int iommu_prot)
+ size_t size, int iommu_prot)
{
struct dmar_domain *dmar_domain = domain->priv;
u64 max_addr;
int prot = 0;
- size_t size;
int ret;
if (iommu_prot & IOMMU_READ)
@@ -3994,7 +4016,6 @@ static int intel_iommu_map(struct iommu_domain *domain,
if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
prot |= DMA_PTE_SNP;
- size = PAGE_SIZE << gfp_order;
max_addr = iova + size;
if (dmar_domain->max_addr < max_addr) {
u64 end;
@@ -4017,11 +4038,10 @@ static int intel_iommu_map(struct iommu_domain *domain,
return ret;
}
-static int intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, int gfp_order)
+static size_t intel_iommu_unmap(struct iommu_domain *domain,
+ unsigned long iova, size_t size)
{
struct dmar_domain *dmar_domain = domain->priv;
- size_t size = PAGE_SIZE << gfp_order;
int order;
order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
@@ -4030,7 +4050,7 @@ static int intel_iommu_unmap(struct iommu_domain *domain,
if (dmar_domain->max_addr == iova + size)
dmar_domain->max_addr = iova;
- return order;
+ return PAGE_SIZE << order;
}
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
@@ -4069,6 +4089,7 @@ static struct iommu_ops intel_iommu_ops = {
.unmap = intel_iommu_unmap,
.iova_to_phys = intel_iommu_iova_to_phys,
.domain_has_cap = intel_iommu_domain_has_cap,
+ .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
};
static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
index 07c9f189f31..6777ca04947 100644
--- a/drivers/iommu/intr_remapping.c
+++ b/drivers/iommu/intr_remapping.c
@@ -773,7 +773,7 @@ int __init parse_ioapics_under_ir(void)
return ir_supported;
}
-int ir_dev_scope_init(void)
+int __init ir_dev_scope_init(void)
{
if (!intr_remapping_enabled)
return 0;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 2fb2963df55..14d6f908196 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -16,6 +16,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/bug.h>
@@ -90,7 +92,7 @@ struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
if (bus == NULL || bus->iommu_ops == NULL)
return NULL;
- domain = kmalloc(sizeof(*domain), GFP_KERNEL);
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
return NULL;
@@ -157,32 +159,125 @@ int iommu_domain_has_cap(struct iommu_domain *domain,
EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
int iommu_map(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, int gfp_order, int prot)
+ phys_addr_t paddr, size_t size, int prot)
{
- size_t size;
+ unsigned long orig_iova = iova;
+ unsigned int min_pagesz;
+ size_t orig_size = size;
+ int ret = 0;
if (unlikely(domain->ops->map == NULL))
return -ENODEV;
- size = PAGE_SIZE << gfp_order;
+ /* find out the minimum page size supported */
+ min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap);
+
+ /*
+ * both the virtual address and the physical one, as well as
+ * the size of the mapping, must be aligned (at least) to the
+ * size of the smallest page supported by the hardware
+ */
+ if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
+ pr_err("unaligned: iova 0x%lx pa 0x%lx size 0x%lx min_pagesz "
+ "0x%x\n", iova, (unsigned long)paddr,
+ (unsigned long)size, min_pagesz);
+ return -EINVAL;
+ }
+
+ pr_debug("map: iova 0x%lx pa 0x%lx size 0x%lx\n", iova,
+ (unsigned long)paddr, (unsigned long)size);
+
+ while (size) {
+ unsigned long pgsize, addr_merge = iova | paddr;
+ unsigned int pgsize_idx;
+
+ /* Max page size that still fits into 'size' */
+ pgsize_idx = __fls(size);
+
+ /* need to consider alignment requirements ? */
+ if (likely(addr_merge)) {
+ /* Max page size allowed by both iova and paddr */
+ unsigned int align_pgsize_idx = __ffs(addr_merge);
+
+ pgsize_idx = min(pgsize_idx, align_pgsize_idx);
+ }
+
+ /* build a mask of acceptable page sizes */
+ pgsize = (1UL << (pgsize_idx + 1)) - 1;
+
+ /* throw away page sizes not supported by the hardware */
+ pgsize &= domain->ops->pgsize_bitmap;
- BUG_ON(!IS_ALIGNED(iova | paddr, size));
+ /* make sure we're still sane */
+ BUG_ON(!pgsize);
- return domain->ops->map(domain, iova, paddr, gfp_order, prot);
+ /* pick the biggest page */
+ pgsize_idx = __fls(pgsize);
+ pgsize = 1UL << pgsize_idx;
+
+ pr_debug("mapping: iova 0x%lx pa 0x%lx pgsize %lu\n", iova,
+ (unsigned long)paddr, pgsize);
+
+ ret = domain->ops->map(domain, iova, paddr, pgsize, prot);
+ if (ret)
+ break;
+
+ iova += pgsize;
+ paddr += pgsize;
+ size -= pgsize;
+ }
+
+ /* unroll mapping in case something went wrong */
+ if (ret)
+ iommu_unmap(domain, orig_iova, orig_size - size);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(iommu_map);
-int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
+size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
{
- size_t size;
+ size_t unmapped_page, unmapped = 0;
+ unsigned int min_pagesz;
if (unlikely(domain->ops->unmap == NULL))
return -ENODEV;
- size = PAGE_SIZE << gfp_order;
-
- BUG_ON(!IS_ALIGNED(iova, size));
-
- return domain->ops->unmap(domain, iova, gfp_order);
+ /* find out the minimum page size supported */
+ min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap);
+
+ /*
+ * The virtual address, as well as the size of the mapping, must be
+ * aligned (at least) to the size of the smallest page supported
+ * by the hardware
+ */
+ if (!IS_ALIGNED(iova | size, min_pagesz)) {
+ pr_err("unaligned: iova 0x%lx size 0x%lx min_pagesz 0x%x\n",
+ iova, (unsigned long)size, min_pagesz);
+ return -EINVAL;
+ }
+
+ pr_debug("unmap this: iova 0x%lx size 0x%lx\n", iova,
+ (unsigned long)size);
+
+ /*
+ * Keep iterating until we either unmap 'size' bytes (or more)
+ * or we hit an area that isn't mapped.
+ */
+ while (unmapped < size) {
+ size_t left = size - unmapped;
+
+ unmapped_page = domain->ops->unmap(domain, iova, left);
+ if (!unmapped_page)
+ break;
+
+ pr_debug("unmapped: iova 0x%lx size %lx\n", iova,
+ (unsigned long)unmapped_page);
+
+ iova += unmapped_page;
+ unmapped += unmapped_page;
+ }
+
+ return unmapped;
}
EXPORT_SYMBOL_GPL(iommu_unmap);
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 5865dd2e28f..08a90b88e40 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -42,6 +42,9 @@ __asm__ __volatile__ ( \
#define RCP15_PRRR(reg) MRC(reg, p15, 0, c10, c2, 0)
#define RCP15_NMRR(reg) MRC(reg, p15, 0, c10, c2, 1)
+/* bitmap of the page sizes currently supported */
+#define MSM_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M)
+
static int msm_iommu_tex_class[4];
DEFINE_SPINLOCK(msm_iommu_lock);
@@ -352,7 +355,7 @@ fail:
}
static int msm_iommu_map(struct iommu_domain *domain, unsigned long va,
- phys_addr_t pa, int order, int prot)
+ phys_addr_t pa, size_t len, int prot)
{
struct msm_priv *priv;
unsigned long flags;
@@ -363,7 +366,6 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long va,
unsigned long *sl_pte;
unsigned long sl_offset;
unsigned int pgprot;
- size_t len = 0x1000UL << order;
int ret = 0, tex, sh;
spin_lock_irqsave(&msm_iommu_lock, flags);
@@ -463,8 +465,8 @@ fail:
return ret;
}
-static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va,
- int order)
+static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long va,
+ size_t len)
{
struct msm_priv *priv;
unsigned long flags;
@@ -474,7 +476,6 @@ static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va,
unsigned long *sl_table;
unsigned long *sl_pte;
unsigned long sl_offset;
- size_t len = 0x1000UL << order;
int i, ret = 0;
spin_lock_irqsave(&msm_iommu_lock, flags);
@@ -544,15 +545,12 @@ static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va,
ret = __flush_iotlb(domain);
- /*
- * the IOMMU API requires us to return the order of the unmapped
- * page (on success).
- */
- if (!ret)
- ret = order;
fail:
spin_unlock_irqrestore(&msm_iommu_lock, flags);
- return ret;
+
+ /* the IOMMU API requires us to return how many bytes were unmapped */
+ len = ret ? 0 : len;
+ return len;
}
static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain,
@@ -684,7 +682,8 @@ static struct iommu_ops msm_iommu_ops = {
.map = msm_iommu_map,
.unmap = msm_iommu_unmap,
.iova_to_phys = msm_iommu_iova_to_phys,
- .domain_has_cap = msm_iommu_domain_has_cap
+ .domain_has_cap = msm_iommu_domain_has_cap,
+ .pgsize_bitmap = MSM_IOMMU_PGSIZES,
};
static int __init get_tex_class(int icp, int ocp, int mt, int nos)
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index cbcbf31ec3b..d8edd979d01 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -33,6 +33,9 @@
(__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true); \
__i++)
+/* bitmap of the page sizes currently supported */
+#define OMAP_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M)
+
/**
* struct omap_iommu_domain - omap iommu domain
* @pgtable: the page table
@@ -1011,12 +1014,11 @@ static void iopte_cachep_ctor(void *iopte)
}
static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
- phys_addr_t pa, int order, int prot)
+ phys_addr_t pa, size_t bytes, int prot)
{
struct omap_iommu_domain *omap_domain = domain->priv;
struct omap_iommu *oiommu = omap_domain->iommu_dev;
struct device *dev = oiommu->dev;
- size_t bytes = PAGE_SIZE << order;
struct iotlb_entry e;
int omap_pgsz;
u32 ret, flags;
@@ -1041,19 +1043,16 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
return ret;
}
-static int omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
- int order)
+static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
+ size_t size)
{
struct omap_iommu_domain *omap_domain = domain->priv;
struct omap_iommu *oiommu = omap_domain->iommu_dev;
struct device *dev = oiommu->dev;
- size_t unmap_size;
-
- dev_dbg(dev, "unmapping da 0x%lx order %d\n", da, order);
- unmap_size = iopgtable_clear_entry(oiommu, da);
+ dev_dbg(dev, "unmapping da 0x%lx size %u\n", da, size);
- return unmap_size ? get_order(unmap_size) : -EINVAL;
+ return iopgtable_clear_entry(oiommu, da);
}
static int
@@ -1205,6 +1204,7 @@ static struct iommu_ops omap_iommu_ops = {
.unmap = omap_iommu_unmap,
.iova_to_phys = omap_iommu_iova_to_phys,
.domain_has_cap = omap_iommu_domain_has_cap,
+ .pgsize_bitmap = OMAP_IOMMU_PGSIZES,
};
static int __init omap_iommu_init(void)
diff --git a/drivers/iommu/omap-iovmm.c b/drivers/iommu/omap-iovmm.c
index 23655945e30..2e10c3e0a7a 100644
--- a/drivers/iommu/omap-iovmm.c
+++ b/drivers/iommu/omap-iovmm.c
@@ -413,7 +413,6 @@ static int map_iovm_area(struct iommu_domain *domain, struct iovm_struct *new,
unsigned int i, j;
struct scatterlist *sg;
u32 da = new->da_start;
- int order;
if (!domain || !sgt)
return -EINVAL;
@@ -432,12 +431,10 @@ static int map_iovm_area(struct iommu_domain *domain, struct iovm_struct *new,
if (bytes_to_iopgsz(bytes) < 0)
goto err_out;
- order = get_order(bytes);
-
pr_debug("%s: [%d] %08x %08x(%x)\n", __func__,
i, da, pa, bytes);
- err = iommu_map(domain, da, pa, order, flags);
+ err = iommu_map(domain, da, pa, bytes, flags);
if (err)
goto err_out;
@@ -452,10 +449,9 @@ err_out:
size_t bytes;
bytes = sg->length + sg->offset;
- order = get_order(bytes);
/* ignore failures.. we're already handling one */
- iommu_unmap(domain, da, order);
+ iommu_unmap(domain, da, bytes);
da += bytes;
}
@@ -470,7 +466,8 @@ static void unmap_iovm_area(struct iommu_domain *domain, struct omap_iommu *obj,
size_t total = area->da_end - area->da_start;
const struct sg_table *sgt = area->sgt;
struct scatterlist *sg;
- int i, err;
+ int i;
+ size_t unmapped;
BUG_ON(!sgtable_ok(sgt));
BUG_ON((!total) || !IS_ALIGNED(total, PAGE_SIZE));
@@ -478,13 +475,11 @@ static void unmap_iovm_area(struct iommu_domain *domain, struct omap_iommu *obj,
start = area->da_start;
for_each_sg(sgt->sgl, sg, sgt->nents, i) {
size_t bytes;
- int order;
bytes = sg->length + sg->offset;
- order = get_order(bytes);
- err = iommu_unmap(domain, start, order);
- if (err < 0)
+ unmapped = iommu_unmap(domain, start, bytes);
+ if (unmapped < bytes)
break;
dev_dbg(obj->dev, "%s: unmap %08x(%x) %08x\n",