diff options
Diffstat (limited to 'drivers/pci')
-rw-r--r-- | drivers/pci/Makefile | 3 | ||||
-rw-r--r-- | drivers/pci/dmar.c | 329 | ||||
-rw-r--r-- | drivers/pci/intel-iommu.c | 2271 | ||||
-rw-r--r-- | drivers/pci/intel-iommu.h | 325 | ||||
-rw-r--r-- | drivers/pci/iova.c | 394 | ||||
-rw-r--r-- | drivers/pci/iova.h | 63 | ||||
-rw-r--r-- | drivers/pci/pci.h | 1 | ||||
-rw-r--r-- | drivers/pci/probe.c | 14 | ||||
-rw-r--r-- | drivers/pci/search.c | 34 |
9 files changed, 3434 insertions, 0 deletions
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 006054a4099..55505565073 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -20,6 +20,9 @@ obj-$(CONFIG_PCI_MSI) += msi.o # Build the Hypertransport interrupt support obj-$(CONFIG_HT_IRQ) += htirq.o +# Build Intel IOMMU support +obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o + # # Some architectures use the generic PCI setup functions # diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c new file mode 100644 index 00000000000..5dfdfdac92e --- /dev/null +++ b/drivers/pci/dmar.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * + * This file implements early detection/parsing of DMA Remapping Devices + * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI + * tables. + */ + +#include <linux/pci.h> +#include <linux/dmar.h> + +#undef PREFIX +#define PREFIX "DMAR:" + +/* No locks are needed as DMA remapping hardware unit + * list is constructed at boot time and hotplug of + * these units are not supported by the architecture. + */ +LIST_HEAD(dmar_drhd_units); +LIST_HEAD(dmar_rmrr_units); + +static struct acpi_table_header * __initdata dmar_tbl; + +static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd) +{ + /* + * add INCLUDE_ALL at the tail, so scan the list will find it at + * the very end. + */ + if (drhd->include_all) + list_add_tail(&drhd->list, &dmar_drhd_units); + else + list_add(&drhd->list, &dmar_drhd_units); +} + +static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr) +{ + list_add(&rmrr->list, &dmar_rmrr_units); +} + +static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, + struct pci_dev **dev, u16 segment) +{ + struct pci_bus *bus; + struct pci_dev *pdev = NULL; + struct acpi_dmar_pci_path *path; + int count; + + bus = pci_find_bus(segment, scope->bus); + path = (struct acpi_dmar_pci_path *)(scope + 1); + count = (scope->length - sizeof(struct acpi_dmar_device_scope)) + / sizeof(struct acpi_dmar_pci_path); + + while (count) { + if (pdev) + pci_dev_put(pdev); + /* + * Some BIOSes list non-exist devices in DMAR table, just + * ignore it + */ + if (!bus) { + printk(KERN_WARNING + PREFIX "Device scope bus [%d] not found\n", + scope->bus); + break; + } + pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn)); + if (!pdev) { + printk(KERN_WARNING PREFIX + "Device scope device [%04x:%02x:%02x.%02x] not found\n", + segment, bus->number, path->dev, path->fn); + break; + } + path ++; + count --; + bus = pdev->subordinate; + } + if (!pdev) { + printk(KERN_WARNING PREFIX + "Device scope device [%04x:%02x:%02x.%02x] not found\n", + segment, scope->bus, path->dev, path->fn); + *dev = NULL; + return 0; + } + if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && \ + pdev->subordinate) || (scope->entry_type == \ + ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) { + pci_dev_put(pdev); + printk(KERN_WARNING PREFIX + "Device scope type does not match for %s\n", + pci_name(pdev)); + return -EINVAL; + } + *dev = pdev; + return 0; +} + +static int __init dmar_parse_dev_scope(void *start, void *end, int *cnt, + struct pci_dev ***devices, u16 segment) +{ + struct acpi_dmar_device_scope *scope; + void * tmp = start; + int index; + int ret; + + *cnt = 0; + while (start < end) { + scope = start; + if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT || + scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) + (*cnt)++; + else + printk(KERN_WARNING PREFIX + "Unsupported device scope\n"); + start += scope->length; + } + if (*cnt == 0) + return 0; + + *devices = kcalloc(*cnt, sizeof(struct pci_dev *), GFP_KERNEL); + if (!*devices) + return -ENOMEM; + + start = tmp; + index = 0; + while (start < end) { + scope = start; + if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT || + scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) { + ret = dmar_parse_one_dev_scope(scope, + &(*devices)[index], segment); + if (ret) { + kfree(*devices); + return ret; + } + index ++; + } + start += scope->length; + } + + return 0; +} + +/** + * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition + * structure which uniquely represent one DMA remapping hardware unit + * present in the platform + */ +static int __init +dmar_parse_one_drhd(struct acpi_dmar_header *header) +{ + struct acpi_dmar_hardware_unit *drhd; + struct dmar_drhd_unit *dmaru; + int ret = 0; + static int include_all; + + dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL); + if (!dmaru) + return -ENOMEM; + + drhd = (struct acpi_dmar_hardware_unit *)header; + dmaru->reg_base_addr = drhd->address; + dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */ + + if (!dmaru->include_all) + ret = dmar_parse_dev_scope((void *)(drhd + 1), + ((void *)drhd) + header->length, + &dmaru->devices_cnt, &dmaru->devices, + drhd->segment); + else { + /* Only allow one INCLUDE_ALL */ + if (include_all) { + printk(KERN_WARNING PREFIX "Only one INCLUDE_ALL " + "device scope is allowed\n"); + ret = -EINVAL; + } + include_all = 1; + } + + if (ret || (dmaru->devices_cnt == 0 && !dmaru->include_all)) + kfree(dmaru); + else + dmar_register_drhd_unit(dmaru); + return ret; +} + +static int __init +dmar_parse_one_rmrr(struct acpi_dmar_header *header) +{ + struct acpi_dmar_reserved_memory *rmrr; + struct dmar_rmrr_unit *rmrru; + int ret = 0; + + rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); + if (!rmrru) + return -ENOMEM; + + rmrr = (struct acpi_dmar_reserved_memory *)header; + rmrru->base_address = rmrr->base_address; + rmrru->end_address = rmrr->end_address; + ret = dmar_parse_dev_scope((void *)(rmrr + 1), + ((void *)rmrr) + header->length, + &rmrru->devices_cnt, &rmrru->devices, rmrr->segment); + + if (ret || (rmrru->devices_cnt == 0)) + kfree(rmrru); + else + dmar_register_rmrr_unit(rmrru); + return ret; +} + +static void __init +dmar_table_print_dmar_entry(struct acpi_dmar_header *header) +{ + struct acpi_dmar_hardware_unit *drhd; + struct acpi_dmar_reserved_memory *rmrr; + + switch (header->type) { + case ACPI_DMAR_TYPE_HARDWARE_UNIT: + drhd = (struct acpi_dmar_hardware_unit *)header; + printk (KERN_INFO PREFIX + "DRHD (flags: 0x%08x)base: 0x%016Lx\n", + drhd->flags, drhd->address); + break; + case ACPI_DMAR_TYPE_RESERVED_MEMORY: + rmrr = (struct acpi_dmar_reserved_memory *)header; + + printk (KERN_INFO PREFIX + "RMRR base: 0x%016Lx end: 0x%016Lx\n", + rmrr->base_address, rmrr->end_address); + break; + } +} + +/** + * parse_dmar_table - parses the DMA reporting table + */ +static int __init +parse_dmar_table(void) +{ + struct acpi_table_dmar *dmar; + struct acpi_dmar_header *entry_header; + int ret = 0; + + dmar = (struct acpi_table_dmar *)dmar_tbl; + if (!dmar) + return -ENODEV; + + if (!dmar->width) { + printk (KERN_WARNING PREFIX "Zero: Invalid DMAR haw\n"); + return -EINVAL; + } + + printk (KERN_INFO PREFIX "Host address width %d\n", + dmar->width + 1); + + entry_header = (struct acpi_dmar_header *)(dmar + 1); + while (((unsigned long)entry_header) < + (((unsigned long)dmar) + dmar_tbl->length)) { + dmar_table_print_dmar_entry(entry_header); + + switch (entry_header->type) { + case ACPI_DMAR_TYPE_HARDWARE_UNIT: + ret = dmar_parse_one_drhd(entry_header); + break; + case ACPI_DMAR_TYPE_RESERVED_MEMORY: + ret = dmar_parse_one_rmrr(entry_header); + break; + default: + printk(KERN_WARNING PREFIX + "Unknown DMAR structure type\n"); + ret = 0; /* for forward compatibility */ + break; + } + if (ret) + break; + + entry_header = ((void *)entry_header + entry_header->length); + } + return ret; +} + + +int __init dmar_table_init(void) +{ + + parse_dmar_table(); + if (list_empty(&dmar_drhd_units)) { + printk(KERN_INFO PREFIX "No DMAR devices found\n"); + return -ENODEV; + } + return 0; +} + +/** + * early_dmar_detect - checks to see if the platform supports DMAR devices + */ +int __init early_dmar_detect(void) +{ + acpi_status status = AE_OK; + + /* if we could find DMAR table, then there are DMAR devices */ + status = acpi_get_table(ACPI_SIG_DMAR, 0, + (struct acpi_table_header **)&dmar_tbl); + + if (ACPI_SUCCESS(status) && !dmar_tbl) { + printk (KERN_WARNING PREFIX "Unable to map DMAR\n"); + status = AE_NOT_FOUND; + } + + return (ACPI_SUCCESS(status) ? 1 : 0); +} diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c new file mode 100644 index 00000000000..b3d70310af4 --- /dev/null +++ b/drivers/pci/intel-iommu.c @@ -0,0 +1,2271 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Shaohua Li <shaohua.li@intel.com> + * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + */ + +#include <linux/init.h> +#include <linux/bitmap.h> +#include <linux/slab.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/sysdev.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/dma-mapping.h> +#include <linux/mempool.h> +#include "iova.h" +#include "intel-iommu.h" +#include <asm/proto.h> /* force_iommu in this header in x86-64*/ +#include <asm/cacheflush.h> +#include <asm/iommu.h> +#include "pci.h" + +#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) +#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) + +#define IOAPIC_RANGE_START (0xfee00000) +#define IOAPIC_RANGE_END (0xfeefffff) +#define IOVA_START_ADDR (0x1000) + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 + +#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */ + +#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1) + +static void domain_remove_dev_info(struct dmar_domain *domain); + +static int dmar_disabled; +static int __initdata dmar_map_gfx = 1; +static int dmar_forcedac; + +#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) +static DEFINE_SPINLOCK(device_domain_lock); +static LIST_HEAD(device_domain_list); + +static int __init intel_iommu_setup(char *str) +{ + if (!str) + return -EINVAL; + while (*str) { + if (!strncmp(str, "off", 3)) { + dmar_disabled = 1; + printk(KERN_INFO"Intel-IOMMU: disabled\n"); + } else if (!strncmp(str, "igfx_off", 8)) { + dmar_map_gfx = 0; + printk(KERN_INFO + "Intel-IOMMU: disable GFX device mapping\n"); + } else if (!strncmp(str, "forcedac", 8)) { + printk (KERN_INFO + "Intel-IOMMU: Forcing DAC for PCI devices\n"); + dmar_forcedac = 1; + } + + str += strcspn(str, ","); + while (*str == ',') + str++; + } + return 0; +} +__setup("intel_iommu=", intel_iommu_setup); + +static struct kmem_cache *iommu_domain_cache; +static struct kmem_cache *iommu_devinfo_cache; +static struct kmem_cache *iommu_iova_cache; + +static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep) +{ + unsigned int flags; + void *vaddr; + + /* trying to avoid low memory issues */ + flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC); + current->flags &= (~PF_MEMALLOC | flags); + return vaddr; +} + + +static inline void *alloc_pgtable_page(void) +{ + unsigned int flags; + void *vaddr; + + /* trying to avoid low memory issues */ + flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + vaddr = (void *)get_zeroed_page(GFP_ATOMIC); + current->flags &= (~PF_MEMALLOC | flags); + return vaddr; +} + +static inline void free_pgtable_page(void *vaddr) +{ + free_page((unsigned long)vaddr); +} + +static inline void *alloc_domain_mem(void) +{ + return iommu_kmem_cache_alloc(iommu_domain_cache); +} + +static inline void free_domain_mem(void *vaddr) +{ + kmem_cache_free(iommu_domain_cache, vaddr); +} + +static inline void * alloc_devinfo_mem(void) +{ + return iommu_kmem_cache_alloc(iommu_devinfo_cache); +} + +static inline void free_devinfo_mem(void *vaddr) +{ + kmem_cache_free(iommu_devinfo_cache, vaddr); +} + +struct iova *alloc_iova_mem(void) +{ + return iommu_kmem_cache_alloc(iommu_iova_cache); +} + +void free_iova_mem(struct iova *iova) +{ + kmem_cache_free(iommu_iova_cache, iova); +} + +static inline void __iommu_flush_cache( + struct intel_iommu *iommu, void *addr, int size) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(addr, size); +} + +/* Gets context entry for a given bus and devfn */ +static struct context_entry * device_to_context_entry(struct intel_iommu *iommu, + u8 bus, u8 devfn) +{ + struct root_entry *root; + struct context_entry *context; + unsigned long phy_addr; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + root = &iommu->root_entry[bus]; + context = get_context_addr_from_root(root); + if (!context) { + context = (struct context_entry *)alloc_pgtable_page(); + if (!context) { + spin_unlock_irqrestore(&iommu->lock, flags); + return NULL; + } + __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K); + phy_addr = virt_to_phys((void *)context); + set_root_value(root, phy_addr); + set_root_present(root); + __iommu_flush_cache(iommu, root, sizeof(*root)); + } + spin_unlock_irqrestore(&iommu->lock, flags); + return &context[devfn]; +} + +static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + struct root_entry *root; + struct context_entry *context; + int ret; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + root = &iommu->root_entry[bus]; + context = get_context_addr_from_root(root); + if (!context) { + ret = 0; + goto out; + } + ret = context_present(context[devfn]); +out: + spin_unlock_irqrestore(&iommu->lock, flags); + return ret; +} + +static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + struct root_entry *root; + struct context_entry *context; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + root = &iommu->root_entry[bus]; + context = get_context_addr_from_root(root); + if (context) { + context_clear_entry(context[devfn]); + __iommu_flush_cache(iommu, &context[devfn], \ + sizeof(*context)); + } + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void free_context_table(struct intel_iommu *iommu) +{ + struct root_entry *root; + int i; + unsigned long flags; + struct context_entry *context; + + spin_lock_irqsave(&iommu->lock, flags); + if (!iommu->root_entry) { + goto out; + } + for (i = 0; i < ROOT_ENTRY_NR; i++) { + root = &iommu->root_entry[i]; + context = get_context_addr_from_root(root); + if (context) + free_pgtable_page(context); + } + free_pgtable_page(iommu->root_entry); + iommu->root_entry = NULL; +out: + spin_unlock_irqrestore(&iommu->lock, flags); +} + +/* page table handling */ +#define LEVEL_STRIDE (9) +#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) + +static inline int agaw_to_level(int agaw) +{ + return agaw + 2; +} + +static inline int agaw_to_width(int agaw) +{ + return 30 + agaw * LEVEL_STRIDE; + +} + +static inline int width_to_agaw(int width) +{ + return (width - 30) / LEVEL_STRIDE; +} + +static inline unsigned int level_to_offset_bits(int level) +{ + return (12 + (level - 1) * LEVEL_STRIDE); +} + +static inline int address_level_offset(u64 addr, int level) +{ + return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK); +} + +static inline u64 level_mask(int level) +{ + return ((u64)-1 << level_to_offset_bits(level)); +} + +static inline u64 level_size(int level) +{ + return ((u64)1 << level_to_offset_bits(level)); +} + +static inline u64 align_to_level(u64 addr, int level) +{ + return ((addr + level_size(level) - 1) & level_mask(level)); +} + +static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr) +{ + int addr_width = agaw_to_width(domain->agaw); + struct dma_pte *parent, *pte = NULL; + int level = agaw_to_level(domain->agaw); + int offset; + unsigned long flags; + + BUG_ON(!domain->pgd); + + addr &= (((u64)1) << addr_width) - 1; + parent = domain->pgd; + + spin_lock_irqsave(&domain->mapping_lock, flags); + while (level > 0) { + void *tmp_page; + + offset = address_level_offset(addr, level); + pte = &parent[offset]; + if (level == 1) + break; + + if (!dma_pte_present(*pte)) { + tmp_page = alloc_pgtable_page(); + + if (!tmp_page) { + spin_unlock_irqrestore(&domain->mapping_lock, + flags); + return NULL; + } + __iommu_flush_cache(domain->iommu, tmp_page, + PAGE_SIZE_4K); + dma_set_pte_addr(*pte, virt_to_phys(tmp_page)); + /* + * high level table always sets r/w, last level page + * table control read/write + */ + dma_set_pte_readable(*pte); + dma_set_pte_writable(*pte); + __iommu_flush_cache(domain->iommu, pte, sizeof(*pte)); + } + parent = phys_to_virt(dma_pte_addr(*pte)); + level--; + } + + spin_unlock_irqrestore(&domain->mapping_lock, flags); + return pte; +} + +/* return address's pte at specific level */ +static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr, + int level) +{ + struct dma_pte *parent, *pte = NULL; + int total = agaw_to_level(domain->agaw); + int offset; + + parent = domain->pgd; + while (level <= total) { + offset = address_level_offset(addr, total); + pte = &parent[offset]; + if (level == total) + return pte; + + if (!dma_pte_present(*pte)) + break; + parent = phys_to_virt(dma_pte_addr(*pte)); + total--; + } + return NULL; +} + +/* clear one page's page table */ +static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr) +{ + struct dma_pte *pte = NULL; + + /* get last level pte */ + pte = dma_addr_level_pte(domain, addr, 1); + + if (pte) { + dma_clear_pte(*pte); + __iommu_flush_cache(domain->iommu, pte, sizeof(*pte)); + } +} + +/* clear last level pte, a tlb flush should be followed */ +static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end) +{ + int addr_width = agaw_to_width(domain->agaw); + + start &= (((u64)1) << addr_width) - 1; + end &= (((u64)1) << addr_width) - 1; + /* in case it's partial page */ + start = PAGE_ALIGN_4K(start); + end &= PAGE_MASK_4K; + + /* we don't need lock here, nobody else touches the iova range */ + while (start < end) { + dma_pte_clear_one(domain, start); + start += PAGE_SIZE_4K; + } +} + +/* free page table pages. last level pte should already be cleared */ +static void dma_pte_free_pagetable(struct dmar_domain *domain, + u64 start, u64 end) +{ + int addr_width = agaw_to_width(domain->agaw); + struct dma_pte *pte; + int total = agaw_to_level(domain->agaw); + int level; + u64 tmp; + + start &= (((u64)1) << addr_width) - 1; + end &= (((u64)1) << addr_width) - 1; + + /* we don't need lock here, nobody else touches the iova range */ + level = 2; + while (level <= total) { + tmp = align_to_level(start, level); + if (tmp >= end || (tmp + level_size(level) > end)) + return; + + while (tmp < end) { + pte = dma_addr_level_pte(domain, tmp, level); + if (pte) { + free_pgtable_page( + phys_to_virt(dma_pte_addr(*pte))); + dma_clear_pte(*pte); + __iommu_flush_cache(domain->iommu, + pte, sizeof(*pte)); + } + tmp += level_size(level); + } + level++; + } + /* free pgd */ + if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) { + free_pgtable_page(domain->pgd); + domain->pgd = NULL; + } +} + +/* iommu handling */ +static int iommu_alloc_root_entry(struct intel_iommu *iommu) +{ + struct root_entry *root; + unsigned long flags; + + root = (struct root_entry *)alloc_pgtable_page(); + if (!root) + return -ENOMEM; + + __iommu_flush_cache(iommu, root, PAGE_SIZE_4K); + + spin_lock_irqsave(&iommu->lock, flags); + iommu->root_entry = root; + spin_unlock_irqrestore(&iommu->lock, flags); + + return 0; +} + +#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ +{\ + unsigned long start_time = jiffies;\ + while (1) {\ + sts = op (iommu->reg + offset);\ + if (cond)\ + break;\ + if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\ + panic("DMAR hardware is malfunctioning\n");\ + cpu_relax();\ + }\ +} + +static void iommu_set_root_entry(struct intel_iommu *iommu) +{ + void *addr; + u32 cmd, sts; + unsigned long flag; + + addr = iommu->root_entry; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr)); + + cmd = iommu->gcmd | DMA_GCMD_SRTP; + writel(cmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_RTPS), sts); + + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static void iommu_flush_write_buffer(struct intel_iommu *iommu) +{ + u32 val; + unsigned long flag; + + if (!cap_rwbf(iommu->cap)) + return; + val = iommu->gcmd | DMA_GCMD_WBF; + + spin_lock_irqsave(&iommu->register_lock, flag); + writel(val, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(val & DMA_GSTS_WBFS)), val); + + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static int __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, u64 type, + int non_present_entry_flush) +{ + u64 val = 0; + unsigned long flag; + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if (non_present_entry_flush) { + if (!cap_caching_mode(iommu->cap)) + return 1; + else + did = 0; + } + + switch (type) { + case DMA_CCMD_GLOBAL_INVL: + val = DMA_CCMD_GLOBAL_INVL; + break; + case DMA_CCMD_DOMAIN_INVL: + val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); + break; + case DMA_CCMD_DEVICE_INVL: + val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) + | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); + break; + default: + BUG(); + } + val |= DMA_CCMD_ICC; + + spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, + dmar_readq, (!(val & DMA_CCMD_ICC)), val); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* flush context entry will implictly flush write buffer */ + return 0; +} + +static int inline iommu_flush_context_global(struct intel_iommu *iommu, + int non_present_entry_flush) +{ + return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did, + int non_present_entry_flush) +{ + return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL, + non_present_entry_flush); +} + +static int inline iommu_flush_context_device(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush) +{ + return __iommu_flush_context(iommu, did, source_id, function_mask, + DMA_CCMD_DEVICE_INVL, non_present_entry_flush); +} + +/* return value determine if we need a write buffer flush */ +static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int size_order, u64 type, + int non_present_entry_flush) +{ + int tlb_offset = ecap_iotlb_offset(iommu->ecap); + u64 val = 0, val_iva = 0; + unsigned long flag; + + /* + * In the non-present entry flush case, if hardware doesn't cache + * non-present entry we do nothing and if hardware cache non-present + * entry, we flush entries of domain 0 (the domain id is used to cache + * any non-present entries) + */ + if (non_present_entry_flush) { + if (!cap_caching_mode(iommu->cap)) + return 1; + else + did = 0; + } + + switch (type) { + case DMA_TLB_GLOBAL_FLUSH: + /* global flush doesn't need set IVA_REG */ + val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; + break; + case DMA_TLB_DSI_FLUSH: + val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + break; + case DMA_TLB_PSI_FLUSH: + val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + /* Note: always flush non-leaf currently */ + val_iva = size_order | addr; + break; + default: + BUG(); + } + /* Note: set drain read/write */ +#if 0 + /* + * This is probably to be super secure.. Looks like we can + * ignore it without any impact. + */ + if (cap_read_drain(iommu->cap)) + val |= DMA_TLB_READ_DRAIN; +#endif + if (cap_write_drain(iommu->cap)) + val |= DMA_TLB_WRITE_DRAIN; + + spin_lock_irqsave(&iommu->register_lock, flag); + /* Note: Only uses first TLB reg currently */ + if (val_iva) + dmar_writeq(iommu->reg + tlb_offset, val_iva); + dmar_writeq(iommu->reg + tlb_offset + 8, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, tlb_offset + 8, + dmar_readq, (!(val & DMA_TLB_IVT)), val); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* check IOTLB invalidation granularity */ + if (DMA_TLB_IAIG(val) == 0) + printk(KERN_ERR"IOMMU: flush IOTLB failed\n"); + if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) + pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", + DMA_TLB_IIRG(type), DMA_TLB_IAIG(val)); + /* flush context entry will implictly flush write buffer */ + return 0; +} + +static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu, + int non_present_entry_flush) +{ + return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, + non_present_entry_flush); +} + +static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did, + int non_present_entry_flush) +{ + return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, + non_present_entry_flush); +} + +static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int pages, int non_present_entry_flush) +{ + unsigned int mask; + + BUG_ON(addr & (~PAGE_MASK_4K)); + BUG_ON(pages == 0); + + /* Fallback to domain selective flush if no PSI support */ + if (!cap_pgsel_inv(iommu->cap)) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + /* + * PSI requires page size to be 2 ^ x, and the base address is naturally + * aligned to the size + */ + mask = ilog2(__roundup_pow_of_two(pages)); + /* Fallback to domain selective flush if size is too big */ + if (mask > cap_max_amask_val(iommu->cap)) + return iommu_flush_iotlb_dsi(iommu, did, + non_present_entry_flush); + + return __iommu_flush_iotlb(iommu, did, addr, mask, + DMA_TLB_PSI_FLUSH, non_present_entry_flush); +} + +static int iommu_enable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flags; + + spin_lock_irqsave(&iommu->register_lock, flags); + writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_TES), sts); + + iommu->gcmd |= DMA_GCMD_TE; + spin_unlock_irqrestore(&iommu->register_lock, flags); + return 0; +} + +static int iommu_disable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flag; + + spin_lock_irqsave(&iommu->register_lock, flag); + iommu->gcmd &= ~DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(sts & DMA_GSTS_TES)), sts); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + return 0; +} + +/* iommu interrupt handling. Most stuff are MSI-like. */ + +static char *fault_reason_strings[] = +{ + "Software", + "Present bit in root entry is clear", + "Present bit in context entry is clear", + "Invalid context entry", + "Access beyond MGAW", + "PTE Write access is not set", + "PTE Read access is not set", + "Next page table ptr is invalid", + "Root table address invalid", + "Context table ptr is invalid", + "non-zero reserved fields in RTP", + "non-zero reserved fields in CTP", + "non-zero reserved fields in PTE", + "Unknown" +}; +#define MAX_FAULT_REASON_IDX ARRAY_SIZE(fault_reason_strings) + +char *dmar_get_fault_reason(u8 fault_reason) +{ + if (fault_reason > MAX_FAULT_REASON_IDX) + return fault_reason_strings[MAX_FAULT_REASON_IDX]; + else + return fault_reason_strings[fault_reason]; +} + +void dmar_msi_unmask(unsigned int irq) +{ + struct intel_iommu *iommu = get_irq_data(irq); + unsigned long flag; + + /* unmask it */ + spin_lock_irqsave(&iommu->register_lock, flag); + writel(0, iommu->reg + DMAR_FECTL_REG); + /* Read a reg to force flush the post write */ + readl(iommu->reg + DMAR_FECTL_REG); + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_mask(unsigned int irq) +{ + unsigned long flag; + struct intel_iommu *iommu = get_irq_data(irq); + + /* mask it */ + spin_lock_irqsave(&iommu->register_lock, flag); + writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG); + /* Read a reg to force flush the post write */ + readl(iommu->reg + DMAR_FECTL_REG); + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_write(int irq, struct msi_msg *msg) +{ + struct intel_iommu *iommu = get_irq_data(irq); + unsigned long flag; + + spin_lock_irqsave(&iommu->register_lock, flag); + writel(msg->data, iommu->reg + DMAR_FEDATA_REG); + writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG); + writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG); + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_read(int irq, struct msi_msg *msg) +{ + struct intel_iommu *iommu = get_irq_data(irq); + unsigned long flag; + + spin_lock_irqsave(&iommu->register_lock, flag); + msg->data = readl(iommu->reg + DMAR_FEDATA_REG); + msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG); + msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG); + spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type, + u8 fault_reason, u16 source_id, u64 addr) +{ + char *reason; + + reason = dmar_get_fault_reason(fault_reason); + + printk(KERN_ERR + "DMAR:[%s] Request device [%02x:%02x.%d] " + "fault addr %llx \n" + "DMAR:[fault reason %02d] %s\n", + (type ? "DMA Read" : "DMA Write"), + (source_id >> 8), PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); + return 0; +} + +#define PRIMARY_FAULT_REG_LEN (16) +static irqreturn_t iommu_page_fault(int irq, void *dev_id) +{ + struct intel_iommu *iommu = dev_id; + int reg, fault_index; + u32 fault_status; + unsigned long flag; + + spin_lock_irqsave(&iommu->register_lock, flag); + fault_status = readl(iommu->reg + DMAR_FSTS_REG); + + /* TBD: ignore advanced fault log currently */ + if (!(fault_status & DMA_FSTS_PPF)) + goto clear_overflow; + + fault_index = dma_fsts_fault_record_index(fault_status); + reg = cap_fault_reg_offset(iommu->cap); + while (1) { + u8 fault_reason; + u16 source_id; + u64 guest_addr; + int type; + u32 data; + + /* highest 32 bits */ + data = readl(iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + if (!(data & DMA_FRCD_F)) + break; + + fault_reason = dma_frcd_fault_reason(data); + type = dma_frcd_type(data); + + data = readl(iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN + 8); + source_id = dma_frcd_source_id(data); + + guest_addr = dmar_readq(iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN); + guest_addr = dma_frcd_page_addr(guest_addr); + /* clear the fault */ + writel(DMA_FRCD_F, iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + + iommu_page_fault_do_one(iommu, type, fault_reason, + source_id, guest_addr); + + fault_index++; + if (fault_index > cap_num_fault_regs(iommu->cap)) + fault_index = 0; + spin_lock_irqsave(&iommu->register_lock, flag); + } +clear_overflow: + /* clear primary fault overflow */ + fault_status = readl(iommu->reg + DMAR_FSTS_REG); + if (fault_status & DMA_FSTS_PFO) + writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG); + + spin_unlock_irqrestore(&iommu->register_lock, flag); + return IRQ_HANDLED; +} + +int dmar_set_interrupt(struct intel_iommu *iommu) +{ + int irq, ret; + + irq = create_irq(); + if (!irq) { + printk(KERN_ERR "IOMMU: no free vectors\n"); + return -EINVAL; + } + + set_irq_data(irq, iommu); + iommu->irq = irq; + + ret = arch_setup_dmar_msi(irq); + if (ret) { + set_irq_data(irq, NULL); + iommu->irq = 0; + destroy_irq(irq); + return 0; + } + + /* Force fault register is cleared */ + iommu_page_fault(irq, iommu); + + ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu); + if (ret) + printk(KERN_ERR "IOMMU: can't request irq\n"); + return ret; +} + +static int iommu_init_domains(struct intel_iommu *iommu) +{ + unsigned long ndomains; + unsigned long nlongs; + + ndomains = cap_ndoms(iommu->cap); + pr_debug("Number of Domains supportd <%ld>\n", ndomains); + nlongs = BITS_TO_LONGS(ndomains); + + /* TBD: there might be 64K domains, + * consider other allocation for future chip + */ + iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); + if (!iommu->domain_ids) { + printk(KERN_ERR "Allocating domain id array failed\n"); + return -ENOMEM; + } + iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *), + GFP_KERNEL); + if (!iommu->domains) { + printk(KERN_ERR "Allocating domain array failed\n"); + kfree(iommu->domain_ids); + return -ENOMEM; + } + + /* + * if Caching mode is set, then invalid translations are tagged + * with domainid 0. Hence we need to pre-allocate it. + */ + if (cap_caching_mode(iommu->cap)) + set_bit(0, iommu->domain_ids); + return 0; +} + +static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd) +{ + struct intel_iommu *iommu; + int ret; + int map_size; + u32 ver; + + iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return NULL; + iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K); + if (!iommu->reg) { + printk(KERN_ERR "IOMMU: can't map the region\n"); + goto error; + } + iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); + iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); + + /* the registers might be more than one page */ + map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), + cap_max_fault_reg_offset(iommu->cap)); + map_size = PAGE_ALIGN_4K(map_size); + if (map_size > PAGE_SIZE_4K) { + iounmap(iommu->reg); + iommu->reg = ioremap(drhd->reg_base_addr, map_size); + if (!iommu->reg) { + printk(KERN_ERR "IOMMU: can't map the region\n"); + goto error; + } + } + + ver = readl(iommu->reg + DMAR_VER_REG); + pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n", + drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver), + iommu->cap, iommu->ecap); + ret = iommu_init_domains(iommu); + if (ret) + goto error_unmap; + spin_lock_init(&iommu->lock); + spin_lock_init(&iommu->register_lock); + + drhd->iommu = iommu; + return iommu; +error_unmap: + iounmap(iommu->reg); + iommu->reg = 0; +error: + kfree(iommu); + return NULL; +} + +static void domain_exit(struct dmar_domain *domain); +static void free_iommu(struct intel_iommu *iommu) +{ + struct dmar_domain *domain; + int i; + + if (!iommu) + return; + + i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap)); + for (; i < cap_ndoms(iommu->cap); ) { + domain = iommu->domains[i]; + clear_bit(i, iommu->domain_ids); + domain_exit(domain); + i = find_next_bit(iommu->domain_ids, + cap_ndoms(iommu->cap), i+1); + } + + if (iommu->gcmd & DMA_GCMD_TE) + iommu_disable_translation(iommu); + + if (iommu->irq) { + set_irq_data(iommu->irq, NULL); + /* This will mask the irq */ + free_irq(iommu->irq, iommu); + destroy_irq(iommu->irq); + } + + kfree(iommu->domains); + kfree(iommu->domain_ids); + + /* free context mapping */ + free_context_table(iommu); + + if (iommu->reg) + iounmap(iommu->reg); + kfree(iommu); +} + +static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) +{ + unsigned long num; + unsigned long ndomains; + struct dmar_domain *domain; + unsigned long flags; + + domain = alloc_domain_mem(); + if (!domain) + return NULL; + + ndomains = cap_ndoms(iommu->cap); + + spin_lock_irqsave(&iommu->lock, flags); + num = find_first_zero_bit(iommu->domain_ids, ndomains); + if (num >= ndomains) { + spin_unlock_irqrestore(&iommu->lock, flags); + free_domain_mem(domain); + printk(KERN_ERR "IOMMU: no free domain ids\n"); + return NULL; + } + + set_bit(num, iommu->domain_ids); + domain->id = num; + domain->iommu = iommu; + iommu->domains[num] = domain; + spin_unlock_irqrestore(&iommu->lock, flags); + + return domain; +} + +static void iommu_free_domain(struct dmar_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&domain->iommu->lock, flags); + clear_bit(domain->id, domain->iommu->domain_ids); + spin_unlock_irqrestore(&domain->iommu->lock, flags); +} + +static struct iova_domain reserved_iova_list; + +static void dmar_init_reserved_ranges(void) +{ + struct pci_dev *pdev = NULL; + struct iova *iova; + int i; + u64 addr, size; + + init_iova_domain(&reserved_iova_list); + + /* IOAPIC ranges shouldn't be accessed by DMA */ + iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START), + IOVA_PFN(IOAPIC_RANGE_END)); + if (!iova) + printk(KERN_ERR "Reserve IOAPIC range failed\n"); + + /* Reserve all PCI MMIO to avoid peer-to-peer access */ + for_each_pci_dev(pdev) { + struct resource *r; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + r = &pdev->resource[i]; + if (!r->flags || !(r->flags & IORESOURCE_MEM)) + continue; + addr = r->start; + addr &= PAGE_MASK_4K; + size = r->end - addr; + size = PAGE_ALIGN_4K(size); + iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr), + IOVA_PFN(size + addr) - 1); + if (!iova) + printk(KERN_ERR "Reserve iova failed\n"); + } + } + +} + +static void domain_reserve_special_ranges(struct dmar_domain *domain) +{ + copy_reserved_iova(&reserved_iova_list, &domain->iovad); +} + +static inline int guestwidth_to_adjustwidth(int gaw) +{ + int agaw; + int r = (gaw - 12) % 9; + + if (r == 0) + agaw = gaw; + else + agaw = gaw + 9 - r; + if (agaw > 64) + agaw = 64; + return agaw; +} + +static int domain_init(struct dmar_domain *domain, int guest_width) +{ + struct intel_iommu *iommu; + int adjust_width, agaw; + unsigned long sagaw; + + init_iova_domain(&domain->iovad); + spin_lock_init(&domain->mapping_lock); + + domain_reserve_special_ranges(domain); + + /* calculate AGAW */ + iommu = domain->iommu; + if (guest_width > cap_mgaw(iommu->cap)) + guest_width = cap_mgaw(iommu->cap); + domain->gaw = guest_width; + adjust_width = guestwidth_to_adjustwidth(guest_width); + agaw = width_to_agaw(adjust_width); + sagaw = cap_sagaw(iommu->cap); + if (!test_bit(agaw, &sagaw)) { + /* hardware doesn't support it, choose a bigger one */ + pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw); + agaw = find_next_bit(&sagaw, 5, agaw); + if (agaw >= 5) + return -ENODEV; + } + domain->agaw = agaw; + INIT_LIST_HEAD(&domain->devices); + + /* always allocate the top pgd */ + domain->pgd = (struct dma_pte *)alloc_pgtable_page(); + if (!domain->pgd) + return -ENOMEM; + __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K); + return 0; +} + +static void domain_exit(struct dmar_domain *domain) +{ + u64 end; + + /* Domain 0 is reserved, so dont process it */ + if (!domain) + return; + + domain_remove_dev_info(domain); + /* destroy iovas */ + put_iova_domain(&domain->iovad); + end = DOMAIN_MAX_ADDR(domain->gaw); + end = end & (~PAGE_MASK_4K); + + /* clear ptes */ + dma_pte_clear_range(domain, 0, end); + + /* free page tables */ + dma_pte_free_pagetable(domain, 0, end); + + iommu_free_domain(domain); + free_domain_mem(domain); +} + +static int domain_context_mapping_one(struct dmar_domain *domain, + u8 bus, u8 devfn) +{ + struct context_entry *context; + struct intel_iommu *iommu = domain->iommu; + unsigned long flags; + + pr_debug("Set context mapping for %02x:%02x.%d\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + BUG_ON(!domain->pgd); + context = device_to_context_entry(iommu, bus, devfn); + if (!context) + return -ENOMEM; + spin_lock_irqsave(&iommu->lock, flags); + if (context_present(*context)) { + spin_unlock_irqrestore(&iommu->lock, flags); + return 0; + } + + context_set_domain_id(*context, domain->id); + context_set_address_width(*context, domain->agaw); + context_set_address_root(*context, virt_to_phys(domain->pgd)); + context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL); + context_set_fault_enable(*context); + context_set_present(*context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + + /* it's a non-present to present mapping */ + if (iommu_flush_context_device(iommu, domain->id, + (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1)) + iommu_flush_write_buffer(iommu); + else + iommu_flush_iotlb_dsi(iommu, 0, 0); + spin_unlock_irqrestore(&iommu->lock, flags); + return 0; +} + +static int +domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) +{ + int ret; + struct pci_dev *tmp, *parent; + + ret = domain_context_mapping_one(domain, pdev->bus->number, + pdev->devfn); + if (ret) + return ret; + + /* dependent device mapping */ + tmp = pci_find_upstream_pcie_bridge(pdev); + if (!tmp) + return 0; + /* Secondary interface's bus number and devfn 0 */ + parent = pdev->bus->self; + while (parent != tmp) { + ret = domain_context_mapping_one(domain, parent->bus->number, + parent->devfn); + if (ret) + return ret; + parent = parent->bus->self; + } + if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ + return domain_context_mapping_one(domain, + tmp->subordinate->number, 0); + else /* this is a legacy PCI bridge */ + return domain_context_mapping_one(domain, + tmp->bus->number, tmp->devfn); +} + +static int domain_context_mapped(struct dmar_domain *domain, + struct pci_dev *pdev) +{ + int ret; + struct pci_dev *tmp, *parent; + + ret = device_context_mapped(domain->iommu, + pdev->bus->number, pdev->devfn); + if (!ret) + return ret; + /* dependent device mapping */ + tmp = pci_find_upstream_pcie_bridge(pdev); + if (!tmp) + return ret; + /* Secondary interface's bus number and devfn 0 */ + parent = pdev->bus->self; + while (parent != tmp) { + ret = device_context_mapped(domain->iommu, parent->bus->number, + parent->devfn); + if (!ret) + return ret; + parent = parent->bus->self; + } + if (tmp->is_pcie) + return device_context_mapped(domain->iommu, + tmp->subordinate->number, 0); + else + return device_context_mapped(domain->iommu, + tmp->bus->number, tmp->devfn); +} + +static int +domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, + u64 hpa, size_t size, int prot) +{ + u64 start_pfn, end_pfn; + struct dma_pte *pte; + int index; + + if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) + return -EINVAL; + iova &= PAGE_MASK_4K; + start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K; + end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K; + index = 0; + while (start_pfn < end_pfn) { + pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index); + if (!pte) + return -ENOMEM; + /* We don't need lock here, nobody else + * touches the iova range + */ + BUG_ON(dma_pte_addr(*pte)); + dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K); + dma_set_pte_prot(*pte, prot); + __iommu_flush_cache(domain->iommu, pte, sizeof(*pte)); + start_pfn++; + index++; + } + return 0; +} + +static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn) +{ + clear_context_table(domain->iommu, bus, devfn); + iommu_flush_context_global(domain->iommu, 0); + iommu_flush_iotlb_global(domain->iommu, 0); +} + +static void domain_remove_dev_info(struct dmar_domain *domain) +{ + struct device_domain_info *info; + unsigned long flags; + + spin_lock_irqsave(&device_domain_lock, flags); + while (!list_empty(&domain->devices)) { + info = list_entry(domain->devices.next, + struct device_domain_info, link); + list_del(&info->link); + list_del(&info->global); + if (info->dev) + info->dev->dev.archdata.iommu = NULL; + spin_unlock_irqrestore(&device_domain_lock, flags); + + detach_domain_for_dev(info->domain, info->bus, info->devfn); + free_devinfo_mem(info); + + spin_lock_irqsave(&device_domain_lock, flags); + } + spin_unlock_irqrestore(&device_domain_lock, flags); +} + +/* + * find_domain + * Note: we use struct pci_dev->dev.archdata.iommu stores the info + */ +struct dmar_domain * +find_domain(struct pci_dev *pdev) +{ + struct device_domain_info *info; + + /* No lock here, assumes no domain exit in normal case */ + info = pdev->dev.archdata.iommu; + if (info) + return info->domain; + return NULL; +} + +static int dmar_pci_device_match(struct pci_dev *devices[], int cnt, + struct pci_dev *dev) +{ + int index; + + while (dev) { + for (index = 0; index < cnt; index ++) + if (dev == devices[index]) + return 1; + + /* Check our parent */ + dev = dev->bus->self; + } + + return 0; +} + +static struct dmar_drhd_unit * +dmar_find_matched_drhd_unit(struct pci_dev *dev) +{ + struct dmar_drhd_unit *drhd = NULL; + + list_for_each_entry(drhd, &dmar_drhd_units, list) { + if (drhd->include_all || dmar_pci_device_match(drhd->devices, + drhd->devices_cnt, dev)) + return drhd; + } + + return NULL; +} + +/* domain is initialized */ +static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) +{ + struct dmar_domain *domain, *found = NULL; + struct intel_iommu *iommu; + struct dmar_drhd_unit *drhd; + struct device_domain_info *info, *tmp; + struct pci_dev *dev_tmp; + unsigned long flags; + int bus = 0, devfn = 0; + + domain = find_domain(pdev); + if (domain) + return domain; + + dev_tmp = pci_find_upstream_pcie_bridge(pdev); + if (dev_tmp) { + if (dev_tmp->is_pcie) { + bus = dev_tmp->subordinate->number; + devfn = 0; + } else { + bus = dev_tmp->bus->number; + devfn = dev_tmp->devfn; + } + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &device_domain_list, global) { + if (info->bus == bus && info->devfn == devfn) { + found = info->domain; + break; + } + } + spin_unlock_irqrestore(&device_domain_lock, flags); + /* pcie-pci bridge already has a domain, uses it */ + if (found) { + domain = found; + goto found_domain; + } + } + + /* Allocate new domain for the device */ + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd) { + printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n", + pci_name(pdev)); + return NULL; + } + iommu = drhd->iommu; + + domain = iommu_alloc_domain(iommu); + if (!domain) + goto error; + + if (domain_init(domain, gaw)) { + domain_exit(domain); + goto error; + } + + /* register pcie-to-pci device */ + if (dev_tmp) { + info = alloc_devinfo_mem(); + if (!info) { + domain_exit(domain); + goto error; + } + info->bus = bus; + info->devfn = devfn; + info->dev = NULL; + info->domain = domain; + /* This domain is shared by devices under p2p bridge */ + domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES; + + /* pcie-to-pci bridge already has a domain, uses it */ + found = NULL; + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(tmp, &device_domain_list, global) { + if (tmp->bus == bus && tmp->devfn == devfn) { + found = tmp->domain; + break; + } + } + if (found) { + free_devinfo_mem(info); + domain_exit(domain); + domain = found; + } else { + list_add(&info->link, &domain->devices); + list_add(&info->global, &device_domain_list); + } + spin_unlock_irqrestore(&device_domain_lock, flags); + } + +found_domain: + info = alloc_devinfo_mem(); + if (!info) + goto error; + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->dev = pdev; + info->domain = domain; + spin_lock_irqsave(&device_domain_lock, flags); + /* somebody is fast */ + found = find_domain(pdev); + if (found != NULL) { + spin_unlock_irqrestore(&device_domain_lock, flags); + if (found != domain) { + domain_exit(domain); + domain = found; + } + free_devinfo_mem(info); + return domain; + } + list_add(&info->link, &domain->devices); + list_add(&info->global, &device_domain_list); + pdev->dev.archdata.iommu = info; + spin_unlock_irqrestore(&device_domain_lock, flags); + return domain; +error: + /* recheck it here, maybe others set it */ + return find_domain(pdev); +} + +static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end) +{ + struct dmar_domain *domain; + unsigned long size; + u64 base; + int ret; + + printk(KERN_INFO + "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + /* page table init */ + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); + if (!domain) + return -ENOMEM; + + /* The address might not be aligned */ + base = start & PAGE_MASK_4K; + size = end - base; + size = PAGE_ALIGN_4K(size); + if (!reserve_iova(&domain->iovad, IOVA_PFN(base), + IOVA_PFN(base + size) - 1)) { + printk(KERN_ERR "IOMMU: reserve iova failed\n"); + ret = -ENOMEM; + goto error; + } + + pr_debug("Mapping reserved region %lx@%llx for %s\n", + size, base, pci_name(pdev)); + /* + * RMRR range might have overlap with physical memory range, + * clear it first + */ + dma_pte_clear_range(domain, base, base + size); + + ret = domain_page_mapping(domain, base, base, size, + DMA_PTE_READ|DMA_PTE_WRITE); + if (ret) + goto error; + + /* context entry init */ + ret = domain_context_mapping(domain, pdev); + if (!ret) + return 0; +error: + domain_exit(domain); + return ret; + +} + +static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, + struct pci_dev *pdev) +{ + if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + return 0; + return iommu_prepare_identity_map(pdev, rmrr->base_address, + rmrr->end_address + 1); +} + +#ifdef CONFIG_DMAR_GFX_WA +extern int arch_get_ram_range(int slot, u64 *addr, u64 *size); +static void __init iommu_prepare_gfx_mapping(void) +{ + struct pci_dev *pdev = NULL; + u64 base, size; + int slot; + int ret; + + for_each_pci_dev(pdev) { + if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO || + !IS_GFX_DEVICE(pdev)) + continue; + printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", + pci_name(pdev)); + slot = arch_get_ram_range(0, &base, &size); + while (slot >= 0) { + ret = iommu_prepare_identity_map(pdev, + base, base + size); + if (ret) + goto error; + slot = arch_get_ram_range(slot, &base, &size); + } + continue; +error: + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + } +} +#endif + +#ifdef CONFIG_DMAR_FLOPPY_WA +static inline void iommu_prepare_isa(void) +{ + struct pci_dev *pdev; + int ret; + + pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); + if (!pdev) + return; + + printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n"); + ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); + + if (ret) + printk("IOMMU: Failed to create 0-64M identity map, " + "floppy might not work\n"); + +} +#else +static inline void iommu_prepare_isa(void) +{ + return; +} +#endif /* !CONFIG_DMAR_FLPY_WA */ + +int __init init_dmars(void) +{ + struct dmar_drhd_unit *drhd; + struct dmar_rmrr_unit *rmrr; + struct pci_dev *pdev; + struct intel_iommu *iommu; + int ret, unit = 0; + + /* + * for each drhd + * allocate root + * initialize and program root entry to not present + * endfor + */ + for_each_drhd_unit(drhd) { + if (drhd->ignored) + continue; + iommu = alloc_iommu(drhd); + if (!iommu) { + ret = -ENOMEM; + goto error; + } + + /* + * TBD: + * we could share the same root & context tables + * amoung all IOMMU's. Need to Split it later. + */ + ret = iommu_alloc_root_entry(iommu); + if (ret) { + printk(KERN_ERR "IOMMU: allocate root entry failed\n"); + goto error; + } + } + + /* + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor + */ + for_each_rmrr_units(rmrr) { + int i; + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* some BIOS lists non-exist devices in DMAR table */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR + "IOMMU: mapping reserved region failed\n"); + } + } + + iommu_prepare_gfx_mapping(); + + iommu_prepare_isa(); + + /* + * for each drhd + * enable fault log + * global invalidate context cache + * global invalidate iotlb + * enable translation + */ + for_each_drhd_unit(drhd) { + if (drhd->ignored) + continue; + iommu = drhd->iommu; + sprintf (iommu->name, "dmar%d", unit++); + + iommu_flush_write_buffer(iommu); + + ret = dmar_set_interrupt(iommu); + if (ret) + goto error; + + iommu_set_root_entry(iommu); + + iommu_flush_context_global(iommu, 0); + iommu_flush_iotlb_global(iommu, 0); + + ret = iommu_enable_translation(iommu); + if (ret) + goto error; + } + + return 0; +error: + for_each_drhd_unit(drhd) { + if (drhd->ignored) + continue; + iommu = drhd->iommu; + free_iommu(iommu); + } + return ret; +} + +static inline u64 aligned_size(u64 host_addr, size_t size) +{ + u64 addr; + addr = (host_addr & (~PAGE_MASK_4K)) + size; + return PAGE_ALIGN_4K(addr); +} + +struct iova * +iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end) +{ + struct iova *piova; + + /* Make sure it's in range */ + end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end); + if (!size || (IOVA_START_ADDR + size > end)) + return NULL; + + piova = alloc_iova(&domain->iovad, + size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1); + return piova; +} + +static struct iova * +__intel_alloc_iova(struct device *dev, struct dmar_domain *domain, + size_t size) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct iova *iova = NULL; + + if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) { + iova = iommu_alloc_iova(domain, size, pdev->dma_mask); + } else { + /* + * First try to allocate an io virtual address in + * DMA_32BIT_MASK and if that fails then try allocating + * from higer range + */ + iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK); + if (!iova) + iova = iommu_alloc_iova(domain, size, pdev->dma_mask); + } + + if (!iova) { + printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev)); + return NULL; + } + + return iova; +} + +static struct dmar_domain * +get_valid_domain_for_dev(struct pci_dev *pdev) +{ + struct dmar_domain *domain; + int ret; + + domain = get_domain_for_dev(pdev, + DEFAULT_DOMAIN_ADDRESS_WIDTH); + if (!domain) { + printk(KERN_ERR + "Allocating domain for %s failed", pci_name(pdev)); + return 0; + } + + /* make sure context mapping is ok */ + if (unlikely(!domain_context_mapped(domain, pdev))) { + ret = domain_context_mapping(domain, pdev); + if (ret) { + printk(KERN_ERR + "Domain context map for %s failed", + pci_name(pdev)); + return 0; + } + } + + return domain; +} + +static dma_addr_t intel_map_single(struct device *hwdev, void *addr, + size_t size, int dir) +{ + struct pci_dev *pdev = to_pci_dev(hwdev); + int ret; + struct dmar_domain *domain; + unsigned long start_addr; + struct iova *iova; + int prot = 0; + + BUG_ON(dir == DMA_NONE); + if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + return virt_to_bus(addr); + + domain = get_valid_domain_for_dev(pdev); + if (!domain) + return 0; + + addr = (void *)virt_to_phys(addr); + size = aligned_size((u64)addr, size); + + iova = __intel_alloc_iova(hwdev, domain, size); + if (!iova) + goto error; + + start_addr = iova->pfn_lo << PAGE_SHIFT_4K; + + /* + * Check if DMAR supports zero-length reads on write only + * mappings.. + */ + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ + !cap_zlr(domain->iommu->cap)) + prot |= DMA_PTE_READ; + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + prot |= DMA_PTE_WRITE; + /* + * addr - (addr + size) might be partial page, we should map the whole + * page. Note: if two part of one page are separately mapped, we + * might have two guest_addr mapping to the same host addr, but this + * is not a big problem + */ + ret = domain_page_mapping(domain, start_addr, + ((u64)addr) & PAGE_MASK_4K, size, prot); + if (ret) + goto error; + + pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n", + pci_name(pdev), size, (u64)addr, + size, (u64)start_addr, dir); + + /* it's a non-present to present mapping */ + ret = iommu_flush_iotlb_psi(domain->iommu, domain->id, + start_addr, size >> PAGE_SHIFT_4K, 1); + if (ret) + iommu_flush_write_buffer(domain->iommu); + + return (start_addr + ((u64)addr & (~PAGE_MASK_4K))); + +error: + if (iova) + __free_iova(&domain->iovad, iova); + printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n", + pci_name(pdev), size, (u64)addr, dir); + return 0; +} + +static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, + size_t size, int dir) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct dmar_domain *domain; + unsigned long start_addr; + struct iova *iova; + + if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + return; + domain = find_domain(pdev); + BUG_ON(!domain); + + iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr)); + if (!iova) + return; + + start_addr = iova->pfn_lo << PAGE_SHIFT_4K; + size = aligned_size((u64)dev_addr, size); + + pr_debug("Device %s unmapping: %lx@%llx\n", + pci_name(pdev), size, (u64)start_addr); + + /* clear the whole page */ + dma_pte_clear_range(domain, start_addr, start_addr + size); + /* free page tables */ + dma_pte_free_pagetable(domain, start_addr, start_addr + size); + + if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr, + size >> PAGE_SHIFT_4K, 0)) + iommu_flush_write_buffer(domain->iommu); + + /* free iova */ + __free_iova(&domain->iovad, iova); +} + +static void * intel_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +{ + void *vaddr; + int order; + + size = PAGE_ALIGN_4K(size); + order = get_order(size); + flags &= ~(GFP_DMA | GFP_DMA32); + + vaddr = (void *)__get_free_pages(flags, order); + if (!vaddr) + return NULL; + memset(vaddr, 0, size); + + *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL); + if (*dma_handle) + return vaddr; + free_pages((unsigned long)vaddr, order); + return NULL; +} + +static void intel_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + int order; + + size = PAGE_ALIGN_4K(size); + order = get_order(size); + + intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL); + free_pages((unsigned long)vaddr, order); +} + +#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset) +static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, + int nelems, int dir) +{ + int i; + struct pci_dev *pdev = to_pci_dev(hwdev); + struct dmar_domain *domain; + unsigned long start_addr; + struct iova *iova; + size_t size = 0; + void *addr; + struct scatterlist *sg; + + if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + return; + + domain = find_domain(pdev); + + iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address)); + if (!iova) + return; + for_each_sg(sglist, sg, nelems, i) { + addr = SG_ENT_VIRT_ADDRESS(sg); + size += aligned_size((u64)addr, sg->length); + } + + start_addr = iova->pfn_lo << PAGE_SHIFT_4K; + + /* clear the whole page */ + dma_pte_clear_range(domain, start_addr, start_addr + size); + /* free page tables */ + dma_pte_free_pagetable(domain, start_addr, start_addr + size); + + if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr, + size >> PAGE_SHIFT_4K, 0)) + iommu_flush_write_buffer(domain->iommu); + + /* free iova */ + __free_iova(&domain->iovad, iova); +} + +static int intel_nontranslate_map_sg(struct device *hddev, + struct scatterlist *sglist, int nelems, int dir) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) { + BUG_ON(!sg->page); + sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg)); + sg->dma_length = sg->length; + } + return nelems; +} + +static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, + int nelems, int dir) +{ + void *addr; + int i; + struct pci_dev *pdev = to_pci_dev(hwdev); + struct dmar_domain *domain; + size_t size = 0; + int prot = 0; + size_t offset = 0; + struct iova *iova = NULL; + int ret; + struct scatterlist *sg; + unsigned long start_addr; + + BUG_ON(dir == DMA_NONE); + if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir); + + domain = get_valid_domain_for_dev(pdev); + if (!domain) + return 0; + + for_each_sg(sglist, sg, nelems, i) { + addr = SG_ENT_VIRT_ADDRESS(sg); + addr = (void *)virt_to_phys(addr); + size += aligned_size((u64)addr, sg->length); + } + + iova = __intel_alloc_iova(hwdev, domain, size); + if (!iova) { + sglist->dma_length = 0; + return 0; + } + + /* + * Check if DMAR supports zero-length reads on write only + * mappings.. + */ + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ + !cap_zlr(domain->iommu->cap)) + prot |= DMA_PTE_READ; + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + prot |= DMA_PTE_WRITE; + + start_addr = iova->pfn_lo << PAGE_SHIFT_4K; + offset = 0; + for_each_sg(sglist, sg, nelems, i) { + addr = SG_ENT_VIRT_ADDRESS(sg); + addr = (void *)virt_to_phys(addr); + size = aligned_size((u64)addr, sg->length); + ret = domain_page_mapping(domain, start_addr + offset, + ((u64)addr) & PAGE_MASK_4K, + size, prot); + if (ret) { + /* clear the page */ + dma_pte_clear_range(domain, start_addr, + start_addr + offset); + /* free page tables */ + dma_pte_free_pagetable(domain, start_addr, + start_addr + offset); + /* free iova */ + __free_iova(&domain->iovad, iova); + return 0; + } + sg->dma_address = start_addr + offset + + ((u64)addr & (~PAGE_MASK_4K)); + sg->dma_length = sg->length; + offset += size; + } + + /* it's a non-present to present mapping */ + if (iommu_flush_iotlb_psi(domain->iommu, domain->id, + start_addr, offset >> PAGE_SHIFT_4K, 1)) + iommu_flush_write_buffer(domain->iommu); + return nelems; +} + +static struct dma_mapping_ops intel_dma_ops = { + .alloc_coherent = intel_alloc_coherent, + .free_coherent = intel_free_coherent, + .map_single = intel_map_single, + .unmap_single = intel_unmap_single, + .map_sg = intel_map_sg, + .unmap_sg = intel_unmap_sg, +}; + +static inline int iommu_domain_cache_init(void) +{ + int ret = 0; + + iommu_domain_cache = kmem_cache_create("iommu_domain", + sizeof(struct dmar_domain), + 0, + SLAB_HWCACHE_ALIGN, + + NULL); + if (!iommu_domain_cache) { + printk(KERN_ERR "Couldn't create iommu_domain cache\n"); + ret = -ENOMEM; + } + + return ret; +} + +static inline int iommu_devinfo_cache_init(void) +{ + int ret = 0; + + iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", + sizeof(struct device_domain_info), + 0, + SLAB_HWCACHE_ALIGN, + + NULL); + if (!iommu_devinfo_cache) { + printk(KERN_ERR "Couldn't create devinfo cache\n"); + ret = -ENOMEM; + } + + return ret; +} + +static inline int iommu_iova_cache_init(void) +{ + int ret = 0; + + iommu_iova_cache = kmem_cache_create("iommu_iova", + sizeof(struct iova), + 0, + SLAB_HWCACHE_ALIGN, + + NULL); + if (!iommu_iova_cache) { + printk(KERN_ERR "Couldn't create iova cache\n"); + ret = -ENOMEM; + } + + return ret; +} + +static int __init iommu_init_mempool(void) +{ + int ret; + ret = iommu_iova_cache_init(); + if (ret) + return ret; + + ret = iommu_domain_cache_init(); + if (ret) + goto domain_error; + + ret = iommu_devinfo_cache_init(); + if (!ret) + return ret; + + kmem_cache_destroy(iommu_domain_cache); +domain_error: + kmem_cache_destroy(iommu_iova_cache); + + return -ENOMEM; +} + +static void __init iommu_exit_mempool(void) +{ + kmem_cache_destroy(iommu_devinfo_cache); + kmem_cache_destroy(iommu_domain_cache); + kmem_cache_destroy(iommu_iova_cache); + +} + +void __init detect_intel_iommu(void) +{ + if (swiotlb || no_iommu || iommu_detected || dmar_disabled) + return; + if (early_dmar_detect()) { + iommu_detected = 1; + } +} + +static void __init init_no_remapping_devices(void) +{ + struct dmar_drhd_unit *drhd; + + for_each_drhd_unit(drhd) { + if (!drhd->include_all) { + int i; + for (i = 0; i < drhd->devices_cnt; i++) + if (drhd->devices[i] != NULL) + break; + /* ignore DMAR unit if no pci devices exist */ + if (i == drhd->devices_cnt) + drhd->ignored = 1; + } + } + + if (dmar_map_gfx) + return; + + for_each_drhd_unit(drhd) { + int i; + if (drhd->ignored || drhd->include_all) + continue; + + for (i = 0; i < drhd->devices_cnt; i++) + if (drhd->devices[i] && + !IS_GFX_DEVICE(drhd->devices[i])) + break; + + if (i < drhd->devices_cnt) + continue; + + /* bypass IOMMU if it is just for gfx devices */ + drhd->ignored = 1; + for (i = 0; i < drhd->devices_cnt; i++) { + if (!drhd->devices[i]) + continue; + drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; + } + } +} + +int __init intel_iommu_init(void) +{ + int ret = 0; + + if (no_iommu || swiotlb || dmar_disabled) + return -ENODEV; + + if (dmar_table_init()) + return -ENODEV; + + iommu_init_mempool(); + dmar_init_reserved_ranges(); + + init_no_remapping_devices(); + + ret = init_dmars(); + if (ret) { + printk(KERN_ERR "IOMMU: dmar init failed\n"); + put_iova_domain(&reserved_iova_list); + iommu_exit_mempool(); + return ret; + } + printk(KERN_INFO + "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n"); + + force_iommu = 1; + dma_ops = &intel_dma_ops; + return 0; +} + diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h new file mode 100644 index 00000000000..ee88dd2400c --- /dev/null +++ b/drivers/pci/intel-iommu.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) Ashok Raj <ashok.raj@intel.com> + * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + */ + +#ifndef _INTEL_IOMMU_H_ +#define _INTEL_IOMMU_H_ + +#include <linux/types.h> +#include <linux/msi.h> +#include "iova.h" +#include <linux/io.h> + +/* + * Intel IOMMU register specification per version 1.0 public spec. + */ + +#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define DMAR_GCMD_REG 0x18 /* Global command register */ +#define DMAR_GSTS_REG 0x1c /* Global status register */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_CCMD_REG 0x28 /* Context command reg */ +#define DMAR_FSTS_REG 0x34 /* Fault Status register */ +#define DMAR_FECTL_REG 0x38 /* Fault control register */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ +#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ +#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ +#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ +#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ +#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ + +#define OFFSET_STRIDE (9) +/* +#define dmar_readl(dmar, reg) readl(dmar + reg) +#define dmar_readq(dmar, reg) ({ \ + u32 lo, hi; \ + lo = readl(dmar + reg); \ + hi = readl(dmar + reg + 4); \ + (((u64) hi) << 32) + lo; }) +*/ +static inline u64 dmar_readq(void *addr) +{ + u32 lo, hi; + lo = readl(addr); + hi = readl(addr + 4); + return (((u64) hi) << 32) + lo; +} + +static inline void dmar_writeq(void __iomem *addr, u64 val) +{ + writel((u32)val, addr); + writel((u32)(val >> 32), addr + 4); +} + +#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) +#define DMAR_VER_MINOR(v) ((v) & 0x0f) + +/* + * Decoding Capability Register + */ +#define cap_read_drain(c) (((c) >> 55) & 1) +#define cap_write_drain(c) (((c) >> 54) & 1) +#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) +#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) +#define cap_pgsel_inv(c) (((c) >> 39) & 1) + +#define cap_super_page_val(c) (((c) >> 34) & 0xf) +#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ + * OFFSET_STRIDE) + 21) + +#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) +#define cap_max_fault_reg_offset(c) \ + (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16) + +#define cap_zlr(c) (((c) >> 22) & 1) +#define cap_isoch(c) (((c) >> 23) & 1) +#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) +#define cap_sagaw(c) (((c) >> 8) & 0x1f) +#define cap_caching_mode(c) (((c) >> 7) & 1) +#define cap_phmr(c) (((c) >> 6) & 1) +#define cap_plmr(c) (((c) >> 5) & 1) +#define cap_rwbf(c) (((c) >> 4) & 1) +#define cap_afl(c) (((c) >> 3) & 1) +#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7))) +/* + * Extended Capability Register + */ + +#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1) +#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) +#define ecap_max_iotlb_offset(e) \ + (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16) +#define ecap_coherent(e) ((e) & 0x1) + + +/* IOTLB_REG */ +#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) +#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) +#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) +#define DMA_TLB_IIRG(type) ((type >> 60) & 7) +#define DMA_TLB_IAIG(val) (((val) >> 57) & 7) +#define DMA_TLB_READ_DRAIN (((u64)1) << 49) +#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) +#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32) +#define DMA_TLB_IVT (((u64)1) << 63) +#define DMA_TLB_IH_NONLEAF (((u64)1) << 6) +#define DMA_TLB_MAX_SIZE (0x3f) + +/* GCMD_REG */ +#define DMA_GCMD_TE (((u32)1) << 31) +#define DMA_GCMD_SRTP (((u32)1) << 30) +#define DMA_GCMD_SFL (((u32)1) << 29) +#define DMA_GCMD_EAFL (((u32)1) << 28) +#define DMA_GCMD_WBF (((u32)1) << 27) + +/* GSTS_REG */ +#define DMA_GSTS_TES (((u32)1) << 31) +#define DMA_GSTS_RTPS (((u32)1) << 30) +#define DMA_GSTS_FLS (((u32)1) << 29) +#define DMA_GSTS_AFLS (((u32)1) << 28) +#define DMA_GSTS_WBFS (((u32)1) << 27) + +/* CCMD_REG */ +#define DMA_CCMD_ICC (((u64)1) << 63) +#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) +#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) +#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) +#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) +#define DMA_CCMD_MASK_NOBIT 0 +#define DMA_CCMD_MASK_1BIT 1 +#define DMA_CCMD_MASK_2BIT 2 +#define DMA_CCMD_MASK_3BIT 3 +#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) +#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) + +/* FECTL_REG */ +#define DMA_FECTL_IM (((u32)1) << 31) + +/* FSTS_REG */ +#define DMA_FSTS_PPF ((u32)2) +#define DMA_FSTS_PFO ((u32)1) +#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) + +/* FRCD_REG, 32 bits access */ +#define DMA_FRCD_F (((u32)1) << 31) +#define dma_frcd_type(d) ((d >> 30) & 1) +#define dma_frcd_fault_reason(c) (c & 0xff) +#define dma_frcd_source_id(c) (c & 0xffff) +#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */ + +/* + * 0: Present + * 1-11: Reserved + * 12-63: Context Ptr (12 - (haw-1)) + * 64-127: Reserved + */ +struct root_entry { + u64 val; + u64 rsvd1; +}; +#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) +static inline bool root_present(struct root_entry *root) +{ + return (root->val & 1); +} +static inline void set_root_present(struct root_entry *root) +{ + root->val |= 1; +} +static inline void set_root_value(struct root_entry *root, unsigned long value) +{ + root->val |= value & PAGE_MASK_4K; +} + +struct context_entry; +static inline struct context_entry * +get_context_addr_from_root(struct root_entry *root) +{ + return (struct context_entry *) + (root_present(root)?phys_to_virt( + root->val & PAGE_MASK_4K): + NULL); +} + +/* + * low 64 bits: + * 0: present + * 1: fault processing disable + * 2-3: translation type + * 12-63: address space root + * high 64 bits: + * 0-2: address width + * 3-6: aval + * 8-23: domain id + */ +struct context_entry { + u64 lo; + u64 hi; +}; +#define context_present(c) ((c).lo & 1) +#define context_fault_disable(c) (((c).lo >> 1) & 1) +#define context_translation_type(c) (((c).lo >> 2) & 3) +#define context_address_root(c) ((c).lo & PAGE_MASK_4K) +#define context_address_width(c) ((c).hi & 7) +#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1)) + +#define context_set_present(c) do {(c).lo |= 1;} while (0) +#define context_set_fault_enable(c) \ + do {(c).lo &= (((u64)-1) << 2) | 1;} while (0) +#define context_set_translation_type(c, val) \ + do { \ + (c).lo &= (((u64)-1) << 4) | 3; \ + (c).lo |= ((val) & 3) << 2; \ + } while (0) +#define CONTEXT_TT_MULTI_LEVEL 0 +#define context_set_address_root(c, val) \ + do {(c).lo |= (val) & PAGE_MASK_4K;} while (0) +#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0) +#define context_set_domain_id(c, val) \ + do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0) +#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0) + +/* + * 0: readable + * 1: writable + * 2-6: reserved + * 7: super page + * 8-11: available + * 12-63: Host physcial address + */ +struct dma_pte { + u64 val; +}; +#define dma_clear_pte(p) do {(p).val = 0;} while (0) + +#define DMA_PTE_READ (1) +#define DMA_PTE_WRITE (2) + +#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0) +#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0) +#define dma_set_pte_prot(p, prot) \ + do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0) +#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K) +#define dma_set_pte_addr(p, addr) do {\ + (p).val |= ((addr) & PAGE_MASK_4K); } while (0) +#define dma_pte_present(p) (((p).val & 3) != 0) + +struct intel_iommu; + +struct dmar_domain { + int id; /* domain id */ + struct intel_iommu *iommu; /* back pointer to owning iommu */ + + struct list_head devices; /* all devices' list */ + struct iova_domain iovad; /* iova's that belong to this domain */ + + struct dma_pte *pgd; /* virtual address */ + spinlock_t mapping_lock; /* page table lock */ + int gaw; /* max guest address width */ + + /* adjusted guest address width, 0 is level 2 30-bit */ + int agaw; + +#define DOMAIN_FLAG_MULTIPLE_DEVICES 1 + int flags; +}; + +/* PCI domain-device relationship */ +struct device_domain_info { + struct list_head link; /* link to domain siblings */ + struct list_head global; /* link to global list */ + u8 bus; /* PCI bus numer */ + u8 devfn; /* PCI devfn number */ + struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ + struct dmar_domain *domain; /* pointer to domain */ +}; + +extern int init_dmars(void); + +struct intel_iommu { + void __iomem *reg; /* Pointer to hardware regs, virtual addr */ + u64 cap; + u64 ecap; + unsigned long *domain_ids; /* bitmap of domains */ + struct dmar_domain **domains; /* ptr to domains */ + int seg; + u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ + spinlock_t lock; /* protect context, domain ids */ + spinlock_t register_lock; /* protect register handling */ + struct root_entry *root_entry; /* virtual address */ + + unsigned int irq; + unsigned char name[7]; /* Device Name */ + struct msi_msg saved_msg; + struct sys_device sysdev; +}; + +#ifndef CONFIG_DMAR_GFX_WA +static inline void iommu_prepare_gfx_mapping(void) +{ + return; +} +#endif /* !CONFIG_DMAR_GFX_WA */ + +#endif diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c new file mode 100644 index 00000000000..a84571c2936 --- /dev/null +++ b/drivers/pci/iova.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This file is released under the GPLv2. + * + * Copyright (C) 2006 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + */ + +#include "iova.h" + +void +init_iova_domain(struct iova_domain *iovad) +{ + spin_lock_init(&iovad->iova_alloc_lock); + spin_lock_init(&iovad->iova_rbtree_lock); + iovad->rbroot = RB_ROOT; + iovad->cached32_node = NULL; + +} + +static struct rb_node * +__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn) +{ + if ((*limit_pfn != DMA_32BIT_PFN) || + (iovad->cached32_node == NULL)) + return rb_last(&iovad->rbroot); + else { + struct rb_node *prev_node = rb_prev(iovad->cached32_node); + struct iova *curr_iova = + container_of(iovad->cached32_node, struct iova, node); + *limit_pfn = curr_iova->pfn_lo - 1; + return prev_node; + } +} + +static void +__cached_rbnode_insert_update(struct iova_domain *iovad, + unsigned long limit_pfn, struct iova *new) +{ + if (limit_pfn != DMA_32BIT_PFN) + return; + iovad->cached32_node = &new->node; +} + +static void +__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) +{ + struct iova *cached_iova; + struct rb_node *curr; + + if (!iovad->cached32_node) + return; + curr = iovad->cached32_node; + cached_iova = container_of(curr, struct iova, node); + + if (free->pfn_lo >= cached_iova->pfn_lo) + iovad->cached32_node = rb_next(&free->node); +} + +/* Computes the padding size required, to make the + * the start address naturally aligned on its size + */ +static int +iova_get_pad_size(int size, unsigned int limit_pfn) +{ + unsigned int pad_size = 0; + unsigned int order = ilog2(size); + + if (order) + pad_size = (limit_pfn + 1) % (1 << order); + + return pad_size; +} + +static int __alloc_iova_range(struct iova_domain *iovad, unsigned long size, + unsigned long limit_pfn, struct iova *new, bool size_aligned) +{ + struct rb_node *curr = NULL; + unsigned long flags; + unsigned long saved_pfn; + unsigned int pad_size = 0; + + /* Walk the tree backwards */ + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + saved_pfn = limit_pfn; + curr = __get_cached_rbnode(iovad, &limit_pfn); + while (curr) { + struct iova *curr_iova = container_of(curr, struct iova, node); + if (limit_pfn < curr_iova->pfn_lo) + goto move_left; + else if (limit_pfn < curr_iova->pfn_hi) + goto adjust_limit_pfn; + else { + if (size_aligned) + pad_size = iova_get_pad_size(size, limit_pfn); + if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn) + break; /* found a free slot */ + } +adjust_limit_pfn: + limit_pfn = curr_iova->pfn_lo - 1; +move_left: + curr = rb_prev(curr); + } + + if (!curr) { + if (size_aligned) + pad_size = iova_get_pad_size(size, limit_pfn); + if ((IOVA_START_PFN + size + pad_size) > limit_pfn) { + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return -ENOMEM; + } + } + + /* pfn_lo will point to size aligned address if size_aligned is set */ + new->pfn_lo = limit_pfn - (size + pad_size) + 1; + new->pfn_hi = new->pfn_lo + size - 1; + + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return 0; +} + +static void +iova_insert_rbtree(struct rb_root *root, struct iova *iova) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + /* Figure out where to put new node */ + while (*new) { + struct iova *this = container_of(*new, struct iova, node); + parent = *new; + + if (iova->pfn_lo < this->pfn_lo) + new = &((*new)->rb_left); + else if (iova->pfn_lo > this->pfn_lo) + new = &((*new)->rb_right); + else + BUG(); /* this should not happen */ + } + /* Add new node and rebalance tree. */ + rb_link_node(&iova->node, parent, new); + rb_insert_color(&iova->node, root); +} + +/** + * alloc_iova - allocates an iova + * @iovad - iova domain in question + * @size - size of page frames to allocate + * @limit_pfn - max limit address + * @size_aligned - set if size_aligned address range is required + * This function allocates an iova in the range limit_pfn to IOVA_START_PFN + * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned + * flag is set then the allocated address iova->pfn_lo will be naturally + * aligned on roundup_power_of_two(size). + */ +struct iova * +alloc_iova(struct iova_domain *iovad, unsigned long size, + unsigned long limit_pfn, + bool size_aligned) +{ + unsigned long flags; + struct iova *new_iova; + int ret; + + new_iova = alloc_iova_mem(); + if (!new_iova) + return NULL; + + /* If size aligned is set then round the size to + * to next power of two. + */ + if (size_aligned) + size = __roundup_pow_of_two(size); + + spin_lock_irqsave(&iovad->iova_alloc_lock, flags); + ret = __alloc_iova_range(iovad, size, limit_pfn, new_iova, + size_aligned); + + if (ret) { + spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags); + free_iova_mem(new_iova); + return NULL; + } + + /* Insert the new_iova into domain rbtree by holding writer lock */ + spin_lock(&iovad->iova_rbtree_lock); + iova_insert_rbtree(&iovad->rbroot, new_iova); + __cached_rbnode_insert_update(iovad, limit_pfn, new_iova); + spin_unlock(&iovad->iova_rbtree_lock); + + spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags); + + return new_iova; +} + +/** + * find_iova - find's an iova for a given pfn + * @iovad - iova domain in question. + * pfn - page frame number + * This function finds and returns an iova belonging to the + * given doamin which matches the given pfn. + */ +struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) +{ + unsigned long flags; + struct rb_node *node; + + /* Take the lock so that no other thread is manipulating the rbtree */ + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + node = iovad->rbroot.rb_node; + while (node) { + struct iova *iova = container_of(node, struct iova, node); + + /* If pfn falls within iova's range, return iova */ + if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) { + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + /* We are not holding the lock while this iova + * is referenced by the caller as the same thread + * which called this function also calls __free_iova() + * and it is by desing that only one thread can possibly + * reference a particular iova and hence no conflict. + */ + return iova; + } + + if (pfn < iova->pfn_lo) + node = node->rb_left; + else if (pfn > iova->pfn_lo) + node = node->rb_right; + } + + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return NULL; +} + +/** + * __free_iova - frees the given iova + * @iovad: iova domain in question. + * @iova: iova in question. + * Frees the given iova belonging to the giving domain + */ +void +__free_iova(struct iova_domain *iovad, struct iova *iova) +{ + unsigned long flags; + + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + __cached_rbnode_delete_update(iovad, iova); + rb_erase(&iova->node, &iovad->rbroot); + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + free_iova_mem(iova); +} + +/** + * free_iova - finds and frees the iova for a given pfn + * @iovad: - iova domain in question. + * @pfn: - pfn that is allocated previously + * This functions finds an iova for a given pfn and then + * frees the iova from that domain. + */ +void +free_iova(struct iova_domain *iovad, unsigned long pfn) +{ + struct iova *iova = find_iova(iovad, pfn); + if (iova) + __free_iova(iovad, iova); + +} + +/** + * put_iova_domain - destroys the iova doamin + * @iovad: - iova domain in question. + * All the iova's in that domain are destroyed. + */ +void put_iova_domain(struct iova_domain *iovad) +{ + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + node = rb_first(&iovad->rbroot); + while (node) { + struct iova *iova = container_of(node, struct iova, node); + rb_erase(node, &iovad->rbroot); + free_iova_mem(iova); + node = rb_first(&iovad->rbroot); + } + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); +} + +static int +__is_range_overlap(struct rb_node *node, + unsigned long pfn_lo, unsigned long pfn_hi) +{ + struct iova *iova = container_of(node, struct iova, node); + + if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo)) + return 1; + return 0; +} + +static struct iova * +__insert_new_range(struct iova_domain *iovad, + unsigned long pfn_lo, unsigned long pfn_hi) +{ + struct iova *iova; + + iova = alloc_iova_mem(); + if (!iova) + return iova; + + iova->pfn_hi = pfn_hi; + iova->pfn_lo = pfn_lo; + iova_insert_rbtree(&iovad->rbroot, iova); + return iova; +} + +static void +__adjust_overlap_range(struct iova *iova, + unsigned long *pfn_lo, unsigned long *pfn_hi) +{ + if (*pfn_lo < iova->pfn_lo) + iova->pfn_lo = *pfn_lo; + if (*pfn_hi > iova->pfn_hi) + *pfn_lo = iova->pfn_hi + 1; +} + +/** + * reserve_iova - reserves an iova in the given range + * @iovad: - iova domain pointer + * @pfn_lo: - lower page frame address + * @pfn_hi:- higher pfn adderss + * This function allocates reserves the address range from pfn_lo to pfn_hi so + * that this address is not dished out as part of alloc_iova. + */ +struct iova * +reserve_iova(struct iova_domain *iovad, + unsigned long pfn_lo, unsigned long pfn_hi) +{ + struct rb_node *node; + unsigned long flags; + struct iova *iova; + unsigned int overlap = 0; + + spin_lock_irqsave(&iovad->iova_alloc_lock, flags); + spin_lock(&iovad->iova_rbtree_lock); + for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) { + if (__is_range_overlap(node, pfn_lo, pfn_hi)) { + iova = container_of(node, struct iova, node); + __adjust_overlap_range(iova, &pfn_lo, &pfn_hi); + if ((pfn_lo >= iova->pfn_lo) && + (pfn_hi <= iova->pfn_hi)) + goto finish; + overlap = 1; + + } else if (overlap) + break; + } + + /* We are here either becasue this is the first reserver node + * or need to insert remaining non overlap addr range + */ + iova = __insert_new_range(iovad, pfn_lo, pfn_hi); +finish: + + spin_unlock(&iovad->iova_rbtree_lock); + spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags); + return iova; +} + +/** + * copy_reserved_iova - copies the reserved between domains + * @from: - source doamin from where to copy + * @to: - destination domin where to copy + * This function copies reserved iova's from one doamin to + * other. + */ +void +copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) +{ + unsigned long flags; + struct rb_node *node; + + spin_lock_irqsave(&from->iova_alloc_lock, flags); + spin_lock(&from->iova_rbtree_lock); + for (node = rb_first(&from->rbroot); node; node = rb_next(node)) { + struct iova *iova = container_of(node, struct iova, node); + struct iova *new_iova; + new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi); + if (!new_iova) + printk(KERN_ERR "Reserve iova range %lx@%lx failed\n", + iova->pfn_lo, iova->pfn_lo); + } + spin_unlock(&from->iova_rbtree_lock); + spin_unlock_irqrestore(&from->iova_alloc_lock, flags); +} diff --git a/drivers/pci/iova.h b/drivers/pci/iova.h new file mode 100644 index 00000000000..ae3028d5a94 --- /dev/null +++ b/drivers/pci/iova.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This file is released under the GPLv2. + * + * Copyright (C) 2006 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * + */ + +#ifndef _IOVA_H_ +#define _IOVA_H_ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/rbtree.h> +#include <linux/dma-mapping.h> + +/* + * We need a fixed PAGE_SIZE of 4K irrespective of + * arch PAGE_SIZE for IOMMU page tables. + */ +#define PAGE_SHIFT_4K (12) +#define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K) +#define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K) +#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K) + +/* IO virtual address start page frame number */ +#define IOVA_START_PFN (1) + +#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT_4K) +#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK) +#define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK) + +/* iova structure */ +struct iova { + struct rb_node node; + unsigned long pfn_hi; /* IOMMU dish out addr hi */ + unsigned long pfn_lo; /* IOMMU dish out addr lo */ +}; + +/* holds all the iova translations for a domain */ +struct iova_domain { + spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */ + spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ + struct rb_root rbroot; /* iova domain rbtree root */ + struct rb_node *cached32_node; /* Save last alloced node */ +}; + +struct iova *alloc_iova_mem(void); +void free_iova_mem(struct iova *iova); +void free_iova(struct iova_domain *iovad, unsigned long pfn); +void __free_iova(struct iova_domain *iovad, struct iova *iova); +struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, + unsigned long limit_pfn, + bool size_aligned); +struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, + unsigned long pfn_hi); +void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); +void init_iova_domain(struct iova_domain *iovad); +struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); +void put_iova_domain(struct iova_domain *iovad); + +#endif diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 6fda33de84e..fc87e14b50d 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -90,3 +90,4 @@ pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) return NULL; } +struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 5db6b6690b5..463a5a9d583 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -837,6 +837,19 @@ static void pci_release_dev(struct device *dev) kfree(pci_dev); } +static void set_pcie_port_type(struct pci_dev *pdev) +{ + int pos; + u16 reg16; + + pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (!pos) + return; + pdev->is_pcie = 1; + pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16); + pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; +} + /** * pci_cfg_space_size - get the configuration space size of the PCI device. * @dev: PCI device @@ -951,6 +964,7 @@ pci_scan_device(struct pci_bus *bus, int devfn) dev->device = (l >> 16) & 0xffff; dev->cfg_size = pci_cfg_space_size(dev); dev->error_state = pci_channel_io_normal; + set_pcie_port_type(dev); /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) set this higher, assuming the system even supports it. */ diff --git a/drivers/pci/search.c b/drivers/pci/search.c index c6e79d01ce3..b001b5922e3 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -14,6 +14,40 @@ #include "pci.h" DECLARE_RWSEM(pci_bus_sem); +/* + * find the upstream PCIE-to-PCI bridge of a PCI device + * if the device is PCIE, return NULL + * if the device isn't connected to a PCIE bridge (that is its parent is a + * legacy PCI bridge and the bridge is directly connected to bus 0), return its + * parent + */ +struct pci_dev * +pci_find_upstream_pcie_bridge(struct pci_dev *pdev) +{ + struct pci_dev *tmp = NULL; + + if (pdev->is_pcie) + return NULL; + while (1) { + if (!pdev->bus->self) + break; + pdev = pdev->bus->self; + /* a p2p bridge */ + if (!pdev->is_pcie) { + tmp = pdev; + continue; + } + /* PCI device should connect to a PCIE bridge */ + if (pdev->pcie_type != PCI_EXP_TYPE_PCI_BRIDGE) { + /* Busted hardware? */ + WARN_ON_ONCE(1); + return NULL; + } + return pdev; + } + + return tmp; +} static struct pci_bus *pci_do_find_bus(struct pci_bus *bus, unsigned char busnr) { |