summary refs log tree commit diff stats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig 83
-rw-r--r-- arch/x86/ia32/ia32entry.S 10
-rw-r--r-- arch/x86/include/asm/amd_iommu.h 35
-rw-r--r-- arch/x86/include/asm/amd_iommu_proto.h 54
-rw-r--r-- arch/x86/include/asm/amd_iommu_types.h 580
-rw-r--r-- arch/x86/include/asm/apb_timer.h 1
-rw-r--r-- arch/x86/include/asm/asm.h 5
-rw-r--r-- arch/x86/include/asm/calling.h 130
-rw-r--r-- arch/x86/include/asm/entry_arch.h 4
-rw-r--r-- arch/x86/include/asm/frame.h 11
-rw-r--r-- arch/x86/include/asm/hw_irq.h 1
-rw-r--r-- arch/x86/include/asm/i8253.h 20
-rw-r--r-- arch/x86/include/asm/irq_vectors.h 5
-rw-r--r-- arch/x86/include/asm/irqflags.h 11
-rw-r--r-- arch/x86/include/asm/lguest_hcall.h 1
-rw-r--r-- arch/x86/include/asm/mce.h 19
-rw-r--r-- arch/x86/include/asm/msr-index.h 3
-rw-r--r-- arch/x86/include/asm/percpu.h 11
-rw-r--r-- arch/x86/include/asm/perf_event.h 5
-rw-r--r-- arch/x86/include/asm/perf_event_p4.h 33
-rw-r--r-- arch/x86/include/asm/prom.h 11
-rw-r--r-- arch/x86/include/asm/rwlock.h 43
-rw-r--r-- arch/x86/include/asm/segment.h 2
-rw-r--r-- arch/x86/include/asm/smpboot_hooks.h 8
-rw-r--r-- arch/x86/include/asm/spinlock.h 37
-rw-r--r-- arch/x86/include/asm/spinlock_types.h 6
-rw-r--r-- arch/x86/include/asm/time.h 6
-rw-r--r-- arch/x86/include/asm/uaccess.h 3
-rw-r--r-- arch/x86/include/asm/xen/pci.h 5
-rw-r--r-- arch/x86/kernel/Makefile 1
-rw-r--r-- arch/x86/kernel/amd_iommu.c 2764
-rw-r--r-- arch/x86/kernel/amd_iommu_init.c 1572
-rw-r--r-- arch/x86/kernel/apb_timer.c 1
-rw-r--r-- arch/x86/kernel/apic/apic.c 5
-rw-r--r-- arch/x86/kernel/apic/io_apic.c 91
-rw-r--r-- arch/x86/kernel/apm_32.c 8
-rw-r--r-- arch/x86/kernel/asm-offsets_32.c 1
-rw-r--r-- arch/x86/kernel/cpu/intel.c 18
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-severity.c 152
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 288
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_amd.c 10
-rw-r--r-- arch/x86/kernel/cpu/mtrr/main.c 182
-rw-r--r-- arch/x86/kernel/cpu/perf_event.c 168
-rw-r--r-- arch/x86/kernel/cpu/perf_event_amd.c 14
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel.c 385
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel_ds.c 4
-rw-r--r-- arch/x86/kernel/cpu/perf_event_p4.c 119
-rw-r--r-- arch/x86/kernel/devicetree.c 60
-rw-r--r-- arch/x86/kernel/dumpstack_64.c 37
-rw-r--r-- arch/x86/kernel/entry_64.S 80
-rw-r--r-- arch/x86/kernel/hpet.c 3
-rw-r--r-- arch/x86/kernel/i8253.c 99
-rw-r--r-- arch/x86/kernel/irqinit.c 3
-rw-r--r-- arch/x86/kernel/kgdb.c 4
-rw-r--r-- arch/x86/kernel/microcode_amd.c 21
-rw-r--r-- arch/x86/kernel/ptrace.c 5
-rw-r--r-- arch/x86/kernel/reboot.c 24
-rw-r--r-- arch/x86/kernel/smpboot.c 2
-rw-r--r-- arch/x86/kernel/stacktrace.c 2
-rw-r--r-- arch/x86/kernel/tboot.c 1
-rw-r--r-- arch/x86/kernel/time.c 2
-rw-r--r-- arch/x86/lguest/boot.c 36
-rw-r--r-- arch/x86/lguest/i386_head.S 35
-rw-r--r-- arch/x86/lib/Makefile 9
-rw-r--r-- arch/x86/lib/rwlock.S 44
-rw-r--r-- arch/x86/lib/rwlock_64.S 38
-rw-r--r-- arch/x86/lib/rwsem.S (renamed from arch/x86/lib/rwsem_64.S) 75
-rw-r--r-- arch/x86/lib/semaphore_32.S 124
-rw-r--r-- arch/x86/lib/thunk_64.S 45
-rw-r--r-- arch/x86/lib/usercopy.c 43
-rw-r--r-- arch/x86/mm/fault.c 6
-rw-r--r-- arch/x86/mm/init_64.c 3
-rw-r--r-- arch/x86/mm/kmemcheck/error.c 2
-rw-r--r-- arch/x86/mm/pageattr-test.c 3
-rw-r--r-- arch/x86/oprofile/backtrace.c 21
-rw-r--r-- arch/x86/pci/xen.c 371
-rw-r--r-- arch/x86/platform/efi/efi.c 90
-rw-r--r-- arch/x86/xen/Makefile 2
-rw-r--r-- arch/x86/xen/enlighten.c 8
-rw-r--r-- arch/x86/xen/platform-pci-unplug.c 2
-rw-r--r-- arch/x86/xen/vga.c 67
-rw-r--r-- arch/x86/xen/xen-ops.h 11
82 files changed, 1748 insertions, 6556 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da349723d41..fc76e420900 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -20,6 +20,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
+ select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
@@ -70,6 +71,7 @@ config X86
select IRQ_FORCED_THREADING
select USE_GENERIC_SMP_HELPERS if SMP
select HAVE_BPF_JIT if (X86_64 && NET)
+ select CLKEVT_I8253
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -680,33 +682,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
Calgary anyway, pass 'iommu=calgary' on the kernel command line.
If unsure, say Y.
-config AMD_IOMMU
- bool "AMD IOMMU support"
- select SWIOTLB
- select PCI_MSI
- select PCI_IOV
- depends on X86_64 && PCI && ACPI
- ---help---
- With this option you can enable support for AMD IOMMU hardware in
- your system. An IOMMU is a hardware component which provides
- remapping of DMA memory accesses from devices. With an AMD IOMMU you
- can isolate the the DMA memory of different devices and protect the
- system from misbehaving device drivers or hardware.
-
- You can find out if your system has an AMD IOMMU if you look into
- your BIOS for an option to enable it or if you have an IVRS ACPI
- table.
-
-config AMD_IOMMU_STATS
- bool "Export AMD IOMMU statistics to debugfs"
- depends on AMD_IOMMU
- select DEBUG_FS
- ---help---
- This option enables code in the AMD IOMMU driver to collect various
- statistics about whats happening in the driver and exports that
- information to userspace via debugfs.
- If unsure, say N.
-
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
def_bool y if X86_64
@@ -720,9 +695,6 @@ config SWIOTLB
config IOMMU_HELPER
def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
-config IOMMU_API
- def_bool (AMD_IOMMU || DMAR)
-
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
@@ -1170,7 +1142,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
config AMD_NUMA
def_bool y
prompt "Old style AMD Opteron NUMA detection"
- depends on NUMA && PCI
+ depends on X86_64 && NUMA && PCI
---help---
Enable AMD NUMA node topology detection. You should say Y here if
you have a multi processor AMD system. This uses an old method to
@@ -1942,55 +1914,6 @@ config PCI_CNB20LE_QUIRK
You should say N unless you know you need this.
-config DMAR
- bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
- depends on PCI_MSI && ACPI && EXPERIMENTAL
- help
- DMA remapping (DMAR) devices support enables independent address
- translations for Direct Memory Access (DMA) from devices.
- These DMA remapping devices are reported via ACPI tables
- and include PCI device scope covered by these DMA
- remapping devices.
-
-config DMAR_DEFAULT_ON
- def_bool y
- prompt "Enable DMA Remapping Devices by default"
- depends on DMAR
- help
- Selecting this option will enable a DMAR device at boot time if
- one is found. If this option is not selected, DMAR support can
- be enabled by passing intel_iommu=on to the kernel. It is
- recommended you say N here while the DMAR code remains
- experimental.
-
-config DMAR_BROKEN_GFX_WA
- bool "Workaround broken graphics drivers (going away soon)"
- depends on DMAR && BROKEN
- ---help---
- Current Graphics drivers tend to use physical address
- for DMA and avoid using DMA APIs. Setting this config
- option permits the IOMMU driver to set a unity map for
- all the OS-visible memory. Hence the driver can continue
- to use physical addresses for DMA, at least until this
- option is removed in the 2.6.32 kernel.
-
-config DMAR_FLOPPY_WA
- def_bool y
- depends on DMAR
- ---help---
- Floppy disk drivers are known to bypass DMA API calls
- thereby failing to work when IOMMU is enabled. This
- workaround will setup a 1:1 mapping for the first
- 16MiB to make floppy (an ISA device) work.
-
-config INTR_REMAP
- bool "Support for Interrupt Remapping (EXPERIMENTAL)"
- depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
- ---help---
- Supports Interrupt remapping for IO-APIC and MSI devices.
- To use x2apic mode in the CPU's which support x2APIC enhancements or
- to support platforms with CPU's having > 8 bit APIC ID, say Y.
-
source "drivers/pci/pcie/Kconfig"
source "drivers/pci/Kconfig"
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index c1870dddd32..a0e866d233e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -143,7 +143,7 @@ ENTRY(ia32_sysenter_target)
CFI_REL_OFFSET rip,0
pushq_cfi %rax
cld
- SAVE_ARGS 0,0,1
+ SAVE_ARGS 0,1,0
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
1: movl (%rbp),%ebp
@@ -173,7 +173,7 @@ sysexit_from_sys_call:
andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
- RESTORE_ARGS 1,24,1,1,1,1
+ RESTORE_ARGS 0,24,0,0,0,0
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
@@ -289,7 +289,7 @@ ENTRY(ia32_cstar_target)
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1,1
+ SAVE_ARGS 8,0,0
movl %eax,%eax /* zero extension */
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -328,7 +328,7 @@ cstar_dispatch:
jnz sysretl_audit
sysretl_from_sys_call:
andl $~TS_COMPAT,TI_status(%r10)
- RESTORE_ARGS 1,-ARG_SKIP,1,1,1
+ RESTORE_ARGS 0,-ARG_SKIP,0,0,0
movl RIP-ARGOFFSET(%rsp),%ecx
CFI_REGISTER rip,rcx
movl EFLAGS-ARGOFFSET(%rsp),%r11d
@@ -419,7 +419,7 @@ ENTRY(ia32_syscall)
cld
/* note the registers are not zero extended to the sf.
this could be a problem. */
- SAVE_ARGS 0,0,1
+ SAVE_ARGS 0,1,0
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
deleted file mode 100644
index a6863a2dec1..00000000000
--- a/arch/x86/include/asm/amd_iommu.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_H
-#define _ASM_X86_AMD_IOMMU_H
-
-#include <linux/irqreturn.h>
-
-#ifdef CONFIG_AMD_IOMMU
-
-extern int amd_iommu_detect(void);
-
-#else
-
-static inline int amd_iommu_detect(void) { return -ENODEV; }
-
-#endif
-
-#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
deleted file mode 100644
index 55d95eb789b..00000000000
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
-#define _ASM_X86_AMD_IOMMU_PROTO_H
-
-#include <asm/amd_iommu_types.h>
-
-extern int amd_iommu_init_dma_ops(void);
-extern int amd_iommu_init_passthrough(void);
-extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_apply_erratum_63(u16 devid);
-extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
-extern int amd_iommu_init_devices(void);
-extern void amd_iommu_uninit_devices(void);
-extern void amd_iommu_init_notifier(void);
-extern void amd_iommu_init_api(void);
-#ifndef CONFIG_AMD_IOMMU_STATS
-
-static inline void amd_iommu_stats_init(void) { }
-
-#endif /* !CONFIG_AMD_IOMMU_STATS */
-
-static inline bool is_rd890_iommu(struct pci_dev *pdev)
-{
- return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
- (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
-}
-
-static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
-{
- if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
- return false;
-
- return !!(iommu->features & f);
-}
-
-#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
deleted file mode 100644
index 4c998299541..00000000000
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
-#define _ASM_X86_AMD_IOMMU_TYPES_H
-
-#include <linux/types.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-/*
- * Maximum number of IOMMUs supported
- */
-#define MAX_IOMMUS 32
-
-/*
- * some size calculation constants
- */
-#define DEV_TABLE_ENTRY_SIZE 32
-#define ALIAS_TABLE_ENTRY_SIZE 2
-#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
-
-/* Length of the MMIO region for the AMD IOMMU */
-#define MMIO_REGION_LENGTH 0x4000
-
-/* Capability offsets used by the driver */
-#define MMIO_CAP_HDR_OFFSET 0x00
-#define MMIO_RANGE_OFFSET 0x0c
-#define MMIO_MISC_OFFSET 0x10
-
-/* Masks, shifts and macros to parse the device range capability */
-#define MMIO_RANGE_LD_MASK 0xff000000
-#define MMIO_RANGE_FD_MASK 0x00ff0000
-#define MMIO_RANGE_BUS_MASK 0x0000ff00
-#define MMIO_RANGE_LD_SHIFT 24
-#define MMIO_RANGE_FD_SHIFT 16
-#define MMIO_RANGE_BUS_SHIFT 8
-#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
-#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
-#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
-#define MMIO_MSI_NUM(x) ((x) & 0x1f)
-
-/* Flag masks for the AMD IOMMU exclusion range */
-#define MMIO_EXCL_ENABLE_MASK 0x01ULL
-#define MMIO_EXCL_ALLOW_MASK 0x02ULL
-
-/* Used offsets into the MMIO space */
-#define MMIO_DEV_TABLE_OFFSET 0x0000
-#define MMIO_CMD_BUF_OFFSET 0x0008
-#define MMIO_EVT_BUF_OFFSET 0x0010
-#define MMIO_CONTROL_OFFSET 0x0018
-#define MMIO_EXCL_BASE_OFFSET 0x0020
-#define MMIO_EXCL_LIMIT_OFFSET 0x0028
-#define MMIO_EXT_FEATURES 0x0030
-#define MMIO_CMD_HEAD_OFFSET 0x2000
-#define MMIO_CMD_TAIL_OFFSET 0x2008
-#define MMIO_EVT_HEAD_OFFSET 0x2010
-#define MMIO_EVT_TAIL_OFFSET 0x2018
-#define MMIO_STATUS_OFFSET 0x2020
-
-
-/* Extended Feature Bits */
-#define FEATURE_PREFETCH (1ULL<<0)
-#define FEATURE_PPR (1ULL<<1)
-#define FEATURE_X2APIC (1ULL<<2)
-#define FEATURE_NX (1ULL<<3)
-#define FEATURE_GT (1ULL<<4)
-#define FEATURE_IA (1ULL<<6)
-#define FEATURE_GA (1ULL<<7)
-#define FEATURE_HE (1ULL<<8)
-#define FEATURE_PC (1ULL<<9)
-
-/* MMIO status bits */
-#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
-
-/* event logging constants */
-#define EVENT_ENTRY_SIZE 0x10
-#define EVENT_TYPE_SHIFT 28
-#define EVENT_TYPE_MASK 0xf
-#define EVENT_TYPE_ILL_DEV 0x1
-#define EVENT_TYPE_IO_FAULT 0x2
-#define EVENT_TYPE_DEV_TAB_ERR 0x3
-#define EVENT_TYPE_PAGE_TAB_ERR 0x4
-#define EVENT_TYPE_ILL_CMD 0x5
-#define EVENT_TYPE_CMD_HARD_ERR 0x6
-#define EVENT_TYPE_IOTLB_INV_TO 0x7
-#define EVENT_TYPE_INV_DEV_REQ 0x8
-#define EVENT_DEVID_MASK 0xffff
-#define EVENT_DEVID_SHIFT 0
-#define EVENT_DOMID_MASK 0xffff
-#define EVENT_DOMID_SHIFT 0
-#define EVENT_FLAGS_MASK 0xfff
-#define EVENT_FLAGS_SHIFT 0x10
-
-/* feature control bits */
-#define CONTROL_IOMMU_EN 0x00ULL
-#define CONTROL_HT_TUN_EN 0x01ULL
-#define CONTROL_EVT_LOG_EN 0x02ULL
-#define CONTROL_EVT_INT_EN 0x03ULL
-#define CONTROL_COMWAIT_EN 0x04ULL
-#define CONTROL_PASSPW_EN 0x08ULL
-#define CONTROL_RESPASSPW_EN 0x09ULL
-#define CONTROL_COHERENT_EN 0x0aULL
-#define CONTROL_ISOC_EN 0x0bULL
-#define CONTROL_CMDBUF_EN 0x0cULL
-#define CONTROL_PPFLOG_EN 0x0dULL
-#define CONTROL_PPFINT_EN 0x0eULL
-
-/* command specific defines */
-#define CMD_COMPL_WAIT 0x01
-#define CMD_INV_DEV_ENTRY 0x02
-#define CMD_INV_IOMMU_PAGES 0x03
-#define CMD_INV_IOTLB_PAGES 0x04
-#define CMD_INV_ALL 0x08
-
-#define CMD_COMPL_WAIT_STORE_MASK 0x01
-#define CMD_COMPL_WAIT_INT_MASK 0x02
-#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
-#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
-
-#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
-
-/* macros and definitions for device table entries */
-#define DEV_ENTRY_VALID 0x00
-#define DEV_ENTRY_TRANSLATION 0x01
-#define DEV_ENTRY_IR 0x3d
-#define DEV_ENTRY_IW 0x3e
-#define DEV_ENTRY_NO_PAGE_FAULT 0x62
-#define DEV_ENTRY_EX 0x67
-#define DEV_ENTRY_SYSMGT1 0x68
-#define DEV_ENTRY_SYSMGT2 0x69
-#define DEV_ENTRY_INIT_PASS 0xb8
-#define DEV_ENTRY_EINT_PASS 0xb9
-#define DEV_ENTRY_NMI_PASS 0xba
-#define DEV_ENTRY_LINT0_PASS 0xbe
-#define DEV_ENTRY_LINT1_PASS 0xbf
-#define DEV_ENTRY_MODE_MASK 0x07
-#define DEV_ENTRY_MODE_SHIFT 0x09
-
-/* constants to configure the command buffer */
-#define CMD_BUFFER_SIZE 8192
-#define CMD_BUFFER_UNINITIALIZED 1
-#define CMD_BUFFER_ENTRIES 512
-#define MMIO_CMD_SIZE_SHIFT 56
-#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
-
-/* constants for event buffer handling */
-#define EVT_BUFFER_SIZE 8192 /* 512 entries */
-#define EVT_LEN_MASK (0x9ULL << 56)
-
-#define PAGE_MODE_NONE 0x00
-#define PAGE_MODE_1_LEVEL 0x01
-#define PAGE_MODE_2_LEVEL 0x02
-#define PAGE_MODE_3_LEVEL 0x03
-#define PAGE_MODE_4_LEVEL 0x04
-#define PAGE_MODE_5_LEVEL 0x05
-#define PAGE_MODE_6_LEVEL 0x06
-
-#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
- ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
- (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k 0
-#define PM_ADDR_MASK 0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
- (~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
- ((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
- (1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
- ((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address an a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize) \
- (((address) | ((pagesize) - 1)) & \
- (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
- (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-#define IOMMU_PTE_P (1ULL << 0)
-#define IOMMU_PTE_TV (1ULL << 1)
-#define IOMMU_PTE_U (1ULL << 59)
-#define IOMMU_PTE_FC (1ULL << 60)
-#define IOMMU_PTE_IR (1ULL << 61)
-#define IOMMU_PTE_IW (1ULL << 62)
-
-#define DTE_FLAG_IOTLB 0x01
-
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
-#define IOMMU_PROT_MASK 0x03
-#define IOMMU_PROT_IR 0x01
-#define IOMMU_PROT_IW 0x02
-
-/* IOMMU capabilities */
-#define IOMMU_CAP_IOTLB 24
-#define IOMMU_CAP_NPCACHE 26
-#define IOMMU_CAP_EFR 27
-
-#define MAX_DOMAIN_ID 65536
-
-/* FIXME: move this macro to <linux/pci.h> */
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
-
-/* Protection domain flags */
-#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
-#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
- domain for an IOMMU */
-#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
- translation */
-
-extern bool amd_iommu_dump;
-#define DUMP_printk(format, arg...) \
- do { \
- if (amd_iommu_dump) \
- printk(KERN_INFO "AMD-Vi: " format, ## arg); \
- } while(0);
-
-/* global flag if IOMMUs cache non-present entries */
-extern bool amd_iommu_np_cache;
-/* Only true if all IOMMUs support device IOTLBs */
-extern bool amd_iommu_iotlb_sup;
-
-/*
- * Make iterating over all IOMMUs easier
- */
-#define for_each_iommu(iommu) \
- list_for_each_entry((iommu), &amd_iommu_list, list)
-#define for_each_iommu_safe(iommu, next) \
- list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
-
-#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
-#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
-#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
-#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
-#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
-#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
-
-/*
- * This structure contains generic data for IOMMU protection domains
- * independent of their use.
- */
-struct protection_domain {
- struct list_head list; /* for list of all protection domains */
- struct list_head dev_list; /* List of all devices in this domain */
- spinlock_t lock; /* mostly used to lock the page table*/
- struct mutex api_lock; /* protect page tables in the iommu-api path */
- u16 id; /* the domain id written to the device table */
- int mode; /* paging mode (0-6 levels) */
- u64 *pt_root; /* page table root pointer */
- unsigned long flags; /* flags to find out type of domain */
- bool updated; /* complete domain flush required */
- unsigned dev_cnt; /* devices assigned to this domain */
- unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
- void *priv; /* private data */
-
-};
-
-/*
- * This struct contains device specific data for the IOMMU
- */
-struct iommu_dev_data {
- struct list_head list; /* For domain->dev_list */
- struct device *dev; /* Device this data belong to */
- struct device *alias; /* The Alias Device */
- struct protection_domain *domain; /* Domain the device is bound to */
- atomic_t bind; /* Domain attach reverent count */
-};
-
-/*
- * For dynamic growth the aperture size is split into ranges of 128MB of
- * DMA address space each. This struct represents one such range.
- */
-struct aperture_range {
-
- /* address allocation bitmap */
- unsigned long *bitmap;
-
- /*
- * Array of PTE pages for the aperture. In this array we save all the
- * leaf pages of the domain page table used for the aperture. This way
- * we don't need to walk the page table to find a specific PTE. We can
- * just calculate its address in constant time.
- */
- u64 *pte_pages[64];
-
- unsigned long offset;
-};
-
-/*
- * Data container for a dma_ops specific protection domain
- */
-struct dma_ops_domain {
- struct list_head list;
-
- /* generic protection domain information */
- struct protection_domain domain;
-
- /* size of the aperture for the mappings */
- unsigned long aperture_size;
-
- /* address we start to search for free addresses */
- unsigned long next_address;
-
- /* address space relevant data */
- struct aperture_range *aperture[APERTURE_MAX_RANGES];
-
- /* This will be set to true when TLB needs to be flushed */
- bool need_flush;
-
- /*
- * if this is a preallocated domain, keep the device for which it was
- * preallocated in this variable
- */
- u16 target_dev;
-};
-
-/*
- * Structure where we save information about one hardware AMD IOMMU in the
- * system.
- */
-struct amd_iommu {
- struct list_head list;
-
- /* Index within the IOMMU array */
- int index;
-
- /* locks the accesses to the hardware */
- spinlock_t lock;
-
- /* Pointer to PCI device of this IOMMU */
- struct pci_dev *dev;
-
- /* physical address of MMIO space */
- u64 mmio_phys;
- /* virtual address of MMIO space */
- u8 *mmio_base;
-
- /* capabilities of that IOMMU read from ACPI */
- u32 cap;
-
- /* flags read from acpi table */
- u8 acpi_flags;
-
- /* Extended features */
- u64 features;
-
- /*
- * Capability pointer. There could be more than one IOMMU per PCI
- * device function if there are more than one AMD IOMMU capability
- * pointers.
- */
- u16 cap_ptr;
-
- /* pci domain of this IOMMU */
- u16 pci_seg;
-
- /* first device this IOMMU handles. read from PCI */
- u16 first_device;
- /* last device this IOMMU handles. read from PCI */
- u16 last_device;
-
- /* start of exclusion range of that IOMMU */
- u64 exclusion_start;
- /* length of exclusion range of that IOMMU */
- u64 exclusion_length;
-
- /* command buffer virtual address */
- u8 *cmd_buf;
- /* size of command buffer */
- u32 cmd_buf_size;
-
- /* size of event buffer */
- u32 evt_buf_size;
- /* event buffer virtual address */
- u8 *evt_buf;
- /* MSI number for event interrupt */
- u16 evt_msi_num;
-
- /* true if interrupts for this IOMMU are already enabled */
- bool int_enabled;
-
- /* if one, we need to send a completion wait command */
- bool need_sync;
-
- /* default dma_ops domain for that IOMMU */
- struct dma_ops_domain *default_dom;
-
- /*
- * We can't rely on the BIOS to restore all values on reinit, so we
- * need to stash them
- */
-
- /* The iommu BAR */
- u32 stored_addr_lo;
- u32 stored_addr_hi;
-
- /*
- * Each iommu has 6 l1s, each of which is documented as having 0x12
- * registers
- */
- u32 stored_l1[6][0x12];
-
- /* The l2 indirect registers */
- u32 stored_l2[0x83];
-};
-
-/*
- * List with all IOMMUs in the system. This list is not locked because it is
- * only written and read at driver initialization or suspend time
- */
-extern struct list_head amd_iommu_list;
-
-/*
- * Array with pointers to each IOMMU struct
- * The indices are referenced in the protection domains
- */
-extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
-
-/* Number of IOMMUs present in the system */
-extern int amd_iommus_present;
-
-/*
- * Declarations for the global list of all protection domains
- */
-extern spinlock_t amd_iommu_pd_lock;
-extern struct list_head amd_iommu_pd_list;
-
-/*
- * Structure defining one entry in the device table
- */
-struct dev_table_entry {
- u32 data[8];
-};
-
-/*
- * One entry for unity mappings parsed out of the ACPI table.
- */
-struct unity_map_entry {
- struct list_head list;
-
- /* starting device id this entry is used for (including) */
- u16 devid_start;
- /* end device id this entry is used for (including) */
- u16 devid_end;
-
- /* start address to unity map (including) */
- u64 address_start;
- /* end address to unity map (including) */
- u64 address_end;
-
- /* required protection */
- int prot;
-};
-
-/*
- * List of all unity mappings. It is not locked because as runtime it is only
- * read. It is created at ACPI table parsing time.
- */
-extern struct list_head amd_iommu_unity_map;
-
-/*
- * Data structures for device handling
- */
-
-/*
- * Device table used by hardware. Read and write accesses by software are
- * locked with the amd_iommu_pd_table lock.
- */
-extern struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * Alias table to find requestor ids to device ids. Not locked because only
- * read on runtime.
- */
-extern u16 *amd_iommu_alias_table;
-
-/*
- * Reverse lookup table to find the IOMMU which translates a specific device.
- */
-extern struct amd_iommu **amd_iommu_rlookup_table;
-
-/* size of the dma_ops aperture as power of 2 */
-extern unsigned amd_iommu_aperture_order;
-
-/* largest PCI device id we expect translation requests for */
-extern u16 amd_iommu_last_bdf;
-
-/* allocation bitmap for domain ids */
-extern unsigned long *amd_iommu_pd_alloc_bitmap;
-
-/*
- * If true, the addresses will be flushed on unmap time, not when
- * they are reused
- */
-extern bool amd_iommu_unmap_flush;
-
-/* takes bus and device/function and returns the device id
- * FIXME: should that be in generic PCI code? */
-static inline u16 calc_devid(u8 bus, u8 devfn)
-{
- return (((u16)bus) << 8) | devfn;
-}
-
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-struct __iommu_counter {
- char *name;
- struct dentry *dent;
- u64 value;
-};
-
-#define DECLARE_STATS_COUNTER(nm) \
- static struct __iommu_counter nm = { \
- .name = #nm, \
- }
-
-#define INC_STATS_COUNTER(name) name.value += 1
-#define ADD_STATS_COUNTER(name, x) name.value += (x)
-#define SUB_STATS_COUNTER(name, x) name.value -= (x)
-
-#else /* CONFIG_AMD_IOMMU_STATS */
-
-#define DECLARE_STATS_COUNTER(name)
-#define INC_STATS_COUNTER(name)
-#define ADD_STATS_COUNTER(name, x)
-#define SUB_STATS_COUNTER(name, x)
-
-#endif /* CONFIG_AMD_IOMMU_STATS */
-
-#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index af60d8a2e28..082cf818493 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -50,7 +50,6 @@
#define APBT_DEV_USED 1
extern void apbt_time_init(void);
-extern struct clock_event_device *global_clock_event;
extern unsigned long apbt_quick_calibrate(void);
extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
extern void apbt_setup_secondary_clock(void);
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b3ed1e1460f..9412d6558c8 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,9 +3,11 @@
#ifdef __ASSEMBLY__
# define __ASM_FORM(x) x
+# define __ASM_FORM_COMMA(x) x,
# define __ASM_EX_SEC .section __ex_table, "a"
#else
# define __ASM_FORM(x) " " #x " "
+# define __ASM_FORM_COMMA(x) " " #x ","
# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
#endif
@@ -15,7 +17,8 @@
# define __ASM_SEL(a,b) __ASM_FORM(b)
#endif
-#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
+#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
+ inst##q##__VA_ARGS__)
#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
#define _ASM_PTR __ASM_SEL(.long, .quad)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 30af5a83216..a9e3a740f69 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,6 +46,7 @@ For 32-bit we have the following conventions - kernel is built with
*/
+#include "dwarf2.h"
/*
* 64-bit system call stack frame layout defines and helpers, for
@@ -84,72 +85,57 @@ For 32-bit we have the following conventions - kernel is built with
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
- .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
+ .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
subq $9*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 9*8+\addskip
- movq %rdi, 8*8(%rsp)
- CFI_REL_OFFSET rdi, 8*8
- movq %rsi, 7*8(%rsp)
- CFI_REL_OFFSET rsi, 7*8
- movq %rdx, 6*8(%rsp)
- CFI_REL_OFFSET rdx, 6*8
- .if \norcx
- .else
- movq %rcx, 5*8(%rsp)
- CFI_REL_OFFSET rcx, 5*8
+ movq_cfi rdi, 8*8
+ movq_cfi rsi, 7*8
+ movq_cfi rdx, 6*8
+
+ .if \save_rcx
+ movq_cfi rcx, 5*8
.endif
- movq %rax, 4*8(%rsp)
- CFI_REL_OFFSET rax, 4*8
- .if \nor891011
- .else
- movq %r8, 3*8(%rsp)
- CFI_REL_OFFSET r8, 3*8
- movq %r9, 2*8(%rsp)
- CFI_REL_OFFSET r9, 2*8
- movq %r10, 1*8(%rsp)
- CFI_REL_OFFSET r10, 1*8
- movq %r11, (%rsp)
- CFI_REL_OFFSET r11, 0*8
+
+ movq_cfi rax, 4*8
+
+ .if \save_r891011
+ movq_cfi r8, 3*8
+ movq_cfi r9, 2*8
+ movq_cfi r10, 1*8
+ movq_cfi r11, 0*8
.endif
+
.endm
#define ARG_SKIP (9*8)
- .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
- skipr8910=0, skiprdx=0
- .if \skipr11
- .else
- movq (%rsp), %r11
- CFI_RESTORE r11
+ .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
+ rstor_r8910=1, rstor_rdx=1
+ .if \rstor_r11
+ movq_cfi_restore 0*8, r11
.endif
- .if \skipr8910
- .else
- movq 1*8(%rsp), %r10
- CFI_RESTORE r10
- movq 2*8(%rsp), %r9
- CFI_RESTORE r9
- movq 3*8(%rsp), %r8
- CFI_RESTORE r8
+
+ .if \rstor_r8910
+ movq_cfi_restore 1*8, r10
+ movq_cfi_restore 2*8, r9
+ movq_cfi_restore 3*8, r8
.endif
- .if \skiprax
- .else
- movq 4*8(%rsp), %rax
- CFI_RESTORE rax
+
+ .if \rstor_rax
+ movq_cfi_restore 4*8, rax
.endif
- .if \skiprcx
- .else
- movq 5*8(%rsp), %rcx
- CFI_RESTORE rcx
+
+ .if \rstor_rcx
+ movq_cfi_restore 5*8, rcx
.endif
- .if \skiprdx
- .else
- movq 6*8(%rsp), %rdx
- CFI_RESTORE rdx
+
+ .if \rstor_rdx
+ movq_cfi_restore 6*8, rdx
.endif
- movq 7*8(%rsp), %rsi
- CFI_RESTORE rsi
- movq 8*8(%rsp), %rdi
- CFI_RESTORE rdi
+
+ movq_cfi_restore 7*8, rsi
+ movq_cfi_restore 8*8, rdi
+
.if ARG_SKIP+\addskip > 0
addq $ARG_SKIP+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
@@ -176,33 +162,21 @@ For 32-bit we have the following conventions - kernel is built with
.macro SAVE_REST
subq $REST_SKIP, %rsp
CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq %rbx, 5*8(%rsp)
- CFI_REL_OFFSET rbx, 5*8
- movq %rbp, 4*8(%rsp)
- CFI_REL_OFFSET rbp, 4*8
- movq %r12, 3*8(%rsp)
- CFI_REL_OFFSET r12, 3*8
- movq %r13, 2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
- movq %r14, 1*8(%rsp)
- CFI_REL_OFFSET r14, 1*8
- movq %r15, (%rsp)
- CFI_REL_OFFSET r15, 0*8
+ movq_cfi rbx, 5*8
+ movq_cfi rbp, 4*8
+ movq_cfi r12, 3*8
+ movq_cfi r13, 2*8
+ movq_cfi r14, 1*8
+ movq_cfi r15, 0*8
.endm
.macro RESTORE_REST
- movq (%rsp), %r15
- CFI_RESTORE r15
- movq 1*8(%rsp), %r14
- CFI_RESTORE r14
- movq 2*8(%rsp), %r13
- CFI_RESTORE r13
- movq 3*8(%rsp), %r12
- CFI_RESTORE r12
- movq 4*8(%rsp), %rbp
- CFI_RESTORE rbp
- movq 5*8(%rsp), %rbx
- CFI_RESTORE rbx
+ movq_cfi_restore 0*8, r15
+ movq_cfi_restore 1*8, r14
+ movq_cfi_restore 2*8, r13
+ movq_cfi_restore 3*8, r12
+ movq_cfi_restore 4*8, rbp
+ movq_cfi_restore 5*8, rbx
addq $REST_SKIP, %rsp
CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
.endm
@@ -214,7 +188,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro RESTORE_ALL addskip=0
RESTORE_REST
- RESTORE_ARGS 0, \addskip
+ RESTORE_ARGS 1, \addskip
.endm
.macro icebp
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 1cd6d26a0a8..0baa628e330 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -53,8 +53,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif
-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
#endif
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 2c6fc9e6281..3b629f47eb6 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,5 +1,6 @@
#ifdef __ASSEMBLY__
+#include <asm/asm.h>
#include <asm/dwarf2.h>
/* The annotation hides the frame from the unwinder and makes it look
@@ -7,13 +8,13 @@
frame pointer later */
#ifdef CONFIG_FRAME_POINTER
.macro FRAME
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp,0
- movl %esp,%ebp
+ __ASM_SIZE(push,_cfi) %__ASM_REG(bp)
+ CFI_REL_OFFSET __ASM_REG(bp), 0
+ __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp)
.endm
.macro ENDFRAME
- popl_cfi %ebp
- CFI_RESTORE ebp
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(bp)
+ CFI_RESTORE __ASM_REG(bp)
.endm
#else
.macro FRAME
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index bb9efe8706e..13f5504c76c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -34,7 +34,6 @@ extern void irq_work_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
-extern void mce_self_interrupt(void);
extern void invalidate_interrupt(void);
extern void invalidate_interrupt0(void);
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
deleted file mode 100644
index 65aaa91d585..00000000000
--- a/arch/x86/include/asm/i8253.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _ASM_X86_I8253_H
-#define _ASM_X86_I8253_H
-
-/* i8253A PIT registers */
-#define PIT_MODE 0x43
-#define PIT_CH0 0x40
-#define PIT_CH2 0x42
-
-#define PIT_LATCH LATCH
-
-extern raw_spinlock_t i8253_lock;
-
-extern struct clock_event_device *global_clock_event;
-
-extern void setup_pit_timer(void);
-
-#define inb_pit inb_p
-#define outb_pit outb_p
-
-#endif /* _ASM_X86_I8253_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e976ee3b3e..6665026ea3e 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -109,11 +109,6 @@
#define UV_BAU_MESSAGE 0xf5
-/*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR 0xf4
-
/* Xen vector callback to receive events in a HVM domain */
#define XEN_HVM_EVTCHN_CALLBACK 0xf3
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 5745ce8bf10..bba3cf88e62 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -60,23 +60,24 @@ static inline void native_halt(void)
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
+#include <linux/types.h>
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}
-static inline void arch_local_irq_restore(unsigned long flags)
+static inline notrace void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
{
native_irq_disable();
}
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
{
native_irq_enable();
}
@@ -102,7 +103,7 @@ static inline void halt(void)
/*
* For spinlocks, etc:
*/
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long flags = arch_local_save_flags();
arch_local_irq_disable();
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b60f2924c41..879fd7d3387 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -61,6 +61,7 @@ hcall(unsigned long call,
: "memory");
return call;
}
+/*:*/
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 021979a6e23..716b48af786 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -8,6 +8,7 @@
* Machine Check support for x86
*/
+/* MCG_CAP register defines */
#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
@@ -17,10 +18,12 @@
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
+/* MCG_STATUS register defines */
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
+/* MCi_STATUS register defines */
#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
@@ -31,12 +34,14 @@
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
-/* MISC register defines */
-#define MCM_ADDR_SEGOFF 0 /* segment offset */
-#define MCM_ADDR_LINEAR 1 /* linear address */
-#define MCM_ADDR_PHYS 2 /* physical address */
-#define MCM_ADDR_MEM 3 /* memory address */
-#define MCM_ADDR_GENERIC 7 /* generic */
+/* MCi_MISC register defines */
+#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f)
+#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7)
+#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */
+#define MCI_MISC_ADDR_LINEAR 1 /* linear address */
+#define MCI_MISC_ADDR_PHYS 2 /* physical address */
+#define MCI_MISC_ADDR_MEM 3 /* memory address */
+#define MCI_MISC_ADDR_GENERIC 7 /* generic */
/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN (1ULL << 30)
@@ -144,7 +149,7 @@ static inline void enable_p5_mce(void) {}
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_dev);
+DECLARE_PER_CPU(struct sys_device, mce_sysdev);
/*
* Maximum banks number.
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 485b4f1f079..d96bdb25ca3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -259,6 +259,9 @@
#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_POWERSAVE 15
#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a0a9779084d..3470c9d0ebb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -388,12 +388,9 @@ do { \
#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
-/*
- * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
- * faster than an xchg with forced lock semantics.
- */
-#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val)
#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -485,6 +482,8 @@ do { \
#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index d9d4dae305f..094fb30817a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
(regs)->bp = caller_frame_pointer(); \
(regs)->cs = __KERNEL_CS; \
regs->flags = 0; \
+ asm volatile( \
+ _ASM_MOV "%%"_ASM_SP ", %0\n" \
+ : "=m" ((regs)->sp) \
+ :: "memory" \
+ ); \
}
#else
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 56fd9e3abbd..4f7e67e2345 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -102,6 +102,14 @@
#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
/*
+ * If an event has alias it should be marked
+ * with a special bit. (Don't forget to check
+ * P4_PEBS_CONFIG_MASK and related bits on
+ * modification.)
+ */
+#define P4_CONFIG_ALIASABLE (1 << 9)
+
+/*
* The bits we allow to pass for RAW events
*/
#define P4_CONFIG_MASK_ESCR \
@@ -123,6 +131,31 @@
(p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
(p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+/*
+ * In case of event aliasing we need to preserve some
+ * caller bits, otherwise the mapping won't be complete.
+ */
+#define P4_CONFIG_EVENT_ALIAS_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \
+ p4_config_pack_cccr(P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE))
+
+#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \
+ ((P4_CONFIG_HT) | \
+ p4_config_pack_escr(P4_ESCR_T0_OS | \
+ P4_ESCR_T0_USR | \
+ P4_ESCR_T1_OS | \
+ P4_ESCR_T1_USR) | \
+ p4_config_pack_cccr(P4_CCCR_OVF | \
+ P4_CCCR_CASCADE | \
+ P4_CCCR_FORCE_OVF | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_OVF_PMI_T0 | \
+ P4_CCCR_OVF_PMI_T1 | \
+ P4_CONFIG_ALIASABLE))
+
static inline bool p4_is_event_cascaded(u64 config)
{
u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 971e0b46446..df1287019e6 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -30,17 +30,6 @@ extern void add_dtb(u64 data);
extern void x86_add_irq_domains(void);
void __cpuinit x86_of_pci_init(void);
void x86_dtb_init(void);
-
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
-{
- return pdev ? pdev->dev.of_node : NULL;
-}
-
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
- return pci_device_to_OF_node(bus->self);
-}
-
#else
static inline void add_dtb(u64 data) { }
static inline void x86_add_irq_domains(void) { }
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
index 6a8c0d64510..a5370a03d90 100644
--- a/arch/x86/include/asm/rwlock.h
+++ b/arch/x86/include/asm/rwlock.h
@@ -1,7 +1,48 @@
#ifndef _ASM_X86_RWLOCK_H
#define _ASM_X86_RWLOCK_H
-#define RW_LOCK_BIAS 0x01000000
+#include <asm/asm.h>
+
+#if CONFIG_NR_CPUS <= 2048
+
+#ifndef __ASSEMBLY__
+typedef union {
+ s32 lock;
+ s32 write;
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS 0x00100000
+#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##l)
+#define READ_LOCK_ATOMIC(n) atomic_##n
+#define WRITE_LOCK_ADD(n) __ASM_FORM_COMMA(addl n)
+#define WRITE_LOCK_SUB(n) __ASM_FORM_COMMA(subl n)
+#define WRITE_LOCK_CMP RW_LOCK_BIAS
+
+#else /* CONFIG_NR_CPUS > 2048 */
+
+#include <linux/const.h>
+
+#ifndef __ASSEMBLY__
+typedef union {
+ s64 lock;
+ struct {
+ u32 read;
+ s32 write;
+ };
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS (_AC(1,L) << 32)
+#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##q)
+#define READ_LOCK_ATOMIC(n) atomic64_##n
+#define WRITE_LOCK_ADD(n) __ASM_FORM(incl)
+#define WRITE_LOCK_SUB(n) __ASM_FORM(decl)
+#define WRITE_LOCK_CMP 1
+
+#endif /* CONFIG_NR_CPUS */
+
+#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index cd84f7208f7..5e641715c3f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -162,7 +162,7 @@
#define GDT_ENTRY_DEFAULT_USER32_CS 4
#define GDT_ENTRY_DEFAULT_USER_DS 5
#define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
#define __USER32_DS __USER_DS
#define GDT_ENTRY_TSS 8 /* needs two entries */
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 725b7783199..49adfd7bb4a 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -10,7 +10,11 @@ static inline void smpboot_clear_io_apic_irqs(void)
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0xa, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
local_flush_tlb();
pr_debug("1.\n");
*((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
@@ -23,6 +27,8 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
static inline void smpboot_restore_warm_reset_vector(void)
{
+ unsigned long flags;
+
/*
* Install writable page 0 entry to set BIOS data area.
*/
@@ -32,7 +38,9 @@ static inline void smpboot_restore_warm_reset_vector(void)
* Paranoid: Set warm reset code and vector here back
* to default values.
*/
+ spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
*((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
}
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3089f70c0c5..e9e51f710e6 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -2,7 +2,6 @@
#define _ASM_X86_SPINLOCK_H
#include <asm/atomic.h>
-#include <asm/rwlock.h>
#include <asm/page.h>
#include <asm/processor.h>
#include <linux/compiler.h>
@@ -234,7 +233,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
*/
static inline int arch_read_can_lock(arch_rwlock_t *lock)
{
- return (int)(lock)->lock > 0;
+ return lock->lock > 0;
}
/**
@@ -243,12 +242,12 @@ static inline int arch_read_can_lock(arch_rwlock_t *lock)
*/
static inline int arch_write_can_lock(arch_rwlock_t *lock)
{
- return (lock)->lock == RW_LOCK_BIAS;
+ return lock->write == WRITE_LOCK_CMP;
}
static inline void arch_read_lock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
"jns 1f\n"
"call __read_lock_failed\n\t"
"1:\n"
@@ -257,47 +256,55 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
static inline void arch_write_lock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+ asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
"jz 1f\n"
"call __write_lock_failed\n\t"
"1:\n"
- ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+ ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
+ : "memory");
}
static inline int arch_read_trylock(arch_rwlock_t *lock)
{
- atomic_t *count = (atomic_t *)lock;
+ READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
- if (atomic_dec_return(count) >= 0)
+ if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
return 1;
- atomic_inc(count);
+ READ_LOCK_ATOMIC(inc)(count);
return 0;
}
static inline int arch_write_trylock(arch_rwlock_t *lock)
{
- atomic_t *count = (atomic_t *)lock;
+ atomic_t *count = (atomic_t *)&lock->write;
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+ if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
return 1;
- atomic_add(RW_LOCK_BIAS, count);
+ atomic_add(WRITE_LOCK_CMP, count);
return 0;
}
static inline void arch_read_unlock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
+ :"+m" (rw->lock) : : "memory");
}
static inline void arch_write_unlock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX "addl %1, %0"
- : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+ asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
+ : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
}
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
+#undef READ_LOCK_SIZE
+#undef READ_LOCK_ATOMIC
+#undef WRITE_LOCK_ADD
+#undef WRITE_LOCK_SUB
+#undef WRITE_LOCK_CMP
+
#define arch_spin_relax(lock) cpu_relax()
#define arch_read_relax(lock) cpu_relax()
#define arch_write_relax(lock) cpu_relax()
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index dcb48b2edc1..7c7a486fcb6 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -11,10 +11,6 @@ typedef struct arch_spinlock {
#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
-typedef struct {
- unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
+#include <asm/rwlock.h>
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 7bdec4e9b73..92b8aec0697 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -1,10 +1,12 @@
#ifndef _ASM_X86_TIME_H
#define _ASM_X86_TIME_H
-extern void hpet_time_init(void);
-
+#include <linux/clocksource.h>
#include <asm/mc146818rtc.h>
+extern void hpet_time_init(void);
extern void time_init(void);
+extern struct clock_event_device *global_clock_event;
+
#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 99ddd148a76..36361bf6fdd 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -555,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; };
#endif /* CONFIG_X86_WP_WORKS_OK */
+extern unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 4fbda9a3f33..968d57dd54c 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -14,13 +14,14 @@ static inline int pci_xen_hvm_init(void)
}
#endif
#if defined(CONFIG_XEN_DOM0)
-void __init xen_setup_pirqs(void);
+int __init pci_xen_initial_domain(void);
int xen_find_device_domain_owner(struct pci_dev *dev);
int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
int xen_unregister_device_domain_owner(struct pci_dev *dev);
#else
-static inline void __init xen_setup_pirqs(void)
+static inline int __init pci_xen_initial_domain(void)
{
+ return -1;
}
static inline int xen_find_device_domain_owner(struct pci_dev *dev)
{
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90b06d4daee..11817ff8539 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -123,7 +123,6 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
- obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 7c3a95e54ec..00000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2764 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/pci-ats.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu.h>
-#include <linux/delay.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/dma.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-
-#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-
-#define LOOP_TIMEOUT 100000
-
-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
-
-/* A list of preallocated protection domains */
-static LIST_HEAD(iommu_pd_list);
-static DEFINE_SPINLOCK(iommu_pd_list_lock);
-
-/*
- * Domain for untranslated devices - only allocated
- * if iommu=pt passed on kernel cmd line.
- */
-static struct protection_domain *pt_domain;
-
-static struct iommu_ops amd_iommu_ops;
-
-/*
- * general struct to manage commands send to an IOMMU
- */
-struct iommu_cmd {
- u32 data[4];
-};
-
-static void update_domain(struct protection_domain *domain);
-
-/****************************************************************************
- *
- * Helper functions
- *
- ****************************************************************************/
-
-static inline u16 get_device_id(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
-
- return calc_devid(pdev->bus->number, pdev->devfn);
-}
-
-static struct iommu_dev_data *get_dev_data(struct device *dev)
-{
- return dev->archdata.iommu;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
- struct dma_ops_domain *entry, *ret = NULL;
- unsigned long flags;
- u16 alias = amd_iommu_alias_table[devid];
-
- if (list_empty(&iommu_pd_list))
- return NULL;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
- list_for_each_entry(entry, &iommu_pd_list, list) {
- if (entry->target_dev == devid ||
- entry->target_dev == alias) {
- ret = entry;
- break;
- }
- }
-
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- return ret;
-}
-
-/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
- u16 devid;
-
- if (!dev || !dev->dma_mask)
- return false;
-
- /* No device or no PCI device */
- if (dev->bus != &pci_bus_type)
- return false;
-
- devid = get_device_id(dev);
-
- /* Out of our scope? */
- if (devid > amd_iommu_last_bdf)
- return false;
-
- if (amd_iommu_rlookup_table[devid] == NULL)
- return false;
-
- return true;
-}
-
-static int iommu_init_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
- struct pci_dev *pdev;
- u16 devid, alias;
-
- if (dev->archdata.iommu)
- return 0;
-
- dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
- if (!dev_data)
- return -ENOMEM;
-
- dev_data->dev = dev;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
- pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
- if (pdev)
- dev_data->alias = &pdev->dev;
- else {
- kfree(dev_data);
- return -ENOTSUPP;
- }
-
- atomic_set(&dev_data->bind, 0);
-
- dev->archdata.iommu = dev_data;
-
-
- return 0;
-}
-
-static void iommu_ignore_device(struct device *dev)
-{
- u16 devid, alias;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
-
- memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
- memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
-
- amd_iommu_rlookup_table[devid] = NULL;
- amd_iommu_rlookup_table[alias] = NULL;
-}
-
-static void iommu_uninit_device(struct device *dev)
-{
- kfree(dev->archdata.iommu);
-}
-
-void __init amd_iommu_uninit_devices(void)
-{
- struct pci_dev *pdev = NULL;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- iommu_uninit_device(&pdev->dev);
- }
-}
-
-int __init amd_iommu_init_devices(void)
-{
- struct pci_dev *pdev = NULL;
- int ret = 0;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- ret = iommu_init_device(&pdev->dev);
- if (ret == -ENOTSUPP)
- iommu_ignore_device(&pdev->dev);
- else if (ret)
- goto out_free;
- }
-
- return 0;
-
-out_free:
-
- amd_iommu_uninit_devices();
-
- return ret;
-}
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-/*
- * Initialization code for statistics collection
- */
-
-DECLARE_STATS_COUNTER(compl_wait);
-DECLARE_STATS_COUNTER(cnt_map_single);
-DECLARE_STATS_COUNTER(cnt_unmap_single);
-DECLARE_STATS_COUNTER(cnt_map_sg);
-DECLARE_STATS_COUNTER(cnt_unmap_sg);
-DECLARE_STATS_COUNTER(cnt_alloc_coherent);
-DECLARE_STATS_COUNTER(cnt_free_coherent);
-DECLARE_STATS_COUNTER(cross_page);
-DECLARE_STATS_COUNTER(domain_flush_single);
-DECLARE_STATS_COUNTER(domain_flush_all);
-DECLARE_STATS_COUNTER(alloced_io_mem);
-DECLARE_STATS_COUNTER(total_map_requests);
-
-static struct dentry *stats_dir;
-static struct dentry *de_fflush;
-
-static void amd_iommu_stats_add(struct __iommu_counter *cnt)
-{
- if (stats_dir == NULL)
- return;
-
- cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
- &cnt->value);
-}
-
-static void amd_iommu_stats_init(void)
-{
- stats_dir = debugfs_create_dir("amd-iommu", NULL);
- if (stats_dir == NULL)
- return;
-
- de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
- (u32 *)&amd_iommu_unmap_flush);
-
- amd_iommu_stats_add(&compl_wait);
- amd_iommu_stats_add(&cnt_map_single);
- amd_iommu_stats_add(&cnt_unmap_single);
- amd_iommu_stats_add(&cnt_map_sg);
- amd_iommu_stats_add(&cnt_unmap_sg);
- amd_iommu_stats_add(&cnt_alloc_coherent);
- amd_iommu_stats_add(&cnt_free_coherent);
- amd_iommu_stats_add(&cross_page);
- amd_iommu_stats_add(&domain_flush_single);
- amd_iommu_stats_add(&domain_flush_all);
- amd_iommu_stats_add(&alloced_io_mem);
- amd_iommu_stats_add(&total_map_requests);
-}
-
-#endif
-
-/****************************************************************************
- *
- * Interrupt handling functions
- *
- ****************************************************************************/
-
-static void dump_dte_entry(u16 devid)
-{
- int i;
-
- for (i = 0; i < 8; ++i)
- pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
- amd_iommu_dev_table[devid].data[i]);
-}
-
-static void dump_command(unsigned long phys_addr)
-{
- struct iommu_cmd *cmd = phys_to_virt(phys_addr);
- int i;
-
- for (i = 0; i < 4; ++i)
- pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
-}
-
-/*
- * Decode one event log entry and print it.
- *
- * @iommu: IOMMU the entry belongs to (not needed for decoding)
- * @__evt: raw entry, read as an array of u32 words
- *
- * The message is opened by one KERN_ERR printk() and completed by the
- * type specific printk() in the switch. Those continuation calls carry no
- * log level of their own so that the whole event ends up on one line.
- */
-static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
-{
-	u32 *event = __evt;
-	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
-	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
-	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
-	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
-	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
-
-	printk(KERN_ERR "AMD-Vi: Event logged [");
-
-	switch (type) {
-	case EVENT_TYPE_ILL_DEV:
-		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		/* show the offending device table entry as well */
-		dump_dte_entry(devid);
-		break;
-	case EVENT_TYPE_IO_FAULT:
-		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
-		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       domid, address, flags);
-		break;
-	case EVENT_TYPE_DEV_TAB_ERR:
-		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		break;
-	case EVENT_TYPE_PAGE_TAB_ERR:
-		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
-		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       domid, address, flags);
-		break;
-	case EVENT_TYPE_ILL_CMD:
-		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
-		/* for this event the address is passed on as the command location */
-		dump_command(address);
-		break;
-	case EVENT_TYPE_CMD_HARD_ERR:
-		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
-		       "flags=0x%04x]\n", address, flags);
-		break;
-	case EVENT_TYPE_IOTLB_INV_TO:
-		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
-		       "address=0x%016llx]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address);
-		break;
-	case EVENT_TYPE_INV_DEV_REQ:
-		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		break;
-	default:
-		/*
-		 * Continuation of the line opened above, like every other
-		 * case: no log level prefix here.
-		 */
-		printk("UNKNOWN type=0x%02x]\n", type);
-	}
-}
-
-/*
- * Print all entries between the head and the tail pointer of the event
- * log of @iommu and write the new head pointer back to the hardware.
- * The whole walk is done under iommu->lock with interrupts disabled.
- */
-static void iommu_poll_events(struct amd_iommu *iommu)
-{
-	unsigned long irq_flags;
-	u32 rd, wr;
-
-	spin_lock_irqsave(&iommu->lock, irq_flags);
-
-	rd = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-	wr = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
-	for (; rd != wr; rd = (rd + EVENT_ENTRY_SIZE) % iommu->evt_buf_size)
-		iommu_print_event(iommu, iommu->evt_buf + rd);
-
-	writel(rd, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-
-	spin_unlock_irqrestore(&iommu->lock, irq_flags);
-}
-
-/* Threaded part of the interrupt: poll the event log of every IOMMU. */
-irqreturn_t amd_iommu_int_thread(int irq, void *data)
-{
-	struct amd_iommu *cur;
-
-	for_each_iommu(cur) {
-		iommu_poll_events(cur);
-	}
-
-	return IRQ_HANDLED;
-}
-
-/*
- * Primary interrupt handler. It does no work itself and only returns
- * IRQ_WAKE_THREAD.
- */
-irqreturn_t amd_iommu_int_handler(int irq, void *data)
-{
-	return IRQ_WAKE_THREAD;
-}
-
-/****************************************************************************
- *
- * IOMMU command queuing functions
- *
- ****************************************************************************/
-
-/*
- * Busy-wait, in 1us steps, until *sem becomes non-zero or LOOP_TIMEOUT
- * steps have passed. Returns 0 on success and -EIO on timeout.
- */
-static int wait_on_sem(volatile u64 *sem)
-{
-	int spins;
-
-	for (spins = 0; *sem == 0 && spins < LOOP_TIMEOUT; spins += 1)
-		udelay(1);
-
-	if (spins != LOOP_TIMEOUT)
-		return 0;
-
-	pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
-
-	return -EIO;
-}
-
-/*
- * Copy @cmd into the command buffer of @iommu at byte offset @tail and
- * write the advanced tail pointer to the hardware.
- */
-static void copy_cmd_to_buffer(struct amd_iommu *iommu,
-			       struct iommu_cmd *cmd,
-			       u32 tail)
-{
-	u8 *slot = iommu->cmd_buf + tail;
-	u32 new_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
-
-	/* place the command into its slot ... */
-	memcpy(slot, cmd, sizeof(*cmd));
-
-	/* ... and make the hardware aware of it */
-	writel(new_tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-}
-
-/*
- * Fill @cmd with a completion-wait command that carries the physical
- * address of @address together with the store flag. Warns if @address
- * is not 8 byte aligned.
- */
-static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
-{
-	u64 paddr;
-
-	WARN_ON(address & 0x7ULL);
-
-	paddr = __pa(address);
-
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
-	cmd->data[1] = upper_32_bits(paddr);
-	cmd->data[2] = 1;
-	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
-}
-
-/* Fill @cmd with a command of type CMD_INV_DEV_ENTRY for @devid. */
-static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
-{
-	memset(cmd, 0, sizeof(*cmd));
-
-	cmd->data[0] = devid;
-
-	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
-}
-
-/*
- * Fill @cmd with a CMD_INV_IOMMU_PAGES command for domain @domid.
- *
- * A range that fits into one page is invalidated by its address. Any
- * larger range is turned into CMD_INV_IOMMU_ALL_PAGES_ADDRESS with the
- * size bit set. A non-zero @pde additionally sets the PDE bit.
- */
-static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
-				  size_t size, u16 domid, int pde)
-{
-	bool multi_page = iommu_num_pages(address, size, PAGE_SIZE) > 1;
-
-	/* more than one page: use the all-pages address instead */
-	if (multi_page)
-		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
-	address &= PAGE_MASK;
-
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[1] |= domid;
-	cmd->data[2]  = lower_32_bits(address);
-	cmd->data[3]  = upper_32_bits(address);
-	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
-
-	/* size bit - the flush covers more than one 4kb page */
-	if (multi_page)
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-
-	/* PDE bit - requested by the caller through @pde */
-	if (pde)
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
-/*
- * Fill @cmd with a CMD_INV_IOTLB_PAGES command for device @devid using
- * queue depth @qdep. As in build_inv_iommu_pages(), a range larger than
- * one page is replaced by CMD_INV_IOMMU_ALL_PAGES_ADDRESS with the size
- * bit set.
- */
-static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
-				  u64 address, size_t size)
-{
-	bool multi_page = iommu_num_pages(address, size, PAGE_SIZE) > 1;
-
-	/* more than one page: use the all-pages address instead */
-	if (multi_page)
-		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
-	address &= PAGE_MASK;
-
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[0]  = devid;
-	cmd->data[0] |= (qdep & 0xff) << 24;
-	cmd->data[1]  = devid;
-	cmd->data[2]  = lower_32_bits(address);
-	cmd->data[3]  = upper_32_bits(address);
-	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
-
-	if (multi_page)
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-}
-
-/* Fill @cmd with a zeroed command of type CMD_INV_ALL. */
-static void build_inv_all(struct iommu_cmd *cmd)
-{
-	memset(cmd, 0, sizeof(*cmd));
-
-	CMD_SET_TYPE(cmd, CMD_INV_ALL);
-}
-
-/*
- * Put @cmd into the command buffer of @iommu and tell the hardware about
- * it.
- *
- * If the computed free space in the buffer is 2 or less, a completion
- * wait command is queued instead, the lock is dropped and the function
- * waits for that command to finish before it tries again.
- *
- * Returns 0 on success or the error reported by wait_on_sem().
- */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
-	u32 rd, wr, wr_next, room;
-	unsigned long irq_flags;
-
-	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
-
-	for (;;) {
-		struct iommu_cmd sync_cmd;
-		volatile u64 sem = 0;
-		int ret;
-
-		spin_lock_irqsave(&iommu->lock, irq_flags);
-
-		rd      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-		wr      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-		wr_next = (wr + sizeof(*cmd)) % iommu->cmd_buf_size;
-		room    = (rd - wr_next) % iommu->cmd_buf_size;
-
-		/* enough room - leave the loop with the lock still held */
-		if (room > 2)
-			break;
-
-		build_completion_wait(&sync_cmd, (u64)&sem);
-		copy_cmd_to_buffer(iommu, &sync_cmd, wr);
-
-		spin_unlock_irqrestore(&iommu->lock, irq_flags);
-
-		ret = wait_on_sem(&sem);
-		if (ret != 0)
-			return ret;
-	}
-
-	copy_cmd_to_buffer(iommu, cmd, wr);
-
-	/* We need to sync now to make sure all commands are processed */
-	iommu->need_sync = true;
-
-	spin_unlock_irqrestore(&iommu->lock, irq_flags);
-
-	return 0;
-}
-
-/*
- * Queue a completion wait command on @iommu and wait until it has been
- * executed. Nothing is done when iommu->need_sync is not set.
- *
- * Returns 0 on success or the error of queueing or waiting.
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
-{
-	struct iommu_cmd wait_cmd;
-	volatile u64 sem = 0;
-	int ret = 0;
-
-	if (iommu->need_sync) {
-		build_completion_wait(&wait_cmd, (u64)&sem);
-
-		ret = iommu_queue_command(iommu, &wait_cmd);
-		if (!ret)
-			ret = wait_on_sem(&sem);
-	}
-
-	return ret;
-}
-
-/*
- * Queue a device table entry invalidation for @devid on @iommu.
- * Returns the result of iommu_queue_command().
- */
-static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
-{
-	struct iommu_cmd inv_cmd;
-	int ret;
-
-	build_inv_dte(&inv_cmd, devid);
-	ret = iommu_queue_command(iommu, &inv_cmd);
-
-	return ret;
-}
-
-/*
- * Queue a device table entry invalidation for every device id from 0 to
- * 0xffff on @iommu and wait for completion. Errors of the individual
- * commands are not evaluated.
- */
-static void iommu_flush_dte_all(struct amd_iommu *iommu)
-{
-	u32 id = 0;
-
-	while (id <= 0xffff) {
-		iommu_flush_dte(iommu, id);
-		++id;
-	}
-
-	iommu_completion_wait(iommu);
-}
-
-/*
- * Queue a full TLB invalidation (including PDEs) for every domain id from
- * 0 to 0xffff on @iommu and wait for completion.
- *
- * This function uses heavy locking and may disable irqs for some time. But
- * this is no issue because it is only called during resume.
- */
-static void iommu_flush_tlb_all(struct amd_iommu *iommu)
-{
-	struct iommu_cmd inv_cmd;
-	u32 id = 0;
-
-	while (id <= 0xffff) {
-		build_inv_iommu_pages(&inv_cmd, 0,
-				      CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
-				      id, 1);
-		iommu_queue_command(iommu, &inv_cmd);
-		++id;
-	}
-
-	iommu_completion_wait(iommu);
-}
-
-/* Queue a single CMD_INV_ALL command on @iommu and wait for completion. */
-static void iommu_flush_all(struct amd_iommu *iommu)
-{
-	struct iommu_cmd inv_cmd;
-
-	build_inv_all(&inv_cmd);
-	iommu_queue_command(iommu, &inv_cmd);
-
-	iommu_completion_wait(iommu);
-}
-
-/*
- * Flush all caches of @iommu. With FEATURE_IA this is one invalidate-all
- * command, without it all DTEs and all domain TLBs are flushed one by one.
- */
-void iommu_flush_all_caches(struct amd_iommu *iommu)
-{
-	if (!iommu_feature(iommu, FEATURE_IA)) {
-		iommu_flush_dte_all(iommu);
-		iommu_flush_tlb_all(iommu);
-		return;
-	}
-
-	iommu_flush_all(iommu);
-}
-
-/*
- * Queue an IOTLB invalidation for the range @address/@size of @dev on
- * the IOMMU found for it in amd_iommu_rlookup_table.
- * Returns the result of iommu_queue_command().
- */
-static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	int qdep = pci_ats_queue_depth(pdev);
-	u16 devid = get_device_id(dev);
-	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-	struct iommu_cmd inv_cmd;
-
-	build_inv_iotlb_pages(&inv_cmd, devid, qdep, address, size);
-
-	return iommu_queue_command(iommu, &inv_cmd);
-}
-
-/*
- * Queue an invalidation of the device table entry of @dev. If that
- * worked and ATS is enabled on the device, its complete IOTLB is
- * invalidated as well. Returns 0 or the error of the failing command.
- */
-static int device_flush_dte(struct device *dev)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	u16 devid = get_device_id(dev);
-	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-	int ret;
-
-	ret = iommu_flush_dte(iommu, devid);
-
-	if (!ret && pci_ats_enabled(pdev))
-		ret = device_flush_iotlb(dev, 0, ~0UL);
-
-	return ret;
-}
-
-/*
- * TLB invalidation function used by the mapping code.
- *
- * One invalidation command is built for the range and queued on every
- * IOMMU that has devices of @domain behind it. After that the IOTLB of
- * each ATS enabled device in the domain is flushed for the same range.
- * A failure of any command triggers a single WARN_ON at the end.
- */
-static void __domain_flush_pages(struct protection_domain *domain,
-				 u64 address, size_t size, int pde)
-{
-	struct iommu_dev_data *entry;
-	struct iommu_cmd inv_cmd;
-	int failed = 0;
-	int idx;
-
-	build_inv_iommu_pages(&inv_cmd, address, size, domain->id, pde);
-
-	for (idx = 0; idx < amd_iommus_present; ++idx) {
-		/* only IOMMUs with devices of this domain need the flush */
-		if (domain->dev_iommu[idx])
-			failed |= iommu_queue_command(amd_iommus[idx],
-						      &inv_cmd);
-	}
-
-	list_for_each_entry(entry, &domain->dev_list, list) {
-		if (pci_ats_enabled(to_pci_dev(entry->dev)))
-			failed |= device_flush_iotlb(entry->dev, address,
-						     size);
-	}
-
-	WARN_ON(failed);
-}
-
-/* Flush the range @address/@size of @domain without the PDE bit. */
-static void domain_flush_pages(struct protection_domain *domain,
-			       u64 address, size_t size)
-{
-	const int no_pde = 0;
-
-	__domain_flush_pages(domain, address, size, no_pde);
-}
-
-/* Flush the complete IO/TLB of @domain, PDE bit not set */
-static void domain_flush_tlb(struct protection_domain *domain)
-{
-	const int no_pde = 0;
-
-	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
-			     no_pde);
-}
-
-/* Flush the complete IO/TLB of @domain with the PDE bit set */
-static void domain_flush_tlb_pde(struct protection_domain *domain)
-{
-	const int with_pde = 1;
-
-	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
-			     with_pde);
-}
-
-/*
- * Wait for completion of all queued commands on every IOMMU that has
- * devices of @domain behind it.
- */
-static void domain_flush_complete(struct protection_domain *domain)
-{
-	int idx = 0;
-
-	while (idx < amd_iommus_present) {
-		if (domain->dev_iommu[idx])
-			iommu_completion_wait(amd_iommus[idx]);
-		++idx;
-	}
-}
-
-
-/*
- * Flush the device table entry of every device on the dev_list of
- * @domain. The list is walked under domain->lock with irqs disabled.
- */
-static void domain_flush_devices(struct protection_domain *domain)
-{
-	struct iommu_dev_data *entry;
-	unsigned long irq_flags;
-
-	spin_lock_irqsave(&domain->lock, irq_flags);
-
-	list_for_each_entry(entry, &domain->dev_list, list) {
-		device_flush_dte(entry->dev);
-	}
-
-	spin_unlock_irqrestore(&domain->lock, irq_flags);
-}
-
-/****************************************************************************
- *
- * The functions below are used to create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-/*
- * Add one more level on top of the IO page table of @domain. Each new
- * level increases the size of the address space by 9 bits, up to a size
- * of 64 bits.
- *
- * Returns true if a level was added and false if the table already has
- * six levels or no page could be allocated.
- */
-static bool increase_address_space(struct protection_domain *domain,
-				   gfp_t gfp)
-{
-	u64 *new_root;
-
-	/* address space already 64 bit large */
-	if (domain->mode == PAGE_MODE_6_LEVEL)
-		return false;
-
-	new_root = (void *)get_zeroed_page(gfp);
-	if (new_root == NULL)
-		return false;
-
-	/* first entry of the new root points to the old root */
-	new_root[0] = PM_LEVEL_PDE(domain->mode,
-				   virt_to_phys(domain->pt_root));
-
-	domain->pt_root  = new_root;
-	domain->mode    += 1;
-	domain->updated  = true;
-
-	return true;
-}
-
-/*
- * Walk the page table of @domain down to the level that holds PTEs of
- * size @page_size for @address and allocate missing page table pages on
- * the way.
- *
- * @domain:    protection domain whose page table is walked
- * @address:   IO virtual address to look up
- * @page_size: size of the mapping, must be a power of 2
- * @pte_page:  if not NULL, receives the page table page of the last level
- * @gfp:       allocation flags for new page table pages
- *
- * Returns a pointer to the PTE or NULL if memory could not be allocated,
- * the address space could not be grown far enough, or a PDE of an
- * unexpected level was found.
- */
-static u64 *alloc_pte(struct protection_domain *domain,
-		      unsigned long address,
-		      unsigned long page_size,
-		      u64 **pte_page,
-		      gfp_t gfp)
-{
-	int level, end_lvl;
-	u64 *pte, *page;
-
-	BUG_ON(!is_power_of_2(page_size));
-
-	/*
-	 * Grow the address space until it covers the address. Give up when
-	 * growing fails instead of retrying forever.
-	 */
-	while (address > PM_LEVEL_SIZE(domain->mode)) {
-		if (!increase_address_space(domain, gfp))
-			return NULL;
-	}
-
-	level   = domain->mode - 1;
-	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-	address = PAGE_SIZE_ALIGN(address, page_size);
-	end_lvl = PAGE_SIZE_LEVEL(page_size);
-
-	while (level > end_lvl) {
-		if (!IOMMU_PTE_PRESENT(*pte)) {
-			page = (u64 *)get_zeroed_page(gfp);
-			if (!page)
-				return NULL;
-			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
-		}
-
-		/* No level skipping support yet */
-		if (PM_PTE_LEVEL(*pte) != level)
-			return NULL;
-
-		level -= 1;
-
-		pte = IOMMU_PTE_PAGE(*pte);
-
-		/* report the page of the final level to the caller */
-		if (pte_page && level == end_lvl)
-			*pte_page = pte;
-
-		pte = &pte[PM_LEVEL_INDEX(level, address)];
-	}
-
-	return pte;
-}
-
-/*
- * Look up the PTE for @address in the page table of @domain without
- * allocating anything.
- *
- * Returns NULL if the address is outside of the address space, an entry
- * on the way is not present, or a PDE has an unexpected level. For a
- * large PTE (level field 0x07) the pointer to the first PTE of the
- * series is returned.
- */
-static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
-{
-	u64 *entry;
-	int lvl;
-
-	if (address > PM_LEVEL_SIZE(domain->mode))
-		return NULL;
-
-	lvl   = domain->mode - 1;
-	entry = &domain->pt_root[PM_LEVEL_INDEX(lvl, address)];
-
-	for (; lvl > 0; ) {
-
-		/* Not Present */
-		if (!IOMMU_PTE_PRESENT(*entry))
-			return NULL;
-
-		/* Large PTE */
-		if (PM_PTE_LEVEL(*entry) == 0x07) {
-			unsigned long mask, first;
-
-			/*
-			 * A large mapping is a series of PTEs. Mask the
-			 * pointer down to the first one of that series.
-			 */
-			mask  = PTE_PAGE_SIZE(*entry);
-			mask  = ~((PAGE_SIZE_PTE_COUNT(mask) << 3) - 1);
-			first = ((unsigned long)entry) & mask;
-
-			return (u64 *)first;
-		}
-
-		/* No level skipping support yet */
-		if (PM_PTE_LEVEL(*entry) != lvl)
-			return NULL;
-
-		lvl -= 1;
-
-		/* Step down into the next level */
-		entry = IOMMU_PTE_PAGE(*entry);
-		entry = &entry[PM_LEVEL_INDEX(lvl, address)];
-	}
-
-	return entry;
-}
-
-/*
- * Generic mapping function. It maps a physical address into a DMA
- * address space and allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- *
- * @dom:       protection domain to map into
- * @bus_addr:  IO virtual address of the mapping
- * @phys_addr: physical address to map
- * @prot:      IOMMU_PROT_* bits, at least one bit of IOMMU_PROT_MASK
- * @page_size: size of the mapping
- *
- * Returns 0 on success, -EINVAL if @prot has no valid bit set, -ENOMEM
- * if no PTE could be allocated and -EBUSY if one of the PTEs is already
- * present.
- */
-static int iommu_map_page(struct protection_domain *dom,
-			  unsigned long bus_addr,
-			  unsigned long phys_addr,
-			  int prot,
-			  unsigned long page_size)
-{
-	u64 __pte, *pte;
-	int i, count;
-
-	if (!(prot & IOMMU_PROT_MASK))
-		return -EINVAL;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-	count     = PAGE_SIZE_PTE_COUNT(page_size);
-	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
-
-	/* alloc_pte() returns NULL on failure - do not touch pte[] then */
-	if (!pte)
-		return -ENOMEM;
-
-	for (i = 0; i < count; ++i)
-		if (IOMMU_PTE_PRESENT(pte[i]))
-			return -EBUSY;
-
-	if (page_size > PAGE_SIZE) {
-		/* large mapping: size is encoded in the address, level is 7 */
-		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
-		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
-	} else
-		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
-	if (prot & IOMMU_PROT_IR)
-		__pte |= IOMMU_PTE_IR;
-	if (prot & IOMMU_PROT_IW)
-		__pte |= IOMMU_PTE_IW;
-
-	/* every PTE of the series gets the same value */
-	for (i = 0; i < count; ++i)
-		pte[i] = __pte;
-
-	update_domain(dom);
-
-	return 0;
-}
-
-/*
- * Clear the PTEs that map the range of @page_size bytes starting at
- * @bus_addr in @dom. @page_size must be a power of 2.
- *
- * Addresses without a PTE are skipped in 4kb steps, a large PTE is
- * cleared as a whole. Returns the number of bytes walked, which is
- * checked to be a power of 2 as well.
- */
-static unsigned long iommu_unmap_page(struct protection_domain *dom,
-				      unsigned long bus_addr,
-				      unsigned long page_size)
-{
-	unsigned long long step, done = 0;
-	u64 *pte;
-
-	BUG_ON(!is_power_of_2(page_size));
-
-	for (; done < page_size; done += step) {
-
-		pte = fetch_pte(dom, bus_addr);
-
-		if (pte && PM_PTE_LEVEL(*pte) != 0) {
-			int n, cnt;
-
-			/* Large PTE found which maps this address */
-			step = PTE_PAGE_SIZE(*pte);
-			cnt  = PAGE_SIZE_PTE_COUNT(step);
-			for (n = 0; n < cnt; n++)
-				pte[n] = 0ULL;
-		} else {
-			/*
-			 * Either a 4kb PTE or no PTE at all for this
-			 * address - move forward by one 4kb page
-			 */
-			step = PAGE_SIZE;
-			if (pte)
-				*pte = 0ULL;
-		}
-
-		bus_addr = (bus_addr & ~(step - 1)) + step;
-	}
-
-	BUG_ON(!is_power_of_2(done));
-
-	return done;
-}
-
-/*
- * This function checks if a specific unity mapping entry is needed for
- * this specific IOMMU.
- *
- * Returns 1 if the alias of any device id in the range of @entry is
- * handled by @iommu according to amd_iommu_rlookup_table, 0 otherwise.
- */
-static int iommu_for_unity_map(struct amd_iommu *iommu,
-			       struct unity_map_entry *entry)
-{
-	/*
-	 * The counter is wider than 16 bit on purpose: with a u16 counter
-	 * the inclusive loop below never ends for devid_end == 0xffff.
-	 */
-	u32 i;
-	u16 bdf;
-
-	for (i = entry->devid_start; i <= entry->devid_end; ++i) {
-		bdf = amd_iommu_alias_table[i];
-		if (amd_iommu_rlookup_table[bdf] == iommu)
-			return 1;
-	}
-
-	return 0;
-}
-
-/*
- * This function actually applies the mapping to the page table of the
- * dma_ops domain.
- *
- * Every page of the range of @e is mapped 1:1 with the protection bits
- * of the entry. Pages inside the aperture are also marked as allocated
- * in the address allocator.
- *
- * Returns 0 on success or the error of iommu_map_page().
- */
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
-			     struct unity_map_entry *e)
-{
-	u64 addr;
-	int ret;
-
-	for (addr = e->address_start; addr < e->address_end;
-	     addr += PAGE_SIZE) {
-		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PAGE_SIZE);
-		if (ret)
-			return ret;
-		/*
-		 * if unity mapping is in aperture range mark the page
-		 * as allocated in the aperture. Each aperture range has
-		 * its own bitmap, so pick the range the page belongs to
-		 * and use the page number relative to that range.
-		 */
-		if (addr < dma_dom->aperture_size) {
-			unsigned long page = addr >> PAGE_SHIFT;
-			int index = page / APERTURE_RANGE_PAGES;
-
-			__set_bit(page % APERTURE_RANGE_PAGES,
-				  dma_dom->aperture[index]->bitmap);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Set up the unity mappings of one IOMMU.
- *
- * All entries of amd_iommu_unity_map that are relevant for @iommu are
- * applied to the default domain of that IOMMU. The first error stops
- * the walk and is returned, otherwise the result is 0.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
-	struct unity_map_entry *e;
-	int ret = 0;
-
-	list_for_each_entry(e, &amd_iommu_unity_map, list) {
-		if (iommu_for_unity_map(iommu, e)) {
-			ret = dma_ops_unity_map(iommu->default_dom, e);
-			if (ret)
-				break;
-		}
-	}
-
-	return ret;
-}
-
-/*
- * Apply every unity mapping whose device id range contains @devid to
- * @dma_dom. The first error stops the walk and is returned, otherwise
- * the result is 0.
- */
-static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
-					  u16 devid)
-{
-	struct unity_map_entry *entry;
-	int ret = 0;
-
-	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
-		if (devid < entry->devid_start || devid > entry->devid_end)
-			continue;
-
-		ret = dma_ops_unity_map(dma_dom, entry);
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the address allocator for the dma_ops
- * interface functions. They work like the allocators in the other IOMMU
- * drivers. It is basically a bitmap which marks the allocated pages in
- * the aperture. Maybe it could be enhanced in the future to a more
- * efficient allocator.
- *
- ****************************************************************************/
-
-/*
- * The address allocator core functions.
- *
- * called with domain->lock held
- */
-
-/*
- * Mark @pages pages starting at page number @start_page as allocated in
- * the aperture of @dom (used e.g. for exclusion ranges). The range is
- * cut off at the end of the aperture.
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
-				      unsigned long start_page,
-				      unsigned int pages)
-{
-	unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
-	unsigned int cur;
-
-	if (start_page + pages > last_page)
-		pages = last_page - start_page;
-
-	cur = start_page;
-	while (cur < start_page + pages) {
-		/* split the page number into range index and bit number */
-		int index = cur / APERTURE_RANGE_PAGES;
-		int bit   = cur % APERTURE_RANGE_PAGES;
-
-		__set_bit(bit, dom->aperture[index]->bitmap);
-		++cur;
-	}
-}
-
-/*
- * This function is used to add a new aperture range to an existing
- * aperture in case of dma_ops domain allocation or address allocation
- * failure.
- *
- * @dma_dom:  domain whose aperture grows by APERTURE_RANGE_SIZE
- * @populate: if true, the page table pages covering the new range are
- *            allocated right away (always false with CONFIG_IOMMU_STRESS)
- * @gfp:      allocation flags for everything allocated here
- *
- * Returns 0 on success and -ENOMEM if the maximum number of ranges is
- * reached or an allocation failed.
- */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
-			   bool populate, gfp_t gfp)
-{
-	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	struct amd_iommu *iommu;
-	unsigned long i;
-
-#ifdef CONFIG_IOMMU_STRESS
-	populate = false;
-#endif
-
-	if (index >= APERTURE_MAX_RANGES)
-		return -ENOMEM;
-
-	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
-	if (!dma_dom->aperture[index])
-		return -ENOMEM;
-
-	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
-	if (!dma_dom->aperture[index]->bitmap)
-		goto out_free;
-
-	dma_dom->aperture[index]->offset = dma_dom->aperture_size;
-
-	if (populate) {
-		unsigned long address = dma_dom->aperture_size;
-		int n, num_ptes = APERTURE_RANGE_PAGES / 512;
-		u64 *pte, *pte_page;
-
-		/* one page table page for each 512 pages of the range */
-		for (n = 0; n < num_ptes; ++n) {
-			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
-					&pte_page, gfp);
-			if (!pte)
-				goto out_free;
-
-			dma_dom->aperture[index]->pte_pages[n] = pte_page;
-
-			address += APERTURE_RANGE_SIZE / 64;
-		}
-	}
-
-	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
-
-	/* Initialize the exclusion range if necessary */
-	for_each_iommu(iommu) {
-		if (iommu->exclusion_start &&
-		    iommu->exclusion_start >= dma_dom->aperture[index]->offset
-		    && iommu->exclusion_start < dma_dom->aperture_size) {
-			unsigned long startpage;
-			int pages = iommu_num_pages(iommu->exclusion_start,
-						    iommu->exclusion_length,
-						    PAGE_SIZE);
-			startpage = iommu->exclusion_start >> PAGE_SHIFT;
-			dma_ops_reserve_addresses(dma_dom, startpage, pages);
-		}
-	}
-
-	/*
-	 * Check for areas already mapped as present in the new aperture
-	 * range and mark those pages as reserved in the allocator. Such
-	 * mappings may already exist as a result of requested unity
-	 * mappings for devices.
-	 */
-	for (i = dma_dom->aperture[index]->offset;
-	     i < dma_dom->aperture_size;
-	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i);
-		if (!pte || !IOMMU_PTE_PRESENT(*pte))
-			continue;
-
-		/* i is an address, the allocator takes a page number */
-		dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT, 1);
-	}
-
-	update_domain(&dma_dom->domain);
-
-	return 0;
-
-out_free:
-	update_domain(&dma_dom->domain);
-
-	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
-
-	kfree(dma_dom->aperture[index]);
-	dma_dom->aperture[index] = NULL;
-
-	return -ENOMEM;
-}
-
-/*
- * Search the aperture ranges of @dom, beginning with the range that
- * contains @start, for @pages free pages that respect @align_mask, the
- * segment boundary of @dev and @dma_mask.
- *
- * In the first range the search starts at the bit derived from
- * dom->next_address, in all following ranges at bit 0. On success
- * dom->next_address is moved behind the allocation and the address is
- * returned, otherwise the result is -1.
- */
-static unsigned long dma_ops_area_alloc(struct device *dev,
-					struct dma_ops_domain *dom,
-					unsigned int pages,
-					unsigned long align_mask,
-					u64 dma_mask,
-					unsigned long start)
-{
-	unsigned long start_bit = dom->next_address % APERTURE_RANGE_SIZE;
-	int range_cnt = dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	int idx = start >> APERTURE_RANGE_SHIFT;
-	unsigned long seg_pages;
-	unsigned long address = -1;
-	unsigned long limit;
-
-	start_bit >>= PAGE_SHIFT;
-
-	seg_pages = ALIGN(dma_get_seg_boundary(dev) + 1,
-			  PAGE_SIZE) >> PAGE_SHIFT;
-
-	while (idx < range_cnt) {
-		struct aperture_range *range = dom->aperture[idx];
-		unsigned long offset = range->offset >> PAGE_SHIFT;
-
-		/* ranges starting at or above the mask are out of reach */
-		if (range->offset >= dma_mask)
-			break;
-
-		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
-					       dma_mask >> PAGE_SHIFT);
-
-		address = iommu_area_alloc(range->bitmap,
-					   limit, start_bit, pages, 0,
-					   seg_pages, align_mask);
-		if (address != -1) {
-			address = range->offset + (address << PAGE_SHIFT);
-			dom->next_address = address + (pages << PAGE_SHIFT);
-			break;
-		}
-
-		start_bit = 0;
-		++idx;
-	}
-
-	return address;
-}
-
-/*
- * Allocate @pages pages of address space from the aperture of @dom.
- *
- * The search starts at dom->next_address. If that fails it is repeated
- * from the beginning of the aperture and the domain is marked as in need
- * of a flush. Returns the address or DMA_ERROR_CODE if both searches
- * failed.
- */
-static unsigned long dma_ops_alloc_addresses(struct device *dev,
-					     struct dma_ops_domain *dom,
-					     unsigned int pages,
-					     unsigned long align_mask,
-					     u64 dma_mask)
-{
-	unsigned long addr;
-
-#ifdef CONFIG_IOMMU_STRESS
-	dom->next_address = 0;
-	dom->need_flush = true;
-#endif
-
-	addr = dma_ops_area_alloc(dev, dom, pages, align_mask,
-				  dma_mask, dom->next_address);
-
-	if (addr == -1) {
-		/* second try: wrap around to the start of the aperture */
-		dom->next_address = 0;
-		addr = dma_ops_area_alloc(dev, dom, pages, align_mask,
-					  dma_mask, 0);
-		dom->need_flush = true;
-	}
-
-	if (unlikely(addr == -1))
-		addr = DMA_ERROR_CODE;
-
-	WARN_ON((addr + (PAGE_SIZE*pages)) > dom->aperture_size);
-
-	return addr;
-}
-
-/*
- * The address free function.
- *
- * Clears @pages bits, starting at the bit for @address, in the bitmap of
- * the aperture range that contains @address. The domain is marked as in
- * need of a flush if the freed address is not below dom->next_address.
- *
- * called with domain->lock held
- */
-static void dma_ops_free_addresses(struct dma_ops_domain *dom,
-				   unsigned long address,
-				   unsigned int pages)
-{
-	unsigned i = address >> APERTURE_RANGE_SHIFT;
-	struct aperture_range *range;
-
-	/* validate the index before it is used on the aperture array */
-	BUG_ON(i >= APERTURE_MAX_RANGES);
-
-	range = dom->aperture[i];
-	BUG_ON(range == NULL);
-
-#ifdef CONFIG_IOMMU_STRESS
-	if (i < 4)
-		return;
-#endif
-
-	if (address >= dom->next_address)
-		dom->need_flush = true;
-
-	/* from here on: page number relative to the start of the range */
-	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
-
-	bitmap_clear(range->bitmap, address, pages);
-
-}
-
-/****************************************************************************
- *
- * The next functions belong to the domain allocation. A domain is
- * allocated for every IOMMU as the default domain. If device isolation
- * is enabled, every device gets its own domain. The most important thing
- * about domains is the page table mapping the DMA address space they
- * contain.
- *
- ****************************************************************************/
-
-/*
- * Put @domain on the global protection domain list. The list is
- * modified under amd_iommu_pd_lock.
- */
-static void add_domain_to_list(struct protection_domain *domain)
-{
-	unsigned long irq_flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, irq_flags);
-
-	list_add(&domain->list, &amd_iommu_pd_list);
-
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, irq_flags);
-}
-
-/*
- * Take @domain off the global protection domain list. The list is
- * modified under amd_iommu_pd_lock.
- */
-static void del_domain_from_list(struct protection_domain *domain)
-{
-	unsigned long irq_flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, irq_flags);
-
-	list_del(&domain->list);
-
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, irq_flags);
-}
-
-/*
- * Allocate the first free domain id from amd_iommu_pd_alloc_bitmap.
- * Returns the id, or 0 if no id below MAX_DOMAIN_ID is free. Finding id
- * 0 itself free is treated as a bug.
- */
-static u16 domain_id_alloc(void)
-{
-	unsigned long irq_flags;
-	int id;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, irq_flags);
-
-	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
-	BUG_ON(id == 0);
-
-	if (id <= 0 || id >= MAX_DOMAIN_ID)
-		id = 0;
-	else
-		__set_bit(id, amd_iommu_pd_alloc_bitmap);
-
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, irq_flags);
-
-	return id;
-}
-
-/*
- * Release domain id @id in amd_iommu_pd_alloc_bitmap. Ids outside of the
- * range 1 .. MAX_DOMAIN_ID - 1 are ignored.
- */
-static void domain_id_free(int id)
-{
-	unsigned long irq_flags;
-	bool valid = id > 0 && id < MAX_DOMAIN_ID;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, irq_flags);
-
-	if (valid)
-		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
-
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, irq_flags);
-}
-
-/*
- * Free the page table of @domain: the root page, every page referenced
- * by a present entry of the root, and every page referenced by a present
- * entry of those. pt_root is set to NULL afterwards. Nothing is done if
- * the domain has no page table.
- */
-static void free_pagetable(struct protection_domain *domain)
-{
-	u64 *root = domain->pt_root;
-	u64 *mid, *leaf;
-	int i, j;
-
-	if (root == NULL)
-		return;
-
-	for (i = 0; i < 512; ++i) {
-		if (IOMMU_PTE_PRESENT(root[i])) {
-			mid = IOMMU_PTE_PAGE(root[i]);
-
-			for (j = 0; j < 512; ++j) {
-				if (IOMMU_PTE_PRESENT(mid[j])) {
-					leaf = IOMMU_PTE_PAGE(mid[j]);
-					free_page((unsigned long)leaf);
-				}
-			}
-
-			free_page((unsigned long)mid);
-		}
-	}
-
-	free_page((unsigned long)root);
-
-	domain->pt_root = NULL;
-}
-
-/*
- * Free a dma_ops domain: remove it from the global domain list, free its
- * page table, give back its domain id and free all aperture ranges and
- * the domain structure itself. A NULL pointer is ignored.
- */
-static void dma_ops_domain_free(struct dma_ops_domain *dom)
-{
-	int i;
-
-	if (!dom)
-		return;
-
-	del_domain_from_list(&dom->domain);
-
-	free_pagetable(&dom->domain);
-
-	/*
-	 * Give the id back to the allocator, otherwise it is lost for
-	 * good. domain_id_free() ignores id 0, which is what the domain
-	 * carries when the id allocation itself had failed.
-	 */
-	domain_id_free(dom->domain.id);
-
-	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
-		if (!dom->aperture[i])
-			continue;
-		free_page((unsigned long)dom->aperture[i]->bitmap);
-		kfree(dom->aperture[i]);
-	}
-
-	kfree(dom);
-}
-
-/*
- * Allocates a new protection domain usable for the dma_ops functions.
- * It also initializes the page table and the address allocator data
- * structures required for the dma_ops interface.
- *
- * Returns the new domain or NULL if an allocation failed or no domain id
- * was available.
- */
-static struct dma_ops_domain *dma_ops_domain_alloc(void)
-{
-	struct dma_ops_domain *dma_dom;
-
-	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
-	if (!dma_dom)
-		return NULL;
-
-	spin_lock_init(&dma_dom->domain.lock);
-
-	/*
-	 * The error path below ends in dma_ops_domain_free(), which removes
-	 * the domain from the global list. Initialize the list node so that
-	 * this is safe even when the domain was not added to the list yet.
-	 */
-	INIT_LIST_HEAD(&dma_dom->domain.list);
-
-	dma_dom->domain.id = domain_id_alloc();
-	if (dma_dom->domain.id == 0)
-		goto free_dma_dom;
-	INIT_LIST_HEAD(&dma_dom->domain.dev_list);
-	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
-	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
-	dma_dom->domain.flags = PD_DMA_OPS_MASK;
-	dma_dom->domain.priv = dma_dom;
-	if (!dma_dom->domain.pt_root)
-		goto free_dma_dom;
-
-	dma_dom->need_flush = false;
-	dma_dom->target_dev = 0xffff;
-
-	add_domain_to_list(&dma_dom->domain);
-
-	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
-		goto free_dma_dom;
-
-	/*
-	 * mark the first page as allocated so we never return 0 as
-	 * a valid dma-address. So we can use 0 as error value
-	 */
-	dma_dom->aperture[0]->bitmap[0] = 1;
-	dma_dom->next_address = 0;
-
-
-	return dma_dom;
-
-free_dma_dom:
-	dma_ops_domain_free(dma_dom);
-
-	return NULL;
-}
-
-/*
- * Tell whether @domain is a dma_ops domain, i.e. whether it has
- * PD_DMA_OPS_MASK set in its flags.
- */
-static bool dma_ops_domain(struct protection_domain *domain)
-{
-	if (domain->flags & PD_DMA_OPS_MASK)
-		return true;
-
-	return false;
-}
-
-/*
- * Write the page table root, paging mode and id of @domain into the
- * device table entry of @devid. With @ats set, DTE_FLAG_IOTLB is added
- * to the flags word of the entry.
- */
-static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
-{
-	u32 flags = ats ? DTE_FLAG_IOTLB : 0;
-	u64 root;
-
-	root  = virt_to_phys(domain->pt_root);
-	root |= (domain->mode & DEV_ENTRY_MODE_MASK)
-		<< DEV_ENTRY_MODE_SHIFT;
-	root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
-
-	/* the words are written from the highest to the lowest index */
-	amd_iommu_dev_table[devid].data[3] |= flags;
-	amd_iommu_dev_table[devid].data[2]  = domain->id;
-	amd_iommu_dev_table[devid].data[1]  = upper_32_bits(root);
-	amd_iommu_dev_table[devid].data[0]  = lower_32_bits(root);
-}
-
-/*
- * Reset the device table entry of @devid as seen by the hardware: word 0
- * keeps only IOMMU_PTE_P and IOMMU_PTE_TV, words 1 and 2 are cleared.
- * The erratum 63 handling is applied to the entry afterwards.
- */
-static void clear_dte_entry(u16 devid)
-{
-	amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
-	amd_iommu_dev_table[devid].data[1] = 0;
-	amd_iommu_dev_table[devid].data[2] = 0;
-
-	amd_iommu_apply_erratum_63(devid);
-}
-
-/*
- * Bind @dev to @domain: update the per-device data, the device list and
- * the reference counters of the domain, write the device table entry and
- * flush it.
- */
-static void do_attach(struct device *dev, struct protection_domain *domain)
-{
-	u16 devid = get_device_id(dev);
-	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-	struct iommu_dev_data *dev_data = get_dev_data(dev);
-	struct pci_dev *pdev = to_pci_dev(dev);
-	bool ats = false;
-
-	/* ATS state of the device only matters with IOTLB support */
-	if (amd_iommu_iotlb_sup)
-		ats = pci_ats_enabled(pdev);
-
-	/* Update data structures */
-	dev_data->domain = domain;
-	list_add(&dev_data->list, &domain->dev_list);
-	set_dte_entry(devid, domain, ats);
-
-	/* Do reference counting */
-	domain->dev_iommu[iommu->index] += 1;
-	domain->dev_cnt                 += 1;
-
-	/* Flush the DTE entry */
-	device_flush_dte(dev);
-}
-
-/*
- * Undo do_attach() for @dev: drop the reference counters of its domain,
- * unlink the per-device data, clear the device table entry and flush it.
- */
-static void do_detach(struct device *dev)
-{
-	u16 devid = get_device_id(dev);
-	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-	struct iommu_dev_data *dev_data = get_dev_data(dev);
-	struct protection_domain *dom = dev_data->domain;
-
-	/* decrease reference counters */
-	dom->dev_iommu[iommu->index] -= 1;
-	dom->dev_cnt                 -= 1;
-
-	/* Update data structures */
-	dev_data->domain = NULL;
-	list_del(&dev_data->list);
-	clear_dte_entry(devid);
-
-	/* Flush the DTE entry */
-	device_flush_dte(dev);
-}
-
-/*
- * Attach @dev, and its alias if it has a different one, to @domain and
- * take a bind reference on each of them. A device that is already in
- * @domain only gets its bind counter increased.
- *
- * Returns 0 on success, -EINVAL if the alias has no device data and
- * -EBUSY if the device or its alias belongs to another domain.
- */
-static int __attach_device(struct device *dev,
-			   struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data = get_dev_data(dev);
-	struct iommu_dev_data *alias_data = get_dev_data(dev_data->alias);
-	int ret = -EBUSY;
-
-	if (!alias_data)
-		return -EINVAL;
-
-	/* lock domain */
-	spin_lock(&domain->lock);
-
-	/* Neither the alias nor the device may sit in a different domain */
-	if (alias_data->domain && alias_data->domain != domain)
-		goto out_unlock;
-
-	if (dev_data->domain && dev_data->domain != domain)
-		goto out_unlock;
-
-	/* Do real assignment, alias first */
-	if (dev_data->alias != dev) {
-		alias_data = get_dev_data(dev_data->alias);
-		if (!alias_data->domain)
-			do_attach(dev_data->alias, domain);
-
-		atomic_inc(&alias_data->bind);
-	}
-
-	if (!dev_data->domain)
-		do_attach(dev, domain);
-
-	atomic_inc(&dev_data->bind);
-
-	ret = 0;
-
-out_unlock:
-
-	/* ready */
-	spin_unlock(&domain->lock);
-
-	return ret;
-}
-
-/*
- * Locked variant of __attach_device(): enables ATS on the device when
- * IOTLBs are supported, attaches it under amd_iommu_devtable_lock and
- * flushes the TLB of the domain including PDEs afterwards.
- *
- * Returns the result of __attach_device().
- */
-static int attach_device(struct device *dev,
-			 struct protection_domain *domain)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	unsigned long irq_flags;
-	int ret;
-
-	if (amd_iommu_iotlb_sup)
-		pci_enable_ats(pdev, PAGE_SHIFT);
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, irq_flags);
-	ret = __attach_device(dev, domain);
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, irq_flags);
-
-	/*
-	 * We might boot into a crash-kernel here. The crashed kernel
-	 * left the caches in the IOMMU dirty. So we have to flush
-	 * here to evict all dirty stuff.
-	 */
-	domain_flush_tlb_pde(domain);
-
-	return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- *
- * The bind references of the device and of a differing alias are
- * dropped under the lock of the domain. Whoever reaches zero is really
- * detached.
- */
-static void __detach_device(struct device *dev)
-{
-	struct iommu_dev_data *dev_data = get_dev_data(dev);
-	struct protection_domain *old_dom;
-	unsigned long irq_flags;
-
-	BUG_ON(!dev_data->domain);
-
-	old_dom = dev_data->domain;
-
-	spin_lock_irqsave(&old_dom->lock, irq_flags);
-
-	if (dev_data->alias != dev) {
-		struct iommu_dev_data *alias_data;
-
-		alias_data = get_dev_data(dev_data->alias);
-		if (atomic_dec_and_test(&alias_data->bind))
-			do_detach(dev_data->alias);
-	}
-
-	if (atomic_dec_and_test(&dev_data->bind))
-		do_detach(dev);
-
-	spin_unlock_irqrestore(&old_dom->lock, irq_flags);
-
-	if (!iommu_pass_through)
-		return;
-
-	/*
-	 * If we run in passthrough mode the device must be assigned to the
-	 * passthrough domain if it is detached from any other domain.
-	 * Make sure we can deassign from the pt_domain itself.
-	 */
-	if (dev_data->domain == NULL && old_dom != pt_domain)
-		__attach_device(dev, pt_domain);
-}
-
-/*
- * Removes a device from a protection domain (with devtable_lock held)
- */
-static void detach_device(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
- unsigned long flags;
-
- /* lock device table */
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(dev);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
- pci_disable_ats(pdev);
-}
-
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(struct device *dev)
-{
- struct protection_domain *dom;
- struct iommu_dev_data *dev_data, *alias_data;
- unsigned long flags;
- u16 devid;
-
- devid = get_device_id(dev);
- dev_data = get_dev_data(dev);
- alias_data = get_dev_data(dev_data->alias);
- if (!alias_data)
- return NULL;
-
- read_lock_irqsave(&amd_iommu_devtable_lock, flags);
- dom = dev_data->domain;
- if (dom == NULL &&
- alias_data->domain != NULL) {
- __attach_device(dev, alias_data->domain);
- dom = alias_data->domain;
- }
-
- read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return dom;
-}
-
-static int device_change_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct device *dev = data;
- u16 devid;
- struct protection_domain *domain;
- struct dma_ops_domain *dma_domain;
- struct amd_iommu *iommu;
- unsigned long flags;
-
- if (!check_device(dev))
- return 0;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
-
- switch (action) {
- case BUS_NOTIFY_UNBOUND_DRIVER:
-
- domain = domain_for_device(dev);
-
- if (!domain)
- goto out;
- if (iommu_pass_through)
- break;
- detach_device(dev);
- break;
- case BUS_NOTIFY_ADD_DEVICE:
-
- iommu_init_device(dev);
-
- domain = domain_for_device(dev);
-
- /* allocate a protection domain if a device is added */
- dma_domain = find_protection_domain(devid);
- if (dma_domain)
- goto out;
- dma_domain = dma_ops_domain_alloc();
- if (!dma_domain)
- goto out;
- dma_domain->target_dev = devid;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
- list_add_tail(&dma_domain->list, &iommu_pd_list);
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- break;
- case BUS_NOTIFY_DEL_DEVICE:
-
- iommu_uninit_device(dev);
-
- default:
- goto out;
- }
-
- device_flush_dte(dev);
- iommu_completion_wait(iommu);
-
-out:
- return 0;
-}
-
-static struct notifier_block device_nb = {
- .notifier_call = device_change_notifier,
-};
-
-void amd_iommu_init_notifier(void)
-{
- bus_register_notifier(&pci_bus_type, &device_nb);
-}
-
-/*****************************************************************************
- *
- * The next functions belong to the dma_ops mapping/unmapping code.
- *
- *****************************************************************************/
-
-/*
- * In the dma_ops path we only have the struct device. This function
- * finds the corresponding IOMMU, the protection domain and the
- * requestor id for a given device.
- * If the device is not yet associated with a domain this is also done
- * in this function.
- */
-static struct protection_domain *get_domain(struct device *dev)
-{
- struct protection_domain *domain;
- struct dma_ops_domain *dma_dom;
- u16 devid = get_device_id(dev);
-
- if (!check_device(dev))
- return ERR_PTR(-EINVAL);
-
- domain = domain_for_device(dev);
- if (domain != NULL && !dma_ops_domain(domain))
- return ERR_PTR(-EBUSY);
-
- if (domain != NULL)
- return domain;
-
- /* Device not bount yet - bind it */
- dma_dom = find_protection_domain(devid);
- if (!dma_dom)
- dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
- attach_device(dev, &dma_dom->domain);
- DUMP_printk("Using protection domain %d for device %s\n",
- dma_dom->domain.id, dev_name(dev));
-
- return &dma_dom->domain;
-}
-
-static void update_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- struct pci_dev *pdev = to_pci_dev(dev_data->dev);
- u16 devid = get_device_id(dev_data->dev);
- set_dte_entry(devid, domain, pci_ats_enabled(pdev));
- }
-}
-
-static void update_domain(struct protection_domain *domain)
-{
- if (!domain->updated)
- return;
-
- update_device_table(domain);
-
- domain_flush_devices(domain);
- domain_flush_tlb_pde(domain);
-
- domain->updated = false;
-}
-
-/*
- * This function fetches the PTE for a given address in the aperture
- */
-static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte, *pte_page;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return NULL;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte) {
- pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
- GFP_ATOMIC);
- aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
- } else
- pte += PM_LEVEL_INDEX(0, address);
-
- update_domain(&dom->domain);
-
- return pte;
-}
-
-/*
- * This is the generic map function. It maps one 4kb page at paddr to
- * the given address in the DMA address space for the domain.
- */
-static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
- unsigned long address,
- phys_addr_t paddr,
- int direction)
-{
- u64 *pte, __pte;
-
- WARN_ON(address > dom->aperture_size);
-
- paddr &= PAGE_MASK;
-
- pte = dma_ops_get_pte(dom, address);
- if (!pte)
- return DMA_ERROR_CODE;
-
- __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
- if (direction == DMA_TO_DEVICE)
- __pte |= IOMMU_PTE_IR;
- else if (direction == DMA_FROM_DEVICE)
- __pte |= IOMMU_PTE_IW;
- else if (direction == DMA_BIDIRECTIONAL)
- __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
-
- WARN_ON(*pte);
-
- *pte = __pte;
-
- return (dma_addr_t)address;
-}
-
-/*
- * The generic unmapping function for on page in the DMA address space.
- */
-static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte;
-
- if (address >= dom->aperture_size)
- return;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte)
- return;
-
- pte += PM_LEVEL_INDEX(0, address);
-
- WARN_ON(!*pte);
-
- *pte = 0ULL;
-}
-
-/*
- * This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is used by all
- * mapping functions provided with this IOMMU driver.
- * Must be called with the domain lock held.
- */
-static dma_addr_t __map_single(struct device *dev,
- struct dma_ops_domain *dma_dom,
- phys_addr_t paddr,
- size_t size,
- int dir,
- bool align,
- u64 dma_mask)
-{
- dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start, ret;
- unsigned int pages;
- unsigned long align_mask = 0;
- int i;
-
- pages = iommu_num_pages(paddr, size, PAGE_SIZE);
- paddr &= PAGE_MASK;
-
- INC_STATS_COUNTER(total_map_requests);
-
- if (pages > 1)
- INC_STATS_COUNTER(cross_page);
-
- if (align)
- align_mask = (1UL << get_order(size)) - 1;
-
-retry:
- address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
- dma_mask);
- if (unlikely(address == DMA_ERROR_CODE)) {
- /*
- * setting next_address here will let the address
- * allocator only scan the new allocated range in the
- * first run. This is a small optimization.
- */
- dma_dom->next_address = dma_dom->aperture_size;
-
- if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
- goto out;
-
- /*
- * aperture was successfully enlarged by 128 MB, try
- * allocation again
- */
- goto retry;
- }
-
- start = address;
- for (i = 0; i < pages; ++i) {
- ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
- if (ret == DMA_ERROR_CODE)
- goto out_unmap;
-
- paddr += PAGE_SIZE;
- start += PAGE_SIZE;
- }
- address += offset;
-
- ADD_STATS_COUNTER(alloced_io_mem, size);
-
- if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- domain_flush_tlb(&dma_dom->domain);
- dma_dom->need_flush = false;
- } else if (unlikely(amd_iommu_np_cache))
- domain_flush_pages(&dma_dom->domain, address, size);
-
-out:
- return address;
-
-out_unmap:
-
- for (--i; i >= 0; --i) {
- start -= PAGE_SIZE;
- dma_ops_domain_unmap(dma_dom, start);
- }
-
- dma_ops_free_addresses(dma_dom, address, pages);
-
- return DMA_ERROR_CODE;
-}
-
-/*
- * Does the reverse of the __map_single function. Must be called with
- * the domain lock held too
- */
-static void __unmap_single(struct dma_ops_domain *dma_dom,
- dma_addr_t dma_addr,
- size_t size,
- int dir)
-{
- dma_addr_t flush_addr;
- dma_addr_t i, start;
- unsigned int pages;
-
- if ((dma_addr == DMA_ERROR_CODE) ||
- (dma_addr + size > dma_dom->aperture_size))
- return;
-
- flush_addr = dma_addr;
- pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- dma_addr &= PAGE_MASK;
- start = dma_addr;
-
- for (i = 0; i < pages; ++i) {
- dma_ops_domain_unmap(dma_dom, start);
- start += PAGE_SIZE;
- }
-
- SUB_STATS_COUNTER(alloced_io_mem, size);
-
- dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
- if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- domain_flush_pages(&dma_dom->domain, flush_addr, size);
- dma_dom->need_flush = false;
- }
-}
-
-/*
- * The exported map_single function for dma_ops.
- */
-static dma_addr_t map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- dma_addr_t addr;
- u64 dma_mask;
- phys_addr_t paddr = page_to_phys(page) + offset;
-
- INC_STATS_COUNTER(cnt_map_single);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return (dma_addr_t)paddr;
- else if (IS_ERR(domain))
- return DMA_ERROR_CODE;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- addr = __map_single(dev, domain->priv, paddr, size, dir, false,
- dma_mask);
- if (addr == DMA_ERROR_CODE)
- goto out;
-
- domain_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return addr;
-}
-
-/*
- * The exported unmap_single function for dma_ops.
- */
-static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
- enum dma_data_direction dir, struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_unmap_single);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, dir);
-
- domain_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * This is a special map_sg function which is used if we should map a
- * device which is not handled by an AMD IOMMU in the system.
- */
-static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
- int nelems, int dir)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sglist, s, nelems, i) {
- s->dma_address = (dma_addr_t)sg_phys(s);
- s->dma_length = s->length;
- }
-
- return nelems;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static int map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- int i;
- struct scatterlist *s;
- phys_addr_t paddr;
- int mapped_elems = 0;
- u64 dma_mask;
-
- INC_STATS_COUNTER(cnt_map_sg);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return map_sg_no_iommu(dev, sglist, nelems, dir);
- else if (IS_ERR(domain))
- return 0;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- paddr = sg_phys(s);
-
- s->dma_address = __map_single(dev, domain->priv,
- paddr, s->length, dir, false,
- dma_mask);
-
- if (s->dma_address) {
- s->dma_length = s->length;
- mapped_elems++;
- } else
- goto unmap;
- }
-
- domain_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return mapped_elems;
-unmap:
- for_each_sg(sglist, s, mapped_elems, i) {
- if (s->dma_address)
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- mapped_elems = 0;
-
- goto out;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static void unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- struct scatterlist *s;
- int i;
-
- INC_STATS_COUNTER(cnt_unmap_sg);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- domain_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * The exported alloc_coherent function for dma_ops.
- */
-static void *alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
-{
- unsigned long flags;
- void *virt_addr;
- struct protection_domain *domain;
- phys_addr_t paddr;
- u64 dma_mask = dev->coherent_dma_mask;
-
- INC_STATS_COUNTER(cnt_alloc_coherent);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL) {
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- *dma_addr = __pa(virt_addr);
- return virt_addr;
- } else if (IS_ERR(domain))
- return NULL;
-
- dma_mask = dev->coherent_dma_mask;
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- flag |= __GFP_ZERO;
-
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- if (!virt_addr)
- return NULL;
-
- paddr = virt_to_phys(virt_addr);
-
- if (!dma_mask)
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- *dma_addr = __map_single(dev, domain->priv, paddr,
- size, DMA_BIDIRECTIONAL, true, dma_mask);
-
- if (*dma_addr == DMA_ERROR_CODE) {
- spin_unlock_irqrestore(&domain->lock, flags);
- goto out_free;
- }
-
- domain_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return virt_addr;
-
-out_free:
-
- free_pages((unsigned long)virt_addr, get_order(size));
-
- return NULL;
-}
-
-/*
- * The exported free_coherent function for dma_ops.
- */
-static void free_coherent(struct device *dev, size_t size,
- void *virt_addr, dma_addr_t dma_addr)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_free_coherent);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- goto free_mem;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-
- domain_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
-free_mem:
- free_pages((unsigned long)virt_addr, get_order(size));
-}
-
-/*
- * This function is called by the DMA layer to find out if we can handle a
- * particular device. It is part of the dma_ops.
- */
-static int amd_iommu_dma_supported(struct device *dev, u64 mask)
-{
- return check_device(dev);
-}
-
-/*
- * The function for pre-allocating protection domains.
- *
- * If the driver core informs the DMA layer if a driver grabs a device
- * we don't need to preallocate the protection domains anymore.
- * For now we have to.
- */
-static void prealloc_protection_domains(void)
-{
- struct pci_dev *dev = NULL;
- struct dma_ops_domain *dma_dom;
- u16 devid;
-
- for_each_pci_dev(dev) {
-
- /* Do we handle this device? */
- if (!check_device(&dev->dev))
- continue;
-
- /* Is there already any domain for it? */
- if (domain_for_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- dma_dom = dma_ops_domain_alloc();
- if (!dma_dom)
- continue;
- init_unity_mappings_for_device(dma_dom, devid);
- dma_dom->target_dev = devid;
-
- attach_device(&dev->dev, &dma_dom->domain);
-
- list_add_tail(&dma_dom->list, &iommu_pd_list);
- }
-}
-
-static struct dma_map_ops amd_iommu_dma_ops = {
- .alloc_coherent = alloc_coherent,
- .free_coherent = free_coherent,
- .map_page = map_page,
- .unmap_page = unmap_page,
- .map_sg = map_sg,
- .unmap_sg = unmap_sg,
- .dma_supported = amd_iommu_dma_supported,
-};
-
-static unsigned device_dma_ops_init(void)
-{
- struct pci_dev *pdev = NULL;
- unsigned unhandled = 0;
-
- for_each_pci_dev(pdev) {
- if (!check_device(&pdev->dev)) {
- unhandled += 1;
- continue;
- }
-
- pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
- }
-
- return unhandled;
-}
-
-/*
- * The function which clues the AMD IOMMU driver into dma_ops.
- */
-
-void __init amd_iommu_init_api(void)
-{
- register_iommu(&amd_iommu_ops);
-}
-
-int __init amd_iommu_init_dma_ops(void)
-{
- struct amd_iommu *iommu;
- int ret, unhandled;
-
- /*
- * first allocate a default protection domain for every IOMMU we
- * found in the system. Devices not assigned to any other
- * protection domain will be assigned to the default one.
- */
- for_each_iommu(iommu) {
- iommu->default_dom = dma_ops_domain_alloc();
- if (iommu->default_dom == NULL)
- return -ENOMEM;
- iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
- ret = iommu_init_unity_mappings(iommu);
- if (ret)
- goto free_domains;
- }
-
- /*
- * Pre-allocate the protection domains for each device.
- */
- prealloc_protection_domains();
-
- iommu_detected = 1;
- swiotlb = 0;
-
- /* Make the driver finally visible to the drivers */
- unhandled = device_dma_ops_init();
- if (unhandled && max_pfn > MAX_DMA32_PFN) {
- /* There are unhandled devices - initialize swiotlb for them */
- swiotlb = 1;
- }
-
- amd_iommu_stats_init();
-
- return 0;
-
-free_domains:
-
- for_each_iommu(iommu) {
- if (iommu->default_dom)
- dma_ops_domain_free(iommu->default_dom);
- }
-
- return ret;
-}
-
-/*****************************************************************************
- *
- * The following functions belong to the exported interface of AMD IOMMU
- *
- * This interface allows access to lower level functions of the IOMMU
- * like protection domain handling and assignement of devices to domains
- * which is not possible with the dma_ops interface.
- *
- *****************************************************************************/
-
-static void cleanup_domain(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data, *next;
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-
- list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
- struct device *dev = dev_data->dev;
-
- __detach_device(dev);
- atomic_set(&dev_data->bind, 0);
- }
-
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void protection_domain_free(struct protection_domain *domain)
-{
- if (!domain)
- return;
-
- del_domain_from_list(domain);
-
- if (domain->id)
- domain_id_free(domain->id);
-
- kfree(domain);
-}
-
-static struct protection_domain *protection_domain_alloc(void)
-{
- struct protection_domain *domain;
-
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return NULL;
-
- spin_lock_init(&domain->lock);
- mutex_init(&domain->api_lock);
- domain->id = domain_id_alloc();
- if (!domain->id)
- goto out_err;
- INIT_LIST_HEAD(&domain->dev_list);
-
- add_domain_to_list(domain);
-
- return domain;
-
-out_err:
- kfree(domain);
-
- return NULL;
-}
-
-static int amd_iommu_domain_init(struct iommu_domain *dom)
-{
- struct protection_domain *domain;
-
- domain = protection_domain_alloc();
- if (!domain)
- goto out_free;
-
- domain->mode = PAGE_MODE_3_LEVEL;
- domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- if (!domain->pt_root)
- goto out_free;
-
- dom->priv = domain;
-
- return 0;
-
-out_free:
- protection_domain_free(domain);
-
- return -ENOMEM;
-}
-
-static void amd_iommu_domain_destroy(struct iommu_domain *dom)
-{
- struct protection_domain *domain = dom->priv;
-
- if (!domain)
- return;
-
- if (domain->dev_cnt > 0)
- cleanup_domain(domain);
-
- BUG_ON(domain->dev_cnt != 0);
-
- free_pagetable(domain);
-
- protection_domain_free(domain);
-
- dom->priv = NULL;
-}
-
-static void amd_iommu_detach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct iommu_dev_data *dev_data = dev->archdata.iommu;
- struct amd_iommu *iommu;
- u16 devid;
-
- if (!check_device(dev))
- return;
-
- devid = get_device_id(dev);
-
- if (dev_data->domain != NULL)
- detach_device(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return;
-
- device_flush_dte(dev);
- iommu_completion_wait(iommu);
-}
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct protection_domain *domain = dom->priv;
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- int ret;
- u16 devid;
-
- if (!check_device(dev))
- return -EINVAL;
-
- dev_data = dev->archdata.iommu;
-
- devid = get_device_id(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return -EINVAL;
-
- if (dev_data->domain)
- detach_device(dev);
-
- ret = attach_device(dev, domain);
-
- iommu_completion_wait(iommu);
-
- return ret;
-}
-
-static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, int gfp_order, int iommu_prot)
-{
- unsigned long page_size = 0x1000UL << gfp_order;
- struct protection_domain *domain = dom->priv;
- int prot = 0;
- int ret;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- mutex_lock(&domain->api_lock);
- ret = iommu_map_page(domain, iova, paddr, prot, page_size);
- mutex_unlock(&domain->api_lock);
-
- return ret;
-}
-
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
- int gfp_order)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long page_size, unmap_size;
-
- page_size = 0x1000UL << gfp_order;
-
- mutex_lock(&domain->api_lock);
- unmap_size = iommu_unmap_page(domain, iova, page_size);
- mutex_unlock(&domain->api_lock);
-
- domain_flush_tlb_pde(domain);
-
- return get_order(unmap_size);
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- unsigned long iova)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long offset_mask;
- phys_addr_t paddr;
- u64 *pte, __pte;
-
- pte = fetch_pte(domain, iova);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- if (PM_PTE_LEVEL(*pte) == 0)
- offset_mask = PAGE_SIZE - 1;
- else
- offset_mask = PTE_PAGE_SIZE(*pte) - 1;
-
- __pte = *pte & PM_ADDR_MASK;
- paddr = (__pte & ~offset_mask) | (iova & offset_mask);
-
- return paddr;
-}
-
-static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
- unsigned long cap)
-{
- switch (cap) {
- case IOMMU_CAP_CACHE_COHERENCY:
- return 1;
- }
-
- return 0;
-}
-
-static struct iommu_ops amd_iommu_ops = {
- .domain_init = amd_iommu_domain_init,
- .domain_destroy = amd_iommu_domain_destroy,
- .attach_dev = amd_iommu_attach_device,
- .detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map,
- .unmap = amd_iommu_unmap,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .domain_has_cap = amd_iommu_domain_has_cap,
-};
-
-/*****************************************************************************
- *
- * The next functions do a basic initialization of IOMMU for pass through
- * mode
- *
- * In passthrough mode the IOMMU is initialized and enabled but not used for
- * DMA-API translation.
- *
- *****************************************************************************/
-
-int __init amd_iommu_init_passthrough(void)
-{
- struct amd_iommu *iommu;
- struct pci_dev *dev = NULL;
- u16 devid;
-
- /* allocate passthrough domain */
- pt_domain = protection_domain_alloc();
- if (!pt_domain)
- return -ENOMEM;
-
- pt_domain->mode |= PAGE_MODE_NONE;
-
- for_each_pci_dev(dev) {
- if (!check_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- continue;
-
- attach_device(&dev->dev, pt_domain);
- }
-
- pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
-
- return 0;
-}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index bfc8453bd98..00000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1572 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/syscore_ops.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE 0x10
-#define ACPI_IVMD_TYPE_ALL 0x20
-#define ACPI_IVMD_TYPE 0x21
-#define ACPI_IVMD_TYPE_RANGE 0x22
-
-#define IVHD_DEV_ALL 0x01
-#define IVHD_DEV_SELECT 0x02
-#define IVHD_DEV_SELECT_RANGE_START 0x03
-#define IVHD_DEV_RANGE_END 0x04
-#define IVHD_DEV_ALIAS 0x42
-#define IVHD_DEV_ALIAS_RANGE 0x43
-#define IVHD_DEV_EXT_SELECT 0x46
-#define IVHD_DEV_EXT_SELECT_RANGE 0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
-#define IVHD_FLAG_PASSPW_EN_MASK 0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
-#define IVHD_FLAG_ISOC_EN_MASK 0x08
-
-#define IVMD_FLAG_EXCL_RANGE 0x08
-#define IVMD_FLAG_UNITY_MAP 0x01
-
-#define ACPI_DEVFLAG_INITPASS 0x01
-#define ACPI_DEVFLAG_EXTINT 0x02
-#define ACPI_DEVFLAG_NMI 0x04
-#define ACPI_DEVFLAG_SYSMGT1 0x10
-#define ACPI_DEVFLAG_SYSMGT2 0x20
-#define ACPI_DEVFLAG_LINT0 0x40
-#define ACPI_DEVFLAG_LINT1 0x80
-#define ACPI_DEVFLAG_ATSDIS 0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 cap_ptr;
- u64 mmio_phys;
- u16 pci_seg;
- u16 info;
- u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
- u8 type;
- u16 devid;
- u8 flags;
- u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 aux;
- u64 resv;
- u64 range_start;
- u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-static bool __initdata amd_iommu_disabled;
-
-u16 amd_iommu_last_bdf; /* largest PCI device id we have
- to handle */
-LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
- we find in ACPI */
-bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
- system */
-
-/* Array to assign indices to IOMMUs*/
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
-
-/* IOMMUs have a non-present cache? */
-bool amd_iommu_np_cache __read_mostly;
-bool amd_iommu_iotlb_sup __read_mostly = true;
-
-/*
- * The ACPI table parsing functions set this variable on an error
- */
-static int __initdata amd_iommu_init_err;
-
-/*
- * List of protection domains - used during resume
- */
-LIST_HEAD(amd_iommu_pd_list);
-spinlock_t amd_iommu_pd_lock;
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs
- * it is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size; /* size of the device table */
-static u32 alias_table_size; /* size of the alias table */
-static u32 rlookup_table_size; /* size if the rlookup table */
-
-/*
- * This function flushes all internal caches of
- * the IOMMU used by this driver.
- */
-extern void iommu_flush_all_caches(struct amd_iommu *iommu);
-
-static inline void update_last_devid(u16 devid)
-{
- if (devid > amd_iommu_last_bdf)
- amd_iommu_last_bdf = devid;
-}
-
-static inline unsigned long tbl_size(int entry_size)
-{
- unsigned shift = PAGE_SHIFT +
- get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
- return 1UL << shift;
-}
-
-/* Access to l1 and l2 indexed register spaces */
-
-static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
-{
- u32 val;
-
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
- pci_read_config_dword(iommu->dev, 0xfc, &val);
- return val;
-}
-
-static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
-{
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
- pci_write_config_dword(iommu->dev, 0xfc, val);
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-}
-
-static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
-{
- u32 val;
-
- pci_write_config_dword(iommu->dev, 0xf0, address);
- pci_read_config_dword(iommu->dev, 0xf4, &val);
- return val;
-}
-
-static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
-{
- pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
- pci_write_config_dword(iommu->dev, 0xf4, val);
-}
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required for that driver.
- *
- ****************************************************************************/
-
-/*
- * This function set the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
- u64 start = iommu->exclusion_start & PAGE_MASK;
- u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
- u64 entry;
-
- if (!iommu->exclusion_start)
- return;
-
- entry = start | MMIO_EXCL_ENABLE_MASK;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
- &entry, sizeof(entry));
-
- entry = limit;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->mmio_base == NULL);
-
- entry = virt_to_phys(amd_iommu_dev_table);
- entry |= (dev_table_size >> 12) - 1;
- memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl |= (1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl &= ~(1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
- static const char * const feat_str[] = {
- "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
- "IA", "GA", "HE", "PC", NULL
- };
- int i;
-
- printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
- dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
- if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
- printk(KERN_CONT " extended features: ");
- for (i = 0; feat_str[i]; ++i)
- if (iommu_feature(iommu, (1ULL << i)))
- printk(KERN_CONT " %s", feat_str[i]);
- }
- printk(KERN_CONT "\n");
-
- iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
- /* Disable command buffer */
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- /* Disable event logging and event interrupts */
- iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
- iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
- /* Disable IOMMU hardware itself */
- iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
- u8 *ret;
-
- if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
- pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
- address);
- pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
- return NULL;
- }
-
- ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
- if (ret != NULL)
- return ret;
-
- release_mem_region(address, MMIO_REGION_LENGTH);
-
- return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
- if (iommu->mmio_base)
- iounmap(iommu->mmio_base);
- release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Upon this information the size of the shared data
- * structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
- return 0x04 << (*ivhd >> 6);
-}
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
- u32 cap;
-
- cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
- update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
- return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header
- * this function looks if there is a higher device id defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
- u8 *p = (void *)h, *end = (void *)h;
- struct ivhd_entry *dev;
-
- p += sizeof(*h);
- end += h->length;
-
- find_last_devid_on_pci(PCI_BUS(h->devid),
- PCI_SLOT(h->devid),
- PCI_FUNC(h->devid),
- h->cap_ptr);
-
- while (p < end) {
- dev = (struct ivhd_entry *)p;
- switch (dev->type) {
- case IVHD_DEV_SELECT:
- case IVHD_DEV_RANGE_END:
- case IVHD_DEV_ALIAS:
- case IVHD_DEV_EXT_SELECT:
- /* all the above subfield types refer to device ids */
- update_last_devid(dev->devid);
- break;
- default:
- break;
- }
- p += ivhd_entry_length(p);
- }
-
- WARN_ON(p != end);
-
- return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
- int i;
- u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
-
- /*
- * Validate checksum here so we don't need to do it when
- * we actually parse the table
- */
- for (i = 0; i < table->length; ++i)
- checksum += p[i];
- if (checksum != 0) {
- /* ACPI table corrupt */
- amd_iommu_init_err = -ENODEV;
- return 0;
- }
-
- p += IVRS_HEADER_LENGTH;
-
- end += table->length;
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (h->type) {
- case ACPI_IVHD_TYPE:
- find_last_devid_from_ivhd(h);
- break;
- default:
- break;
- }
- p += h->length;
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions belong the the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
- u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(CMD_BUFFER_SIZE));
-
- if (cmd_buf == NULL)
- return NULL;
-
- iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
-
- return cmd_buf;
-}
-
-/*
- * This function resets the command buffer if the IOMMU stopped fetching
- * commands from it.
- */
-void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
-{
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->cmd_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->cmd_buf);
- entry |= MMIO_CMD_SIZE_512;
-
- memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
-
- amd_iommu_reset_cmd_buffer(iommu);
- iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->cmd_buf,
- get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
- iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(EVT_BUFFER_SIZE));
-
- if (iommu->evt_buf == NULL)
- return NULL;
-
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
- return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->evt_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
- memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
- &entry, sizeof(entry));
-
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
-
-static int get_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
-}
-
-
-void amd_iommu_apply_erratum_63(u16 devid)
-{
- int sysmgt;
-
- sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
- (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
-
- if (sysmgt == 0x01)
- set_dev_entry_bit(devid, DEV_ENTRY_IW);
-}
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
- amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
- u16 devid, u32 flags, u32 ext_flags)
-{
- if (flags & ACPI_DEVFLAG_INITPASS)
- set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
- if (flags & ACPI_DEVFLAG_EXTINT)
- set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
- if (flags & ACPI_DEVFLAG_NMI)
- set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
- if (flags & ACPI_DEVFLAG_SYSMGT1)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
- if (flags & ACPI_DEVFLAG_SYSMGT2)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
- if (flags & ACPI_DEVFLAG_LINT0)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
- if (flags & ACPI_DEVFLAG_LINT1)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
- amd_iommu_apply_erratum_63(devid);
-
- set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initialize IOMMU with
- * it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
- if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
- return;
-
- if (iommu) {
- /*
- * We only can configure exclusion ranges per IOMMU, not
- * per device. But we can enable the exclusion range per
- * device. This is done here
- */
- set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
- iommu->exclusion_start = m->range_start;
- iommu->exclusion_length = m->range_length;
- }
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
- int cap_ptr = iommu->cap_ptr;
- u32 range, misc, low, high;
- int i, j;
-
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
- &iommu->cap);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
- &range);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
- &misc);
-
- iommu->first_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_FD(range));
- iommu->last_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_LD(range));
- iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-
- if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
- amd_iommu_iotlb_sup = false;
-
- /* read extended feature bits */
- low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
- high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
-
- iommu->features = ((u64)high << 32) | low;
-
- if (!is_rd890_iommu(iommu->dev))
- return;
-
- /*
- * Some rd890 systems may not be fully reconfigured by the BIOS, so
- * it's necessary for us to store this information so it can be
- * reprogrammed on resume
- */
-
- pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
- &iommu->stored_addr_lo);
- pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
- &iommu->stored_addr_hi);
-
- /* Low bit locks writes to configuration space */
- iommu->stored_addr_lo &= ~1;
-
- for (i = 0; i < 6; i++)
- for (j = 0; j < 0x12; j++)
- iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
-
- for (i = 0; i < 0x83; i++)
- iommu->stored_l2[i] = iommu_read_l2(iommu, i);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
- struct ivhd_header *h)
-{
- u8 *p = (u8 *)h;
- u8 *end = p, flags = 0;
- u16 devid = 0, devid_start = 0, devid_to = 0;
- u32 dev_i, ext_flags = 0;
- bool alias = false;
- struct ivhd_entry *e;
-
- /*
- * First save the recommended feature enable bits from ACPI
- */
- iommu->acpi_flags = h->flags;
-
- /*
- * Done. Now parse the device entries
- */
- p += sizeof(struct ivhd_header);
- end += h->length;
-
-
- while (p < end) {
- e = (struct ivhd_entry *)p;
- switch (e->type) {
- case IVHD_DEV_ALL:
-
- DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
- " last device %02x:%02x.%x flags: %02x\n",
- PCI_BUS(iommu->first_device),
- PCI_SLOT(iommu->first_device),
- PCI_FUNC(iommu->first_device),
- PCI_BUS(iommu->last_device),
- PCI_SLOT(iommu->last_device),
- PCI_FUNC(iommu->last_device),
- e->flags);
-
- for (dev_i = iommu->first_device;
- dev_i <= iommu->last_device; ++dev_i)
- set_dev_entry_from_acpi(iommu, dev_i,
- e->flags, 0);
- break;
- case IVHD_DEV_SELECT:
-
- DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
- "flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
- break;
- case IVHD_DEV_SELECT_RANGE_START:
-
- DUMP_printk(" DEV_SELECT_RANGE_START\t "
- "devid: %02x:%02x.%x flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = 0;
- alias = false;
- break;
- case IVHD_DEV_ALIAS:
-
- DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
- "flags: %02x devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid = e->devid;
- devid_to = e->ext >> 8;
- set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
- set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
- amd_iommu_alias_table[devid] = devid_to;
- break;
- case IVHD_DEV_ALIAS_RANGE:
-
- DUMP_printk(" DEV_ALIAS_RANGE\t\t "
- "devid: %02x:%02x.%x flags: %02x "
- "devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid_start = e->devid;
- flags = e->flags;
- devid_to = e->ext >> 8;
- ext_flags = 0;
- alias = true;
- break;
- case IVHD_DEV_EXT_SELECT:
-
- DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
- "flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags,
- e->ext);
- break;
- case IVHD_DEV_EXT_SELECT_RANGE:
-
- DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
- "%02x:%02x.%x flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = e->ext;
- alias = false;
- break;
- case IVHD_DEV_RANGE_END:
-
- DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid));
-
- devid = e->devid;
- for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias) {
- amd_iommu_alias_table[dev_i] = devid_to;
- set_dev_entry_from_acpi(iommu,
- devid_to, flags, ext_flags);
- }
- set_dev_entry_from_acpi(iommu, dev_i,
- flags, ext_flags);
- }
- break;
- default:
- break;
- }
-
- p += ivhd_entry_length(p);
- }
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
- u32 i;
-
- for (i = iommu->first_device; i <= iommu->last_device; ++i)
- set_iommu_for_device(iommu, i);
-
- return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
- free_command_buffer(iommu);
- free_event_buffer(iommu);
- iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
- struct amd_iommu *iommu, *next;
-
- for_each_iommu_safe(iommu, next) {
- list_del(&iommu->list);
- free_iommu_one(iommu);
- kfree(iommu);
- }
-}
-
-/*
- * This function clues the initialization function for one IOMMU
- * together and also allocates the command buffer and programs the
- * hardware. It does NOT enable the IOMMU. This is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
- spin_lock_init(&iommu->lock);
-
- /* Add IOMMU to internal data structures */
- list_add_tail(&iommu->list, &amd_iommu_list);
- iommu->index = amd_iommus_present++;
-
- if (unlikely(iommu->index >= MAX_IOMMUS)) {
- WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
- return -ENOSYS;
- }
-
- /* Index is fine - add IOMMU to the array */
- amd_iommus[iommu->index] = iommu;
-
- /*
- * Copy data from ACPI table entry to the iommu struct
- */
- iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
- if (!iommu->dev)
- return 1;
-
- iommu->cap_ptr = h->cap_ptr;
- iommu->pci_seg = h->pci_seg;
- iommu->mmio_phys = h->mmio_phys;
- iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
- if (!iommu->mmio_base)
- return -ENOMEM;
-
- iommu->cmd_buf = alloc_command_buffer(iommu);
- if (!iommu->cmd_buf)
- return -ENOMEM;
-
- iommu->evt_buf = alloc_event_buffer(iommu);
- if (!iommu->evt_buf)
- return -ENOMEM;
-
- iommu->int_enabled = false;
-
- init_iommu_from_pci(iommu);
- init_iommu_from_acpi(iommu, h);
- init_iommu_devices(iommu);
-
- if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
- amd_iommu_np_cache = true;
-
- return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
- struct amd_iommu *iommu;
- int ret;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (*p) {
- case ACPI_IVHD_TYPE:
-
- DUMP_printk("device: %02x:%02x.%01x cap: %04x "
- "seg: %d flags: %01x info %04x\n",
- PCI_BUS(h->devid), PCI_SLOT(h->devid),
- PCI_FUNC(h->devid), h->cap_ptr,
- h->pci_seg, h->flags, h->info);
- DUMP_printk(" mmio-addr: %016llx\n",
- h->mmio_phys);
-
- iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
- if (iommu == NULL) {
- amd_iommu_init_err = -ENOMEM;
- return 0;
- }
-
- ret = init_iommu_one(iommu, h);
- if (ret) {
- amd_iommu_init_err = ret;
- return 0;
- }
- break;
- default:
- break;
- }
- p += h->length;
-
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. Its a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int iommu_setup_msi(struct amd_iommu *iommu)
-{
- int r;
-
- if (pci_enable_msi(iommu->dev))
- return 1;
-
- r = request_threaded_irq(iommu->dev->irq,
- amd_iommu_int_handler,
- amd_iommu_int_thread,
- 0, "AMD-Vi",
- iommu->dev);
-
- if (r) {
- pci_disable_msi(iommu->dev);
- return 1;
- }
-
- iommu->int_enabled = true;
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
- return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
- if (iommu->int_enabled)
- return 0;
-
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
- return iommu_setup_msi(iommu);
-
- return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping reanges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
- struct unity_map_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
- list_del(&entry->list);
- kfree(entry);
- }
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
- int i;
-
- switch (m->type) {
- case ACPI_IVMD_TYPE:
- set_device_exclusion_range(m->devid, m);
- break;
- case ACPI_IVMD_TYPE_ALL:
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- set_device_exclusion_range(i, m);
- break;
- case ACPI_IVMD_TYPE_RANGE:
- for (i = m->devid; i <= m->aux; ++i)
- set_device_exclusion_range(i, m);
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
- struct unity_map_entry *e = 0;
- char *s;
-
- e = kzalloc(sizeof(*e), GFP_KERNEL);
- if (e == NULL)
- return -ENOMEM;
-
- switch (m->type) {
- default:
- kfree(e);
- return 0;
- case ACPI_IVMD_TYPE:
- s = "IVMD_TYPEi\t\t\t";
- e->devid_start = e->devid_end = m->devid;
- break;
- case ACPI_IVMD_TYPE_ALL:
- s = "IVMD_TYPE_ALL\t\t";
- e->devid_start = 0;
- e->devid_end = amd_iommu_last_bdf;
- break;
- case ACPI_IVMD_TYPE_RANGE:
- s = "IVMD_TYPE_RANGE\t\t";
- e->devid_start = m->devid;
- e->devid_end = m->aux;
- break;
- }
- e->address_start = PAGE_ALIGN(m->range_start);
- e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
- e->prot = m->flags >> 1;
-
- DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
- " range_start: %016llx range_end: %016llx flags: %x\n", s,
- PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
- PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
- PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
- e->address_start, e->address_end, m->flags);
-
- list_add_tail(&e->list, &amd_iommu_unity_map);
-
- return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivmd_header *m;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- m = (struct ivmd_header *)p;
- if (m->flags & IVMD_FLAG_EXCL_RANGE)
- init_exclusion_range(m);
- else if (m->flags & IVMD_FLAG_UNITY_MAP)
- init_unity_map_range(m);
-
- p += m->length;
- }
-
- return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
- u32 devid;
-
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
- set_dev_entry_bit(devid, DEV_ENTRY_VALID);
- set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
- }
-}
-
-static void iommu_init_flags(struct amd_iommu *iommu)
-{
- iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
- iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
- iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
- iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
- iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
- iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
- /*
- * make IOMMU memory accesses cache coherent
- */
- iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-}
-
-static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
-{
- int i, j;
- u32 ioc_feature_control;
- struct pci_dev *pdev = NULL;
-
- /* RD890 BIOSes may not have completely reconfigured the iommu */
- if (!is_rd890_iommu(iommu->dev))
- return;
-
- /*
- * First, we need to ensure that the iommu is enabled. This is
- * controlled by a register in the northbridge
- */
- pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
-
- if (!pdev)
- return;
-
- /* Select Northbridge indirect register 0x75 and enable writing */
- pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
- pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
-
- /* Enable the iommu */
- if (!(ioc_feature_control & 0x1))
- pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
-
- pci_dev_put(pdev);
-
- /* Restore the iommu BAR */
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
- iommu->stored_addr_lo);
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
- iommu->stored_addr_hi);
-
- /* Restore the l1 indirect regs for each of the 6 l1s */
- for (i = 0; i < 6; i++)
- for (j = 0; j < 0x12; j++)
- iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
-
- /* Restore the l2 indirect regs */
- for (i = 0; i < 0x83; i++)
- iommu_write_l2(iommu, i, iommu->stored_l2[i]);
-
- /* Lock PCI setup registers */
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
- iommu->stored_addr_lo | 1);
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu) {
- iommu_disable(iommu);
- iommu_init_flags(iommu);
- iommu_set_device_table(iommu);
- iommu_enable_command_buffer(iommu);
- iommu_enable_event_buffer(iommu);
- iommu_set_exclusion_range(iommu);
- iommu_init_msi(iommu);
- iommu_enable(iommu);
- iommu_flush_all_caches(iommu);
- }
-}
-
-static void disable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until real resume implemented
- */
-
-static void amd_iommu_resume(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_apply_resume_quirks(iommu);
-
- /* re-load the hardware */
- enable_iommus();
-
- /*
- * we have to flush after the IOMMUs are enabled because a
- * disabled IOMMU will never execute the commands we send
- */
- for_each_iommu(iommu)
- iommu_flush_all_caches(iommu);
-}
-
-static int amd_iommu_suspend(void)
-{
- /* disable IOMMUs to go out of the way for BIOS */
- disable_iommus();
-
- return 0;
-}
-
-static struct syscore_ops amd_iommu_syscore_ops = {
- .suspend = amd_iommu_suspend,
- .resume = amd_iommu_resume,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- * 1 pass) Find the highest PCI device id the driver has to handle.
- * Upon this information the size of the data structures is
- * determined that needs to be allocated.
- *
- * 2 pass) Initialize the data structures just allocated with the
- * information in the ACPI table about available AMD IOMMUs
- * in the system. It also maps the PCI devices in the
- * system to specific IOMMUs
- *
- * 3 pass) After the basic data structures are allocated and
- * initialized we update them with information about memory
- * remapping requirements parsed out of the ACPI table in
- * this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-static int __init amd_iommu_init(void)
-{
- int i, ret = 0;
-
- /*
- * First parse ACPI tables to find the largest Bus/Dev/Func
- * we need to handle. Upon this information the shared data
- * structures for the IOMMUs in the system will be allocated
- */
- if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
- return -ENODEV;
-
- ret = amd_iommu_init_err;
- if (ret)
- goto out;
-
- dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
- alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
- rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
- ret = -ENOMEM;
-
- /* Device table - directly used by all IOMMUs */
- amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(dev_table_size));
- if (amd_iommu_dev_table == NULL)
- goto out;
-
- /*
- * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
- * IOMMU see for that device
- */
- amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
- get_order(alias_table_size));
- if (amd_iommu_alias_table == NULL)
- goto free;
-
- /* IOMMU rlookup table - find the IOMMU for a specific device */
- amd_iommu_rlookup_table = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_rlookup_table == NULL)
- goto free;
-
- amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(MAX_DOMAIN_ID/8));
- if (amd_iommu_pd_alloc_bitmap == NULL)
- goto free;
-
- /* init the device table */
- init_device_table();
-
- /*
- * let all alias entries point to itself
- */
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- amd_iommu_alias_table[i] = i;
-
- /*
- * never allocate domain 0 because its used as the non-allocated and
- * error value placeholder
- */
- amd_iommu_pd_alloc_bitmap[0] = 1;
-
- spin_lock_init(&amd_iommu_pd_lock);
-
- /*
- * now the data structures are allocated and basically initialized
- * start the real acpi table scan
- */
- ret = -ENODEV;
- if (acpi_table_parse("IVRS", init_iommu_all) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- ret = amd_iommu_init_devices();
- if (ret)
- goto free;
-
- enable_iommus();
-
- if (iommu_pass_through)
- ret = amd_iommu_init_passthrough();
- else
- ret = amd_iommu_init_dma_ops();
-
- if (ret)
- goto free_disable;
-
- amd_iommu_init_api();
-
- amd_iommu_init_notifier();
-
- register_syscore_ops(&amd_iommu_syscore_ops);
-
- if (iommu_pass_through)
- goto out;
-
- if (amd_iommu_unmap_flush)
- printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
- else
- printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
-
- x86_platform.iommu_shutdown = disable_iommus;
-out:
- return ret;
-
-free_disable:
- disable_iommus();
-
-free:
- amd_iommu_uninit_devices();
-
- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
- get_order(MAX_DOMAIN_ID/8));
-
- free_pages((unsigned long)amd_iommu_rlookup_table,
- get_order(rlookup_table_size));
-
- free_pages((unsigned long)amd_iommu_alias_table,
- get_order(alias_table_size));
-
- free_pages((unsigned long)amd_iommu_dev_table,
- get_order(dev_table_size));
-
- free_iommu_all();
-
- free_unity_maps();
-
-#ifdef CONFIG_GART_IOMMU
- /*
- * We failed to initialize the AMD IOMMU - try fallback to GART
- * if possible.
- */
- gart_iommu_init();
-
-#endif
-
- goto out;
-}
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just looks if there is an IVRS ACPI table to detect AMD
- * IOMMUs
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
- return 0;
-}
-
-int __init amd_iommu_detect(void)
-{
- if (no_iommu || (iommu_detected && !gart_iommu_aperture))
- return -ENODEV;
-
- if (amd_iommu_disabled)
- return -ENODEV;
-
- if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
- iommu_detected = 1;
- amd_iommu_detected = 1;
- x86_init.iommu.iommu_init = amd_iommu_init;
-
- /* Make sure ACS will be enabled */
- pci_request_acs();
- return 1;
- }
- return -ENODEV;
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
- amd_iommu_dump = true;
-
- return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
- for (; *str; ++str) {
- if (strncmp(str, "fullflush", 9) == 0)
- amd_iommu_unmap_flush = true;
- if (strncmp(str, "off", 3) == 0)
- amd_iommu_disabled = true;
- }
-
- return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
-
-IOMMU_INIT_FINISH(amd_iommu_detect,
- gart_iommu_hole_init,
- 0,
- 0);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 289e92862fd..2b6630d75e1 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -44,6 +44,7 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
#include <asm/mrst.h>
+#include <asm/time.h>
#define APBT_MASK CLOCKSOURCE_MASK(32)
#define APBT_SHIFT 22
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b9338b8cf42..9498b844518 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/timex.h>
+#include <linux/i8253.h>
#include <linux/dmar.h>
#include <linux/init.h>
#include <linux/cpu.h>
@@ -39,7 +40,6 @@
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
-#include <asm/i8253.h>
#include <asm/i8259.h>
#include <asm/proto.h>
#include <asm/apic.h>
@@ -48,6 +48,7 @@
#include <asm/hpet.h>
#include <asm/idle.h>
#include <asm/mtrr.h>
+#include <asm/time.h>
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/tsc.h>
@@ -1429,7 +1430,7 @@ void enable_x2apic(void)
rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+ wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
}
}
#endif /* CONFIG_X86_X2APIC */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e5293394b54..8eb863e27ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1295,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
* irq handler will do the explicit EOI to the io-apic.
*/
ir_entry->vector = pin;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+ "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+ "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+ "Avail:%X Vector:%02X Dest:%08X "
+ "SID:%04X SQ:%X SVT:%X)\n",
+ apic_id, irte.present, irte.fpd, irte.dst_mode,
+ irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+ irte.avail, irte.vector, irte.dest_id,
+ irte.sid, irte.sq, irte.svt);
} else {
entry->delivery_mode = apic->irq_delivery_mode;
entry->dest_mode = apic->irq_dest_mode;
@@ -1337,9 +1347,9 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i)\n",
+ "IRQ %d Mode:%i Active:%i Dest:%d)\n",
apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
- irq, trigger, polarity);
+ irq, trigger, polarity, dest);
if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
@@ -1522,10 +1532,12 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
+ printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
+ reg_01.bits.entries);
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+ printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
+ reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1550,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect:\n");
+ if (intr_remapping_enabled) {
+ printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
+ " Pol Stat Indx2 Zero Vect:\n");
+ } else {
+ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+ " Stat Dmod Deli Vect:\n");
+ }
for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X ",
- i,
- entry.dest
- );
+ if (intr_remapping_enabled) {
+ struct IO_APIC_route_entry entry;
+ struct IR_IO_APIC_route_entry *ir_entry;
+
+ entry = ioapic_read_entry(apic, i);
+ ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
+ printk(KERN_DEBUG " %02x %04X ",
+ i,
+ ir_entry->index
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %X %02X\n",
+ ir_entry->format,
+ ir_entry->mask,
+ ir_entry->trigger,
+ ir_entry->irr,
+ ir_entry->polarity,
+ ir_entry->delivery_status,
+ ir_entry->index2,
+ ir_entry->zero,
+ ir_entry->vector
+ );
+ } else {
+ struct IO_APIC_route_entry entry;
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
+ entry = ioapic_read_entry(apic, i);
+ printk(KERN_DEBUG " %02x %02X ",
+ i,
+ entry.dest
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
}
}
+
printk(KERN_DEBUG "IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
@@ -1792,7 +1833,7 @@ __apicdebuginit(int) print_ICs(void)
return 0;
}
-fs_initcall(print_ICs);
+late_initcall(print_ICs);
/* Where if anywhere is the i8259 connect in external int mode */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 965a7666c28..0371c484bb8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,11 +229,11 @@
#include <linux/jiffies.h>
#include <linux/acpi.h>
#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
-#include <asm/i8253.h>
#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
@@ -1220,11 +1220,11 @@ static void reinit_timer(void)
raw_spin_lock_irqsave(&i8253_lock, flags);
/* set the clock to HZ */
- outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
- outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
udelay(10);
- outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
udelay(10);
raw_spin_unlock_irqrestore(&i8253_lock, flags);
#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6f..395a10e6806 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
- OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
BLANK();
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1edf5ba4fb2..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
+
+ /*
+ * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
+ * x86_energy_perf_policy(8) is available to change it at run-time
+ */
+ if (cpu_has(c, X86_FEATURE_EPB)) {
+ u64 epb;
+
+ rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
+ printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
+ " Set to 'normal', was 'performance'\n"
+ "ENERGY_PERF_BIAS: View and update with"
+ " x86_energy_perf_policy(8)\n");
+ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+ wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ }
+ }
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
unsigned char covered;
char *msg;
} severities[] = {
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
-#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
-#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
-#define MCGMASK(x, res, s, m, r...) \
- { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
-#define MASK(x, y, s, m, r...) \
- { .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCACOD 0xffff
- BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
- BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
- BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
/* When MCIP is not set something is very confused */
- MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
/* Neither return not error IP -- no chance to recover -> PANIC */
- MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
- "Neither restart nor error IP"),
- MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
- KERNEL),
- BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
- MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
- "Spurious not enabled", SER),
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
/* ignore OVER for UCNA */
- MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
- "Uncorrected no action required", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
- "Illegal combination (UCNA with AR=1)", SER),
- MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+ MCESEV(
+ KEEP, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signalled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
/* AR add known MCACODs here */
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
- "Action required with lost events", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
- "Action required; unknown MCACOD", SER),
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
/* known AO MCACODs: */
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
- "Action optional: memory scrubbing error", SER),
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
- "Action optional: last level cache writeback error", SER),
-
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
- "Action optional unknown MCACOD", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
- "Action optional with lost events", SER),
- BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
- BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
- BITSET(0, SOME, "No match") /* always matches. keep at end */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+ ),
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
};
/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
return IN_KERNEL;
}
-int mce_severity(struct mce *a, int tolerant, char **msg)
+int mce_severity(struct mce *m, int tolerant, char **msg)
{
- enum context ctx = error_context(a);
+ enum context ctx = error_context(m);
struct severity *s;
for (s = severities;; s++) {
- if ((a->status & s->mask) != s->result)
+ if ((m->status & s->mask) != s->result)
continue;
- if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
continue;
if (s->ser == SER_REQUIRED && !mce_ser)
continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
static int __init severities_debugfs_init(void)
{
- struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+ struct dentry *dmce, *fsev;
dmce = mce_get_debugfs_dir();
- if (dmce == NULL)
+ if (!dmce)
goto err_out;
- fseverities_coverage = debugfs_create_file("severities-coverage",
- 0444, dmce, NULL,
- &severities_coverage_fops);
- if (fseverities_coverage == NULL)
+
+ fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+ &severities_coverage_fops);
+ if (!fsev)
goto err_out;
return 0;
@@ -214,4 +258,4 @@ err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
-#endif
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b6464..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
-#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/edac_mce.h>
+#include <linux/irq_work.h>
#include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include "mce-internal.h"
-static DEFINE_MUTEX(mce_read_mutex);
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define rcu_dereference_check_mce(p) \
rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_read_mutex))
+ lockdep_is_held(&mce_chrdev_read_mutex))
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
@@ -373,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
}
/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+ mce_setup(m);
+
+ m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (rip_msr)
+ m->ip = mce_rdmsrl(rip_msr);
+ }
+}
+
+/*
* Simple lockless ring to communicate PFNs from the exception handler with the
* process context work function. This is vastly simplified because there's
* only a single reader and a single writer.
@@ -443,40 +465,13 @@ static void mce_schedule_work(void)
}
}
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
-
- if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
-}
+DEFINE_PER_CPU(struct irq_work, mce_irq_work);
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void mce_irq_work_cb(struct irq_work *entry)
{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
mce_notify_irq();
mce_schedule_work();
- irq_exit();
}
-#endif
static void mce_report_event(struct pt_regs *regs)
{
@@ -492,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
-
- /*
- * When interrupts are disabled we cannot use
- * kernel services safely. Trigger an self interrupt
- * through the APIC to instead do the notification
- * after interrupts are reenabled again.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
-
- /*
- * Wait for idle afterwards again so that we don't leave the
- * APIC in a non idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
+ irq_work_queue(&__get_cpu_var(mce_irq_work));
}
DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -541,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
percpu_inc(mce_poll_count);
- mce_setup(&m);
+ mce_gather_info(&m, NULL);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
for (i = 0; i < banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -879,9 +851,9 @@ static int mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
return 0;
- if ((m->misc & 0x3f) > PAGE_SHIFT)
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
- if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
return 1;
}
@@ -942,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (!banks)
goto out;
- mce_setup(&m);
+ mce_gather_info(&m, regs);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
final = &__get_cpu_var(mces_seen);
*final = m;
@@ -1028,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
- mce_get_rip(&m, regs);
mce_log(&m);
if (severity > worst) {
@@ -1190,7 +1160,8 @@ int mce_notify_irq(void)
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &mce_need_notify)) {
- wake_up_interruptible(&mce_wait);
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
/*
* There is no risk of missing notifications because
@@ -1363,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
return 0;
}
-static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
- return;
+ return 0;
+
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
+ return 1;
break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
+ return 1;
break;
}
+
+ return 0;
}
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1428,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
if (mce_disabled)
return;
- __mcheck_cpu_ancient_init(c);
+ if (__mcheck_cpu_ancient_init(c))
+ return;
if (!mce_available(c))
return;
@@ -1444,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-
+ init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}
/*
- * Character device to read and clear the MCE log.
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
*/
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
-static int mce_open(struct inode *inode, struct file *file)
+static int mce_chrdev_open(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return nonseekable_open(inode, file);
}
-static int mce_release(struct inode *inode, struct file *file)
+static int mce_chrdev_release(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- open_count--;
- open_exclu = 0;
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return 0;
}
@@ -1530,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
return 0;
}
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned long *cpu_tsc;
@@ -1542,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
if (!cpu_tsc)
return -ENOMEM;
- mutex_lock(&mce_read_mutex);
+ mutex_lock(&mce_chrdev_read_mutex);
if (!mce_apei_read_done) {
err = __mce_read_apei(&buf, usize);
@@ -1562,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
do {
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
- while (!mcelog.entry[i].finished) {
+ while (!m->finished) {
if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
+ memset(m, 0, sizeof(*m));
goto timeout;
}
cpu_relax();
}
smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
timeout:
;
}
@@ -1594,13 +1571,13 @@ timeout:
on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
}
}
@@ -1608,15 +1585,15 @@ timeout:
err = -EFAULT;
out:
- mutex_unlock(&mce_read_mutex);
+ mutex_unlock(&mce_chrdev_read_mutex);
kfree(cpu_tsc);
return err ? err : buf - ubuf;
}
-static unsigned int mce_poll(struct file *file, poll_table *wait)
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
- poll_wait(file, &mce_wait, wait);
+ poll_wait(file, &mce_chrdev_wait, wait);
if (rcu_access_index(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
@@ -1624,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
return 0;
}
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
{
int __user *p = (int __user *)arg;
@@ -1652,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
- .llseek = no_llseek,
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);
-static struct miscdevice mce_log_device = {
+static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
@@ -1719,7 +1697,7 @@ int __init mcheck_init(void)
}
/*
- * Sysfs support
+ * mce_syscore: PM support
*/
/*
@@ -1739,12 +1717,12 @@ static int mce_disable_error_reporting(void)
return 0;
}
-static int mce_suspend(void)
+static int mce_syscore_suspend(void)
{
return mce_disable_error_reporting();
}
-static void mce_shutdown(void)
+static void mce_syscore_shutdown(void)
{
mce_disable_error_reporting();
}
@@ -1754,18 +1732,22 @@ static void mce_shutdown(void)
* Only one CPU is active at this time, the others get re-added later using
* CPU hotplug:
*/
-static void mce_resume(void)
+static void mce_syscore_resume(void)
{
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}
static struct syscore_ops mce_syscore_ops = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
};
+/*
+ * mce_sysdev: Sysfs support
+ */
+
static void mce_cpu_restart(void *data)
{
del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1801,11 +1783,11 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysclass = {
+static struct sysdev_class mce_sysdev_class = {
.name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_dev);
+DEFINE_PER_CPU(struct sys_device, mce_sysdev);
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1934,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
&mce_cmci_disabled
};
-static struct sysdev_attribute *mce_attrs[] = {
+static struct sysdev_attribute *mce_sysdev_attrs[] = {
&attr_tolerant.attr,
&attr_check_interval.attr,
&attr_trigger,
@@ -1945,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
NULL
};
-static cpumask_var_t mce_dev_initialized;
+static cpumask_var_t mce_sysdev_initialized;
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
+static __cpuinit int mce_sysdev_create(unsigned int cpu)
{
+ struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(mce_dev, cpu).id = cpu;
- per_cpu(mce_dev, cpu).cls = &mce_sysclass;
+ memset(&sysdev->kobj, 0, sizeof(struct kobject));
+ sysdev->id = cpu;
+ sysdev->cls = &mce_sysdev_class;
- err = sysdev_register(&per_cpu(mce_dev, cpu));
+ err = sysdev_register(sysdev);
if (err)
return err;
- for (i = 0; mce_attrs[i]; i++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_sysdev_attrs[i]; i++) {
+ err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
if (err)
goto error;
}
for (j = 0; j < banks; j++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu),
- &mce_banks[j].attr);
+ err = sysdev_create_file(sysdev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_dev_initialized);
+ cpumask_set_cpu(cpu, mce_sysdev_initialized);
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
+ sysdev_remove_file(sysdev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
+ sysdev_unregister(sysdev);
return err;
}
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_sysdev_remove(unsigned int cpu)
{
+ struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
int i;
- if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
return;
- for (i = 0; mce_attrs[i]; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_sysdev_attrs[i]; i++)
+ sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
+ sysdev_remove_file(sysdev, &mce_banks[i].attr);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
- cpumask_clear_cpu(cpu, mce_dev_initialized);
+ sysdev_unregister(sysdev);
+ cpumask_clear_cpu(cpu, mce_sysdev_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
@@ -2054,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
+ mce_sysdev_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
@@ -2062,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
+ mce_sysdev_remove(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -2116,27 +2099,28 @@ static __init int mcheck_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
- zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+ zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
mce_init_banks();
- err = sysdev_class_register(&mce_sysclass);
+ err = sysdev_class_register(&mce_sysdev_class);
if (err)
return err;
for_each_online_cpu(i) {
- err = mce_create_device(i);
+ err = mce_sysdev_create(i);
if (err)
return err;
}
register_syscore_ops(&mce_syscore_ops);
register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
+
+ /* register character device /dev/mcelog */
+ misc_register(&mce_chrdev_device);
return err;
}
-
device_initcall(mcheck_init_device);
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad3514..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
+ err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
b->kobj, name);
if (err)
goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
if (!b->kobj)
goto out_free;
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
+ err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
b->kobj, name);
if (err)
goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..08119a37e53 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
static int have_wrcomb(void)
{
struct pci_dev *dev;
- u8 rev;
dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
* chipsets to be tagged
*/
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
- dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev <= 5) {
- pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
+ dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+ dev->revision <= 5) {
+ pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
}
/*
* Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,55 +134,43 @@ static void __init init_table(void)
}
struct set_mtrr_data {
- atomic_t count;
- atomic_t gate;
unsigned long smp_base;
unsigned long smp_size;
unsigned int smp_reg;
mtrr_type smp_type;
};
-static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
-
/**
- * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
-static int mtrr_work_handler(void *info)
+static int mtrr_rendezvous_handler(void *info)
{
#ifdef CONFIG_SMP
struct set_mtrr_data *data = info;
- unsigned long flags;
-
- atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
- cpu_relax();
-
- local_irq_save(flags);
-
- atomic_dec(&data->count);
- while (atomic_read(&data->gate))
- cpu_relax();
- /* The master has cleared me to execute */
+ /*
+ * We use this same function to initialize the mtrrs during boot,
+ * resume, runtime cpu online and on an explicit request to set a
+ * specific MTRR.
+ *
+ * During boot or suspend, the state of the boot cpu's mtrrs has been
+ * saved, and we want to replicate that across all the cpus that come
+ * online (either at the end of boot or resume or during a runtime cpu
+ * online). If we're doing that, @reg is set to something special and on
+ * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
+ * started the boot/resume sequence, this might be a duplicate
+ * set_all()).
+ */
if (data->smp_reg != ~0U) {
mtrr_if->set(data->smp_reg, data->smp_base,
data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init) {
- /*
- * Initialize the MTRRs inaddition to the synchronisation.
- */
+ } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
mtrr_if->set_all();
}
-
- atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
- cpu_relax();
-
- atomic_dec(&data->count);
- local_irq_restore(flags);
#endif
return 0;
}
@@ -223,20 +208,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* 14. Wait for buddies to catch up
* 15. Enable interrupts.
*
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU announces that it started the rendezvous handler by
- * decrementing the count, We reset data.count and set the data.gate flag
- * allowing all the cpu's to proceed with the work. As each cpu disables
- * interrupts, it'll decrement data.count once. We wait until it hits 0 and
- * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
- * are waiting for that flag to be cleared. Once it's cleared, each
- * CPU goes through the transition of updating MTRRs.
- * The CPU vendors may each do it differently,
- * so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate
- * to be set.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
- * Everyone then enables interrupts and we all continue on.
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRR's
+ * (the CPU vendors may each do it differently, so we call mtrr_if->set()
+ * callback and let them take care of it.) and enabling interrupts.
*
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
@@ -244,92 +220,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
static void
set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
- struct set_mtrr_data data;
- unsigned long flags;
- int cpu;
-
- preempt_disable();
-
- data.smp_reg = reg;
- data.smp_base = base;
- data.smp_size = size;
- data.smp_type = type;
- atomic_set(&data.count, num_booting_cpus() - 1);
-
- /* Make sure data.count is visible before unleashing other CPUs */
- smp_wmb();
- atomic_set(&data.gate, 0);
-
- /* Start the ball rolling on other CPUs */
- for_each_online_cpu(cpu) {
- struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
-
- if (cpu == smp_processor_id())
- continue;
-
- stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
- }
-
-
- while (atomic_read(&data.count))
- cpu_relax();
-
- /* Ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 1);
-
- local_irq_save(flags);
-
- while (atomic_read(&data.count))
- cpu_relax();
-
- /* Ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 0);
-
- /* Do our MTRR business */
-
- /*
- * HACK!
- *
- * We use this same function to initialize the mtrrs during boot,
- * resume, runtime cpu online and on an explicit request to set a
- * specific MTRR.
- *
- * During boot or suspend, the state of the boot cpu's mtrrs has been
- * saved, and we want to replicate that across all the cpus that come
- * online (either at the end of boot or resume or during a runtime cpu
- * online). If we're doing that, @reg is set to something special and on
- * this cpu we still do mtrr_if->set_all(). During boot/resume, this
- * is unnecessary if at this point we are still on the cpu that started
- * the boot/resume sequence. But there is no guarantee that we are still
- * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
- * sure that we are in sync with everyone else.
- */
- if (reg != ~0U)
- mtrr_if->set(reg, base, size, type);
- else
- mtrr_if->set_all();
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
- /* Wait for the others */
- while (atomic_read(&data.count))
- cpu_relax();
-
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 1);
-
- /*
- * Wait here for everyone to have seen the gate change
- * So we're the last ones to touch 'data'
- */
- while (atomic_read(&data.count))
- cpu_relax();
+ stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
- local_irq_restore(flags);
- preempt_enable();
+/*
+ * Same argument packing as set_mtrr(), but the rendezvous is started with
+ * stop_machine_from_inactive_cpu() over cpu_callout_mask instead of
+ * stop_machine() over cpu_online_mask.
+ *
+ * mtrr_ap_init() calls this with reg == ~0U, which makes
+ * mtrr_rendezvous_handler() skip mtrr_if->set() and consider
+ * mtrr_if->set_all() instead.
+ *
+ * NOTE(review): presumably needed because the calling CPU is still being
+ * brought up and is not in cpu_online_mask yet -- confirm against the
+ * stop_machine_from_inactive_cpu() documentation.
+ */
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ /* bundle the request so the handler can read it on every CPU */
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+ cpu_callout_mask);
}
/**
@@ -783,7 +693,7 @@ void mtrr_ap_init(void)
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
* lock to prevent mtrr entry changes
*/
- set_mtrr(~0U, 0, 0, 0);
+ set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
}
/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b4b17..4ee3abf20ed 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
-#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
@@ -45,38 +44,27 @@ do { \
#endif
/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ * | NHM/WSM | SNB |
+ * register -------------------------------
+ * | HT | no HT | HT | no HT |
+ *-----------------------------------------
+ * offcore | core | core | cpu | core |
+ * lbr_sel | core | core | cpu | core |
+ * ld_lat | cpu | core | cpu | core |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
*/
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
- unsigned long offset, addr = (unsigned long)from;
- unsigned long size, len = 0;
- struct page *page;
- void *map;
- int ret;
-
- do {
- ret = __get_user_pages_fast(addr, 1, 0, &page);
- if (!ret)
- break;
-
- offset = addr & (PAGE_SIZE - 1);
- size = min(PAGE_SIZE - offset, n - len);
-
- map = kmap_atomic(page);
- memcpy(to, map+offset, size);
- kunmap_atomic(map);
- put_page(page);
+enum extra_reg_type {
+ EXTRA_REG_NONE = -1, /* not used */
- len += size;
- to += size;
- addr += size;
+ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
+ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
- } while (len < n);
-
- return len;
-}
+ EXTRA_REG_MAX /* number of entries needed */
+};
struct event_constraint {
union {
@@ -132,11 +120,10 @@ struct cpu_hw_events {
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
/*
- * Intel percore register state.
- * Coordinate shared resources between HT threads.
+ * manage shared (per-core, per-cpu) registers
+ * used on Intel NHM/WSM/SNB
*/
- int percore_used; /* Used by this CPU? */
- struct intel_percore *per_core;
+ struct intel_shared_regs *shared_regs;
/*
* AMD specific bits
@@ -187,26 +174,45 @@ struct cpu_hw_events {
for ((e) = (c); (e)->weight; (e)++)
/*
+ * Per register state.
+ */
+struct er_account {
+ raw_spinlock_t lock; /* per-core: protect structure */
+ u64 config; /* extra MSR config */
+ u64 reg; /* extra MSR number */
+ atomic_t ref; /* reference count */
+};
+
+/*
* Extra registers for specific events.
+ *
* Some events need large masks and require external MSRs.
- * Define a mapping to these extra registers.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_events.
*/
struct extra_reg {
unsigned int event;
unsigned int msr;
u64 config_mask;
u64 valid_mask;
+ int idx; /* per_xxx->regs[] reg index */
};
-#define EVENT_EXTRA_REG(e, ms, m, vm) { \
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
.event = (e), \
.msr = (ms), \
.config_mask = (m), \
.valid_mask = (vm), \
+ .idx = EXTRA_REG_##i \
}
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
- EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
union perf_capabilities {
struct {
@@ -252,7 +258,6 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
- struct event_constraint *percore_constraints;
void (*quirks)(void);
int perfctr_second_write;
@@ -286,8 +291,12 @@ struct x86_pmu {
* Extra registers for events
*/
struct extra_reg *extra_regs;
+ unsigned int er_flags;
};
+#define ERF_NO_HT_SHARING 1
+#define ERF_HAS_RSP_1 2
+
static struct x86_pmu x86_pmu __read_mostly;
static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -393,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
*/
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
+ struct hw_perf_event_extra *reg;
struct extra_reg *er;
- event->hw.extra_reg = 0;
- event->hw.extra_config = 0;
+ reg = &event->hw.extra_reg;
if (!x86_pmu.extra_regs)
return 0;
@@ -406,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
- event->hw.extra_reg = er->msr;
- event->hw.extra_config = event->attr.config1;
+
+ reg->idx = er->idx;
+ reg->config = event->attr.config1;
+ reg->reg = er->msr;
break;
}
return 0;
@@ -706,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;
+ /* mark unused */
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+
return x86_pmu.hw_config(event);
}
@@ -747,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
- if (hwc->extra_reg)
- wrmsrl(hwc->extra_reg, hwc->extra_config);
+ if (hwc->extra_reg.reg)
+ wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
wrmsrl(hwc->config_base, hwc->config | enable_mask);
}
@@ -1332,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1637,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
perf_pmu_enable(pmu);
return 0;
}
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * shared_regs structure (the per-core/per-cpu register
+ * state). Otherwise, group events using extra reg may
+ * conflict without the kernel being able to catch this
+ * when the last event gets added to the group.
+ *
+ * free_fake_cpuc() releases both allocations. shared_regs
+ * is NULL when the PMU has no extra_regs or when its
+ * allocation failed; the code relies on kfree() accepting
+ * a NULL pointer.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+ /* shared_regs hangs off cpuc, so it has to go first */
+ kfree(cpuc->shared_regs);
+ kfree(cpuc);
+}
+
+/*
+ * Build a zeroed, throw-away cpu_hw_events for validate_event() and
+ * validate_group().
+ *
+ * Returns the new structure, or ERR_PTR(-ENOMEM) if either the structure
+ * itself or its shared_regs could not be allocated. Release it with
+ * free_fake_cpuc().
+ */
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpu_hw_events *fake;
+
+	fake = kzalloc(sizeof(*fake), GFP_KERNEL);
+	if (!fake)
+		return ERR_PTR(-ENOMEM);
+
+	/* shared_regs is only needed if the PMU has extra_regs */
+	if (!x86_pmu.extra_regs)
+		return fake;
+
+	fake->shared_regs = allocate_shared_regs(cpu);
+	if (fake->shared_regs)
+		return fake;
+
+	/* shared_regs is still NULL here; free_fake_cpuc() copes with that */
+	free_fake_cpuc(fake);
+	return ERR_PTR(-ENOMEM);
+}
/*
* validate that we can schedule this event
@@ -1647,9 +1695,9 @@ static int validate_event(struct perf_event *event)
struct event_constraint *c;
int ret = 0;
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- return -ENOMEM;
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
c = x86_pmu.get_event_constraints(fake_cpuc, event);
@@ -1659,7 +1707,7 @@ static int validate_event(struct perf_event *event)
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(fake_cpuc, event);
- kfree(fake_cpuc);
+ free_fake_cpuc(fake_cpuc);
return ret;
}
@@ -1679,36 +1727,32 @@ static int validate_group(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
struct cpu_hw_events *fake_cpuc;
- int ret, n;
-
- ret = -ENOMEM;
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- goto out;
+ int ret = -ENOSPC, n;
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
/*
* the event is not yet connected with its
* siblings therefore we must first collect
* existing siblings, then add the new event
* before we can simulate the scheduling
*/
- ret = -ENOSPC;
n = collect_events(fake_cpuc, leader, true);
if (n < 0)
- goto out_free;
+ goto out;
fake_cpuc->n_events = n;
n = collect_events(fake_cpuc, event, false);
if (n < 0)
- goto out_free;
+ goto out;
fake_cpuc->n_events = n;
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-out_free:
- kfree(fake_cpuc);
out:
+ free_fake_cpuc(fake_cpuc);
return ret;
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+ [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
};
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c4..45fbb8f7f54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
#ifdef CONFIG_CPU_SUP_INTEL
-#define MAX_EXTRA_REGS 2
-
-/*
- * Per register state.
- */
-struct er_account {
- int ref; /* reference count */
- unsigned int extra_reg; /* extra MSR number */
- u64 extra_config; /* extra MSR config */
-};
-
/*
- * Per core state
- * This used to coordinate shared registers for HT threads.
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
*/
-struct intel_percore {
- raw_spinlock_t lock; /* protect structure */
- struct er_account regs[MAX_EXTRA_REGS];
- int refcnt; /* number of threads */
- unsigned core_id;
+struct intel_shared_regs {
+ struct er_account regs[EXTRA_REG_MAX];
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
};
/*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
EVENT_EXTRA_END
};
-static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
-{
- INTEL_EVENT_CONSTRAINT(0xb7, 0),
- EVENT_CONSTRAINT_END
-};
-
static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -116,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
- INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
EVENT_CONSTRAINT_END
@@ -125,15 +107,13 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
EVENT_EXTRA_END
};
-static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
{
- INTEL_EVENT_CONSTRAINT(0xb7, 0),
- INTEL_EVENT_CONSTRAINT(0xbb, 0),
EVENT_CONSTRAINT_END
};
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+
};
static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
};
/*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
},
- }
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+ },
+ },
};
static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
};
static __initconst const u64 core2_hw_cache_event_ids
@@ -1003,7 +1046,7 @@ again:
data.period = event->hw.last_period;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
+/*
+ * Try to move an offcore-response event to the other OFFCORE_RSP register.
+ *
+ * Only done when the PMU advertises ERF_HAS_RSP_1. An event on
+ * EXTRA_REG_RSP_0 is rewritten to event code 0x01bb / MSR_OFFCORE_RSP_1,
+ * one on EXTRA_REG_RSP_1 to 0x01b7 / MSR_OFFCORE_RSP_0; any other idx is
+ * left alone.
+ *
+ * Returns true if the event now sits on a register other than @orig_idx,
+ * false if nothing changed or the swap landed back on @orig_idx.
+ */
+static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+		return false;
+
+	switch (hwc->extra_reg.idx) {
+	case EXTRA_REG_RSP_0:
+		hwc->config &= ~INTEL_ARCH_EVENT_MASK;
+		hwc->config |= 0x01bb;
+		hwc->extra_reg.idx = EXTRA_REG_RSP_1;
+		hwc->extra_reg.reg = MSR_OFFCORE_RSP_1;
+		break;
+	case EXTRA_REG_RSP_1:
+		hwc->config &= ~INTEL_ARCH_EVENT_MASK;
+		hwc->config |= 0x01b7;
+		hwc->extra_reg.idx = EXTRA_REG_RSP_0;
+		hwc->extra_reg.reg = MSR_OFFCORE_RSP_0;
+		break;
+	default:
+		/* not an offcore-response register: nothing to swap */
+		break;
+	}
+
+	return hwc->extra_reg.idx != orig_idx;
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ *
+ * Returns &unconstrained when the event already holds its shared reg, or
+ * when the reg is unused or already programmed with the same config (the
+ * event then takes a reference on it). Returns &emptyconstraint when the
+ * reg is in use with a different config and intel_try_alt_er() cannot
+ * offer another reg.
+ *
+ * era->lock is taken with irqsave, so every unlock -- including the one
+ * before retrying with the alternate reg -- must restore the saved flags.
+ * A plain unlock there would make the retry save an interrupts-off state
+ * and the final restore would leave interrupts disabled.
+ */
static struct event_constraint *
-intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
{
- struct hw_perf_event *hwc = &event->hw;
- unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
- struct event_constraint *c;
- struct intel_percore *pc;
+ struct event_constraint *c = &emptyconstraint;
+ struct hw_perf_event_extra *reg = &event->hw.extra_reg;
struct er_account *era;
- int i;
- int free_slot;
- int found;
+ unsigned long flags;
+ int orig_idx = reg->idx;
- if (!x86_pmu.percore_constraints || hwc->extra_alloc)
- return NULL;
+ /* already allocated shared msr */
+ if (reg->alloc)
+ return &unconstrained;
- for (c = x86_pmu.percore_constraints; c->cmask; c++) {
- if (e != c->code)
- continue;
+again:
+ era = &cpuc->shared_regs->regs[reg->idx];
+ /*
+ * we use spin_lock_irqsave() to avoid lockdep issues when
+ * passing a fake cpuc
+ */
+ raw_spin_lock_irqsave(&era->lock, flags);
+
+ if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+ /* lock in msr value */
+ era->config = reg->config;
+ era->reg = reg->reg;
+
+ /* one more user */
+ atomic_inc(&era->ref);
+
+ /* no need to reallocate during incremental event scheduling */
+ reg->alloc = 1;
/*
- * Allocate resource per core.
+ * All events using extra_reg are unconstrained.
+ * Avoids calling x86_get_event_constraints()
+ *
+ * Must revisit if extra_reg controlling events
+ * ever have constraints. Worst case we go through
+ * the regular event constraint table.
*/
- pc = cpuc->per_core;
- if (!pc)
- break;
- c = &emptyconstraint;
- raw_spin_lock(&pc->lock);
- free_slot = -1;
- found = 0;
- for (i = 0; i < MAX_EXTRA_REGS; i++) {
- era = &pc->regs[i];
- if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
- /* Allow sharing same config */
- if (hwc->extra_config == era->extra_config) {
- era->ref++;
- cpuc->percore_used = 1;
- hwc->extra_alloc = 1;
- c = NULL;
- }
- /* else conflict */
- found = 1;
- break;
- } else if (era->ref == 0 && free_slot == -1)
- free_slot = i;
- }
- if (!found && free_slot != -1) {
- era = &pc->regs[free_slot];
- era->ref = 1;
- era->extra_reg = hwc->extra_reg;
- era->extra_config = hwc->extra_config;
- cpuc->percore_used = 1;
- hwc->extra_alloc = 1;
- c = NULL;
- }
- raw_spin_unlock(&pc->lock);
- return c;
+ c = &unconstrained;
+ } else if (intel_try_alt_er(event, orig_idx)) {
+ /* restore the saved irq state before re-taking the lock with irqsave */
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
}
+ raw_spin_unlock_irqrestore(&era->lock, flags);
- return NULL;
+ return c;
+}
+
+/*
+ * Release the shared extra register held by an event.
+ *
+ * Undoes what a successful __intel_shared_reg_get_constraints() did: drops
+ * the reference on the er_account selected by reg->idx and clears
+ * reg->alloc so that the next get allocates afresh. The decrement is an
+ * atomic op; era->lock is not taken here.
+ */
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+ struct hw_perf_event_extra *reg)
+{
+ struct er_account *era;
+
+ /*
+ * only put constraint if extra reg was actually
+ * allocated. Also takes care of event which do
+ * not use an extra shared reg
+ */
+ if (!reg->alloc)
+ return;
+
+ era = &cpuc->shared_regs->regs[reg->idx];
+
+ /* one fewer user */
+ atomic_dec(&era->ref);
+
+ /* allocate again next time */
+ reg->alloc = 0;
+}
+
+/*
+ * Shared-register constraint lookup for @event.
+ *
+ * Events that do not use an extra register (idx == EXTRA_REG_NONE) yield
+ * NULL; all others are handed to __intel_shared_reg_get_constraints().
+ */
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+			      struct perf_event *event)
+{
+	if (event->hw.extra_reg.idx == EXTRA_REG_NONE)
+		return NULL;
+
+	return __intel_shared_reg_get_constraints(cpuc, event);
}
static struct event_constraint *
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;
- c = intel_percore_constraints(cpuc, event);
+ c = intel_shared_regs_constraints(cpuc, event);
if (c)
return c;
return x86_get_event_constraints(cpuc, event);
}
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
- struct extra_reg *er;
- struct intel_percore *pc;
- struct er_account *era;
- struct hw_perf_event *hwc = &event->hw;
- int i, allref;
+ struct hw_perf_event_extra *reg;
- if (!cpuc->percore_used)
- return;
-
- for (er = x86_pmu.extra_regs; er->msr; er++) {
- if (er->event != (hwc->config & er->config_mask))
- continue;
+ reg = &event->hw.extra_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
+}
- pc = cpuc->per_core;
- raw_spin_lock(&pc->lock);
- for (i = 0; i < MAX_EXTRA_REGS; i++) {
- era = &pc->regs[i];
- if (era->ref > 0 &&
- era->extra_config == hwc->extra_config &&
- era->extra_reg == er->msr) {
- era->ref--;
- hwc->extra_alloc = 0;
- break;
- }
- }
- allref = 0;
- for (i = 0; i < MAX_EXTRA_REGS; i++)
- allref += pc->regs[i].ref;
- if (allref == 0)
- cpuc->percore_used = 0;
- raw_spin_unlock(&pc->lock);
- break;
- }
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ intel_put_shared_regs_event_constraints(cpuc, event);
}
static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
.event_constraints = intel_core_event_constraints,
};
+/*
+ * Allocate a zeroed intel_shared_regs on the memory node of @cpu.
+ *
+ * Every per-register lock is initialized and core_id starts out as -1.
+ * Returns NULL if the allocation fails.
+ */
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+	struct intel_shared_regs *shared;
+	int idx;
+
+	shared = kzalloc_node(sizeof(*shared), GFP_KERNEL, cpu_to_node(cpu));
+	if (!shared)
+		return NULL;
+
+	/* initialize the locks to keep lockdep happy */
+	for (idx = 0; idx < EXTRA_REG_MAX; idx++)
+		raw_spin_lock_init(&shared->regs[idx].lock);
+
+	/* stays -1 unless intel_pmu_cpu_starting() assigns a real core id */
+	shared->core_id = -1;
+
+	return shared;
+}
+
static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!cpu_has_ht_siblings())
+ if (!x86_pmu.extra_regs)
return NOTIFY_OK;
- cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!cpuc->per_core)
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
return NOTIFY_BAD;
- raw_spin_lock_init(&cpuc->per_core->lock);
- cpuc->per_core->core_id = -1;
return NOTIFY_OK;
}
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
*/
intel_pmu_lbr_reset();
- if (!cpu_has_ht_siblings())
+ if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
return;
for_each_cpu(i, topology_thread_cpumask(cpu)) {
- struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+ struct intel_shared_regs *pc;
+ pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- kfree(cpuc->per_core);
- cpuc->per_core = pc;
+ kfree(cpuc->shared_regs);
+ cpuc->shared_regs = pc;
break;
}
}
- cpuc->per_core->core_id = core_id;
- cpuc->per_core->refcnt++;
+ cpuc->shared_regs->core_id = core_id;
+ cpuc->shared_regs->refcnt++;
}
static void intel_pmu_cpu_dying(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- struct intel_percore *pc = cpuc->per_core;
+ struct intel_shared_regs *pc;
+ pc = cpuc->shared_regs;
if (pc) {
if (pc->core_id == -1 || --pc->refcnt == 0)
kfree(pc);
- cpuc->per_core = NULL;
+ cpuc->shared_regs = NULL;
}
fini_debug_store_on_cpu(cpu);
@@ -1436,7 +1532,6 @@ static __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_nehalem_event_constraints;
x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
- x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.extra_regs = intel_nehalem_extra_regs;
@@ -1481,10 +1576,10 @@ static __init int intel_pmu_init(void)
intel_pmu_lbr_init_nhm();
x86_pmu.event_constraints = intel_westmere_event_constraints;
- x86_pmu.percore_constraints = intel_westmere_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
x86_pmu.extra_regs = intel_westmere_extra_regs;
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
/* UOPS_ISSUED.STALLED_CYCLES */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1502,6 +1597,10 @@ static __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_events;
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1512,11 +1611,19 @@ static __init int intel_pmu_init(void)
break;
default:
- /*
- * default constraints for v2 and up
- */
- x86_pmu.event_constraints = intel_gen_event_constraints;
- pr_cont("generic architected perfmon, ");
+ switch (x86_pmu.version) {
+ case 1:
+ x86_pmu.event_constraints = intel_v1_event_constraints;
+ pr_cont("generic architected perfmon v1, ");
+ break;
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ break;
+ }
}
return 0;
}
@@ -1528,4 +1635,8 @@ static int intel_pmu_init(void)
return 0;
}
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+ return NULL;
+}
#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..1b1ef3addcf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
*/
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+ if (perf_output_begin(&handle, event, header.size * (top - at)))
return 1;
for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
else
regs.flags &= ~PERF_EFLAGS_EXACT;
- if (perf_event_overflow(event, 1, &data, &regs))
+ if (perf_event_overflow(event, &data, &regs))
x86_pmu_stop(event, 0);
}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ead584fb6a7..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
};
+/*
+ * Because of Netburst being quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie the different events which have the same functionality but
+ * utilize non-intersected resources (ESCR/CCCR/counter registers).
+ *
+ * This allows us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+struct p4_event_alias {
+ u64 original;
+ u64 alternative;
+} p4_event_aliases[] = {
+ {
+ /*
+ * Non-halted cycles can be substituted with non-sleeping cycles (see
+ * Intel SDM Vol3b for details). We need this alias to be able
+ * to run nmi-watchdog and 'perf top' (or any other user space tool
+ * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+ * simultaneously.
+ */
+ .original =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ .alternative =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE),
+ },
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+ u64 config_match;
+ int i;
+
+ /*
+ * Only events carrying the special aliasable mark are
+ * allowed, so we can be sure the config didn't come in
+ * as a malformed RAW event.
+ */
+ if (!(config & P4_CONFIG_ALIASABLE))
+ return 0;
+
+ config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+ if (config_match == p4_event_aliases[i].original) {
+ config_match = p4_event_aliases[i].alternative;
+ break;
+ } else if (config_match == p4_event_aliases[i].alternative) {
+ config_match = p4_event_aliases[i].original;
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(p4_event_aliases))
+ return 0;
+
+ return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
/* non-halted CPU clocks */
[PERF_COUNT_HW_CPU_CYCLES] =
p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
+ P4_CONFIG_ALIASABLE,
/*
* retired instructions
@@ -945,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1120,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
struct p4_event_bind *bind;
unsigned int i, thread, num;
int cntr_idx, escr_idx;
+ u64 config_alias;
+ int pass;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1128,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
hwc = &cpuc->event_list[i]->hw;
thread = p4_ht_thread(cpu);
+ pass = 0;
+
+again:
+ /*
+ * It's possible to hit a circular lock
+ * between original and alternative events
+ * if both are scheduled already.
+ */
+ if (pass > 2)
+ goto done;
+
bind = p4_config_get_bind(hwc->config);
escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
if (unlikely(escr_idx == -1))
@@ -1141,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
}
cntr_idx = p4_next_cntr(thread, used_mask, bind);
- if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
- goto done;
+ if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+ /*
+ * Check whether an event alias is still available.
+ */
+ config_alias = p4_get_alias_event(hwc->config);
+ if (!config_alias)
+ goto done;
+ hwc->config = config_alias;
+ pass++;
+ goto again;
+ }
p4_pmu_swap_config_ts(hwc, cpu);
if (assign)
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 9aeb78a23de..a621f342768 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -134,6 +134,24 @@ static int __init add_bus_probe(void)
module_init(add_bus_probe);
#ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct device_node *np;
+
+ for_each_node_by_type(np, "pci") {
+ const void *prop;
+ unsigned int bus_min;
+
+ prop = of_get_property(np, "bus-range", NULL);
+ if (!prop)
+ continue;
+ bus_min = be32_to_cpup(prop);
+ if (bus->number == bus_min)
+ return np;
+ }
+ return NULL;
+}
+
static int x86_of_pci_irq_enable(struct pci_dev *dev)
{
struct of_irq oirq;
@@ -165,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
void __cpuinit x86_of_pci_init(void)
{
- struct device_node *np;
-
pcibios_enable_irq = x86_of_pci_irq_enable;
pcibios_disable_irq = x86_of_pci_irq_disable;
-
- for_each_node_by_type(np, "pci") {
- const void *prop;
- struct pci_bus *bus;
- unsigned int bus_min;
- struct device_node *child;
-
- prop = of_get_property(np, "bus-range", NULL);
- if (!prop)
- continue;
- bus_min = be32_to_cpup(prop);
-
- bus = pci_find_bus(0, bus_min);
- if (!bus) {
- printk(KERN_ERR "Can't find a node for bus %s.\n",
- np->full_name);
- continue;
- }
-
- if (bus->self)
- bus->self->dev.of_node = np;
- else
- bus->dev.of_node = np;
-
- for_each_child_of_node(np, child) {
- struct pci_dev *dev;
- u32 devfn;
-
- prop = of_get_property(child, "reg", NULL);
- if (!prop)
- continue;
-
- devfn = (be32_to_cpup(prop) >> 8) & 0xff;
- dev = pci_get_slot(bus, devfn);
- if (!dev)
- continue;
- dev->dev.of_node = child;
- pci_dev_put(dev);
- }
- }
}
#endif
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d..19853ad8afc 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
}
/*
- * We are returning from the irq stack and go to the previous one.
- * If the previous stack is also in the irq stack, then bp in the first
- * frame of the irq stack points to the previous, interrupted one.
- * Otherwise we have another level of indirection: We first save
- * the bp of the previous stack, then we switch the stack to the irq one
- * and save a new bp that links to the previous one.
- * (See save_args())
- */
-static inline unsigned long
-fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
- unsigned long *irq_stack, unsigned long *irq_stack_end)
-{
-#ifdef CONFIG_FRAME_POINTER
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long next;
-
- if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
- if (!probe_kernel_address(&frame->next_frame, next))
- return next;
- else
- WARN_ONCE(1, "Perf: bad frame pointer = %p in "
- "callchain\n", &frame->next_frame);
- }
-#endif
- return bp;
-}
-
-/*
* x86-64 can have up to three kernel stacks:
* process stack
* interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
task = current;
if (!stack) {
- stack = &dummy;
- if (task && task != current)
+ if (regs)
+ stack = (unsigned long *)regs->sp;
+ else if (task && task != current)
stack = (unsigned long *)task->thread.sp;
+ else
+ stack = &dummy;
}
if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
* pointer (index -1 to end) in the IRQ stack:
*/
stack = (unsigned long *) (irq_stack_end[-1]);
- bp = fixup_bp_irq_link(bp, stack, irq_stack,
- irq_stack_end);
irq_stack_end = NULL;
ops->stack(data, "EOI");
continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0c989..37e895a1c74 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -297,27 +297,26 @@ ENDPROC(native_usergs_sysret64)
.endm
/* save partial stack frame */
- .pushsection .kprobes.text, "ax"
-ENTRY(save_args)
- XCPT_FRAME
+ .macro SAVE_ARGS_IRQ
cld
- /*
- * start from rbp in pt_regs and jump over
- * return address.
- */
- movq_cfi rdi, RDI+8-RBP
- movq_cfi rsi, RSI+8-RBP
- movq_cfi rdx, RDX+8-RBP
- movq_cfi rcx, RCX+8-RBP
- movq_cfi rax, RAX+8-RBP
- movq_cfi r8, R8+8-RBP
- movq_cfi r9, R9+8-RBP
- movq_cfi r10, R10+8-RBP
- movq_cfi r11, R11+8-RBP
-
- leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
- movq_cfi rbp, 8 /* push %rbp */
- leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ /* rsp points at the rbp slot in pt_regs; offsets are relative to it */
+ movq_cfi rdi, RDI-RBP
+ movq_cfi rsi, RSI-RBP
+ movq_cfi rdx, RDX-RBP
+ movq_cfi rcx, RCX-RBP
+ movq_cfi rax, RAX-RBP
+ movq_cfi r8, R8-RBP
+ movq_cfi r9, R9-RBP
+ movq_cfi r10, R10-RBP
+ movq_cfi r11, R11-RBP
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
testl $3, CS(%rdi)
je 1f
SWAPGS
@@ -329,19 +328,14 @@ ENTRY(save_args)
*/
1: incl PER_CPU_VAR(irq_count)
jne 2f
- popq_cfi %rax /* move return address... */
mov PER_CPU_VAR(irq_stack_ptr),%rsp
EMPTY_FRAME 0
- pushq_cfi %rbp /* backlink for unwinder */
- pushq_cfi %rax /* ... to the new stack */
- /*
- * We entered an interrupt context - irqs are off:
- */
-2: TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-END(save_args)
- .popsection
+
+2: /* Store previous stack value */
+ pushq %rsi
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+ .endm
ENTRY(save_rest)
PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +467,7 @@ ENTRY(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1
+ SAVE_ARGS 8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +502,7 @@ sysret_check:
TRACE_IRQS_ON
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 0,-ARG_SKIP,1
+ RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -791,7 +785,7 @@ END(interrupt)
/* reserve pt_regs for scratch regs and rbp */
subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
- call save_args
+ SAVE_ARGS_IRQ
PARTIAL_FRAME 0
call \func
.endm
@@ -814,15 +808,14 @@ ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
- leaveq
- CFI_RESTORE rbp
+ /* Restore saved previous stack */
+ popq %rsi
+ leaq 16(%rsi), %rsp
+
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
+ CFI_ADJUST_CFA_OFFSET -16
- /* we did not save rbx, restore only from ARGOFFSET */
- addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +851,7 @@ retint_restore_args: /* return to kernel space */
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 0,8,0
+ RESTORE_ARGS 1,8,1
irq_return:
INTERRUPT_RETURN
@@ -991,11 +984,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 6781765b3a0..0f4b0651cd3 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/i8253.h>
#include <linux/slab.h>
#include <linux/hpet.h>
#include <linux/init.h>
@@ -12,8 +13,8 @@
#include <linux/io.h>
#include <asm/fixmap.h>
-#include <asm/i8253.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#define HPET_MASK CLOCKSOURCE_MASK(32)
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index fb66dc9e36c..f2b96de3c7c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,113 +3,24 @@
*
*/
#include <linux/clockchips.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/timex.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/i8253.h>
-#include <asm/i8253.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#include <asm/smp.h>
-DEFINE_RAW_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
/*
* HPET replaces the PIT, when enabled. So we need to know, which of
* the two timers is used
*/
struct clock_event_device *global_clock_event;
-/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /* binary, mode 2, LSB/MSB, ch 0 */
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
- outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
- break;
-
- case CLOCK_EVT_MODE_SHUTDOWN:
- case CLOCK_EVT_MODE_UNUSED:
- if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
- evt->mode == CLOCK_EVT_MODE_ONESHOT) {
- outb_pit(0x30, PIT_MODE);
- outb_pit(0, PIT_CH0);
- outb_pit(0, PIT_CH0);
- }
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- /* One shot setup */
- outb_pit(0x38, PIT_MODE);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
- raw_spin_unlock(&i8253_lock);
-}
-
-/*
- * Program the next event in oneshot mode
- *
- * Delta is given in PIT ticks
- */
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
- outb_pit(delta & 0xff , PIT_CH0); /* LSB */
- outb_pit(delta >> 8 , PIT_CH0); /* MSB */
- raw_spin_unlock(&i8253_lock);
-
- return 0;
-}
-
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
- *
- * The profiling and update capabilities are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-static struct clock_event_device pit_ce = {
- .name = "pit",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = init_pit_timer,
- .set_next_event = pit_next_event,
- .irq = 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
- */
void __init setup_pit_timer(void)
{
- /*
- * Start pit with the boot cpu mask and make it global after the
- * IO_APIC has been initialized.
- */
- pit_ce.cpumask = cpumask_of(smp_processor_id());
-
- clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
- global_clock_event = &pit_ce;
+ clockevent_i8253_init(true);
+ global_clock_event = &i8253_clockevent;
}
#ifndef CONFIG_X86_64
@@ -123,7 +34,7 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
+ i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
return 0;
return clocksource_i8253_init();
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4ef993..f09d4bbe2d2 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -272,9 +272,6 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_X86_MCE_THRESHOLD
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
- alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b..00354d4919a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
return register_die_notifier(&kgdb_notifier);
}
-static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+static void kgdb_hw_overflow_handler(struct perf_event *event,
struct perf_sample_data *data, struct pt_regs *regs)
{
struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
for (i = 0; i < HBP_NUM; i++) {
if (breakinfo[i].pev)
continue;
- breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
if (IS_ERR((void * __force)breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab1..591be0ee193 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
unsigned int mpb[0];
};
-#define UCODE_CONTAINER_SECTION_HDR 8
-#define UCODE_CONTAINER_HEADER_SIZE 12
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
static struct equiv_cpu_entry *equiv_cpu_table;
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
- unsigned int max_size, actual_size;
+ u32 max_size, actual_size;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
break;
}
- actual_size = buf[4] + (buf[5] << 8);
+ actual_size = *(u32 *)(buf + 4);
- if (actual_size > size || actual_size > max_size) {
+ if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
pr_err("section size mismatch\n");
return 0;
}
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
struct microcode_header_amd *mc = NULL;
unsigned int actual_size = 0;
- if (buf[0] != UCODE_UCODE_TYPE) {
+ if (*(u32 *)buf != UCODE_UCODE_TYPE) {
pr_err("invalid type field in container file section header\n");
goto out;
}
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
if (!mc)
goto out;
- get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
- *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
+ get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
+ *mc_size = actual_size + SECTION_HDR_SIZE;
out:
return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
return -ENOMEM;
}
- get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
+ get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
- return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+ /* add header length */
+ return size + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 807c2a2b80f..82528799c5d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
return ret;
}
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;
- bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+ bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
/*
* CHECKME: the previous code returned -EIO if the addr wasn't
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 4f0d46fefa7..9242436e993 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -419,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9fd3137230d..9f548cb4a95 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -438,7 +438,7 @@ static void impress_friends(void)
void __inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- char *names[] = { "ID", "VERSION", "SPIV" };
+ const char * const names[] = { "ID", "VERSION", "SPIV" };
int timeout;
u32 status;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 55d9bc03f69..fdd0c6430e5 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
}
EXPORT_SYMBOL_GPL(save_stack_trace);
-void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 30ac65df7d4..e07a2fc876b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
#include <asm/bootparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
#include <asm/fixmap.h>
#include <asm/proto.h>
#include <asm/setup.h>
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 00cbb272627..5a64d057be5 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,13 +11,13 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/mca.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/i8259.h>
-#include <asm/i8253.h>
#include <asm/timer.h>
#include <asm/hpet.h>
#include <asm/time.h>
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index db832fd65ec..13ee258442a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -71,7 +71,8 @@
#include <asm/stackprotector.h>
#include <asm/reboot.h> /* for struct machine_ops */
-/*G:010 Welcome to the Guest!
+/*G:010
+ * Welcome to the Guest!
*
* The Guest in our tale is a simple creature: identical to the Host but
* behaving in simplified but equivalent ways. In particular, the Guest is the
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
#endif
/*G:036
- * When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls.
-:*/
+ * When lazy mode is turned off, we issue the do-nothing hypercall to
+ * flush any stored calls, and call the generic helper to reset the
+ * per-cpu lazy mode variable.
+ */
static void lguest_leave_lazy_mmu_mode(void)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
paravirt_leave_lazy_mmu();
}
+/*
+ * We also catch the end of context switch; we enter lazy mode for much of
+ * that too, so again we need to flush here.
+ *
+ * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
+ * mode, but unlike Xen, lguest doesn't care about the difference).
+ */
static void lguest_end_context_switch(struct task_struct *next)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
* giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
*
* This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 5 languages. I am not making this up!
+ * has been translated into 6 languages. I am not making this up!
*
* We could get funky here and identify ourselves as "GenuineLguest", but
* instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
/*
* PAE systems can mark pages as non-executable. Linux calls this the
* NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just switch turn if off here, since we don't
+ * Virus Protection). We just switch it off here, since we don't
* support it.
*/
case 0x80000001:
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)
/* See lguest_set_pte() below. */
static bool cr3_changed = false;
+static unsigned long current_cr3;
/*
* cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0. Keep a local copy, and tell the Host when it changes.
*/
static void lguest_write_cr3(unsigned long cr3)
{
- lguest_data.pgdir = cr3;
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+ current_cr3 = cr3;
/* These two page tables are simple, linear, and used during boot */
if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)
static unsigned long lguest_read_cr3(void)
{
- return lguest_data.pgdir;
+ return current_cr3;
}
/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)
/*
* The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. Wetell the Host the toplevel and
+ * a page into a process' address space. We tell the Host the toplevel and
* address this corresponds to. The Guest uses one pagetable per process, so
* we need to tell the Host which one we're changing (mm->pgd).
*/
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
static void lguest_flush_tlb_single(unsigned long addr)
{
/* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+ lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
}
/*
@@ -1140,7 +1148,7 @@ static struct notifier_block paniced = {
static __init char *lguest_memory_setup(void)
{
/*
- *The Linux bootloader header contains an "e820" memory map: the
+ * The Linux bootloader header contains an "e820" memory map: the
* Launcher populated the first entry with our memory limit.
*/
e820_add_region(boot_params.e820_map[0].addr,
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d5..6ddfe4fc23c 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,18 +6,22 @@
#include <asm/processor-flags.h>
/*G:020
- * Our story starts with the kernel booting into startup_32 in
- * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
- * the bootloader (the Launcher in our case).
+
+ * Our story starts with the bzImage: booting starts at startup_32 in
+ * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
+ * kernel in place and then jumps into it: startup_32 in
+ * arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi
+ * register, which is created by the bootloader (the Launcher in our case).
*
* The startup_32 function does very little: it clears the uninitialized global
* C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe. Finally it checks the
- * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen:
- * if it's set to '1' (lguest's assigned number), then it calls us here.
+ * header and kernel command line somewhere safe, and populates some initial
+ * page tables. Finally it checks the 'hardware_subarch' field. This was
+ * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
+ * assigned number), then it calls us here.
*
* WARNING: be very careful here! We're running at addresses equal to physical
- * addesses (around 0), not above PAGE_OFFSET as most code expectes
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data without remembering to subtract __PAGE_OFFSET!
*
@@ -27,13 +31,18 @@
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
/*
- * We make the "initialization" hypercall now to tell the Host about
- * us, and also find out where it put our page tables.
+ * We make the "initialization" hypercall now to tell the Host where
+ * our lguest_data struct is.
*/
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx
int $LGUEST_TRAP_ENTRY
+ /* Now turn our pagetables on; set up by arch/x86/kernel/head_32.S. */
+ movl $LHCALL_NEW_PGTABLE, %eax
+ movl $(initial_page_table - __PAGE_OFFSET), %ebx
+ int $LGUEST_TRAP_ENTRY
+
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
@@ -96,12 +105,8 @@ send_interrupts:
*/
pushl %eax
movl $LHCALL_SEND_INTERRUPTS, %eax
- /*
- * This is a vmcall instruction (same thing that KVM uses). Older
- * assembler versions might not know the "vmcall" instruction, so we
- * create one manually here.
- */
- .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ /* This is the actual hypercall trap. */
+ int $LGUEST_TRAP_ENTRY
/* Put eax back the way we found it. */
popl %eax
ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2479f19ddd..b00f6785da7 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -18,8 +18,10 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
-lib-y += usercopy_$(BITS).o getuser.o putuser.o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-$(CONFIG_SMP) += rwlock.o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
obj-y += msr.o msr-reg.o msr-reg-export.o
@@ -29,7 +31,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
- lib-y += semaphore_32.o string_32.o
+ lib-y += string_32.o
lib-y += cmpxchg.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
@@ -40,7 +42,6 @@ else
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
- lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
- lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+ lib-y += copy_user_64.o copy_user_nocache_64.o
lib-y += cmpxchg16b_emu.o
endif
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
new file mode 100644
index 00000000000..1cad22139c8
--- /dev/null
+++ b/arch/x86/lib/rwlock.S
@@ -0,0 +1,44 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/rwlock.h>
+
+#ifdef CONFIG_X86_32
+# define __lock_ptr eax
+#else
+# define __lock_ptr rdi
+#endif
+
+ENTRY(__write_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
+1: rep; nop
+ cmpl $WRITE_LOCK_CMP, (%__lock_ptr)
+ jne 1b
+ LOCK_PREFIX
+ WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
+ jnz 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__write_lock_failed)
+
+ENTRY(__read_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ READ_LOCK_SIZE(inc) (%__lock_ptr)
+1: rep; nop
+ READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
+ js 1b
+ LOCK_PREFIX
+ READ_LOCK_SIZE(dec) (%__lock_ptr)
+ js 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
deleted file mode 100644
index 05ea55f7140..00000000000
--- a/arch/x86/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- addl $RW_LOCK_BIAS,(%rdi)
-1: rep
- nop
- cmpl $RW_LOCK_BIAS,(%rdi)
- jne 1b
- LOCK_PREFIX
- subl $RW_LOCK_BIAS,(%rdi)
- jnz __write_lock_failed
- ret
- CFI_ENDPROC
-END(__write_lock_failed)
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- incl (%rdi)
-1: rep
- nop
- cmpl $1,(%rdi)
- js 1b
- LOCK_PREFIX
- decl (%rdi)
- js __read_lock_failed
- ret
- CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem.S
index 67743977398..5dff5f04246 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem.S
@@ -1,4 +1,51 @@
/*
+ * x86 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
+#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
+
+#ifdef CONFIG_X86_32
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allows us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax which is either a return
+ * value or just clobbered.
+ */
+
+#define save_common_regs \
+ pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+
+#define restore_common_regs \
+ popl_cfi %ecx; CFI_RESTORE ecx
+
+ /* Avoid uglifying the argument copying x86-64 needs to do. */
+ .macro movq src, dst
+ .endm
+
+#else
+
+/*
* x86-64 rwsem wrappers
*
* This interfaces the inline asm code to the slow-path
@@ -16,12 +63,6 @@
* but %rdi, %rsi, %rcx, %r8-r11 always need saving.
*/
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
#define save_common_regs \
pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
@@ -40,16 +81,18 @@
popq_cfi %rsi; CFI_RESTORE rsi; \
popq_cfi %rdi; CFI_RESTORE rdi
+#endif
+
/* Fix up special calling conventions */
ENTRY(call_rwsem_down_read_failed)
CFI_STARTPROC
save_common_regs
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_down_read_failed
- popq_cfi %rdx
- CFI_RESTORE rdx
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
@@ -67,7 +110,8 @@ ENDPROC(call_rwsem_down_write_failed)
ENTRY(call_rwsem_wake)
CFI_STARTPROC
- decl %edx /* do nothing if still outstanding active readers */
+ /* do nothing if still outstanding active readers */
+ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
jnz 1f
save_common_regs
movq %rax,%rdi
@@ -77,16 +121,15 @@ ENTRY(call_rwsem_wake)
CFI_ENDPROC
ENDPROC(call_rwsem_wake)
-/* Fix up special calling conventions */
ENTRY(call_rwsem_downgrade_wake)
CFI_STARTPROC
save_common_regs
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_downgrade_wake
- popq_cfi %rdx
- CFI_RESTORE rdx
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
deleted file mode 100644
index 06691daa410..00000000000
--- a/arch/x86/lib/semaphore_32.S
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
- .section .sched.text, "ax"
-
-/*
- * rw spinlock fallbacks
- */
-#ifdef CONFIG_SMP
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- FRAME
-2: LOCK_PREFIX
- addl $ RW_LOCK_BIAS,(%eax)
-1: rep; nop
- cmpl $ RW_LOCK_BIAS,(%eax)
- jne 1b
- LOCK_PREFIX
- subl $ RW_LOCK_BIAS,(%eax)
- jnz 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__write_lock_failed)
-
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- FRAME
-2: LOCK_PREFIX
- incl (%eax)
-1: rep; nop
- cmpl $1,(%eax)
- js 1b
- LOCK_PREFIX
- decl (%eax)
- js 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__read_lock_failed)
-
-#endif
-
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_down_read_failed)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- pushl_cfi %edx
- CFI_REL_OFFSET edx,0
- call rwsem_down_read_failed
- popl_cfi %edx
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_read_failed)
-
-ENTRY(call_rwsem_down_write_failed)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- calll rwsem_down_write_failed
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_write_failed)
-
-ENTRY(call_rwsem_wake)
- CFI_STARTPROC
- decw %dx /* do nothing if still outstanding active readers */
- jnz 1f
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- call rwsem_wake
- popl_cfi %ecx
-1: ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_wake)
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_downgrade_wake)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- pushl_cfi %edx
- CFI_REL_OFFSET edx,0
- call rwsem_downgrade_wake
- popl_cfi %edx
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_downgrade_wake)
-
-#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 782b082c9ff..a63efd6bb6a 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -5,50 +5,41 @@
* Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
- #include <linux/linkage.h>
- #include <asm/dwarf2.h>
- #include <asm/calling.h>
- #include <asm/rwlock.h>
-
- /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
- .macro thunk name,func
- .globl \name
-\name:
- CFI_STARTPROC
- SAVE_ARGS
- call \func
- jmp restore
- CFI_ENDPROC
- .endm
-
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* put return address in rdi (arg1) */
- .macro thunk_ra name,func
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
\name:
CFI_STARTPROC
+
+ /* this one pushes 9 elems, the next one would be %rIP */
SAVE_ARGS
- /* SAVE_ARGS pushs 9 elements */
- /* the next element would be the rip */
- movq 9*8(%rsp), %rdi
+
+ .if \put_ret_addr_in_rdi
+ movq_cfi_restore 9*8, rdi
+ .endif
+
call \func
jmp restore
CFI_ENDPROC
.endm
- thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
- thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#ifdef CONFIG_TRACE_IRQFLAGS
+ THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- thunk lockdep_sys_exit_thunk,lockdep_sys_exit
+ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
-
+
/* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
SAVE_ARGS
restore:
RESTORE_ARGS
- ret
+ ret
CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 00000000000..97be9cb5448
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,43 @@
+/*
+ * User address space access functions.
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+/*
+ * best effort, GUP based copy_from_user() that is NMI-safe
+ */
+unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long offset, addr = (unsigned long)from;
+ unsigned long size, len = 0;
+ struct page *page;
+ void *map;
+ int ret;
+
+ do {
+ ret = __get_user_pages_fast(addr, 1, 0, &page);
+ if (!ret)
+ break;
+
+ offset = addr & (PAGE_SIZE - 1);
+ size = min(PAGE_SIZE - offset, n - len);
+
+ map = kmap_atomic(page);
+ memcpy(to, map+offset, size);
+ kunmap_atomic(map);
+ put_page(page);
+
+ len += size;
+ to += size;
+ addr += size;
+
+ } while (len < n);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2dbf6bf4c7e..4d09df054e3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
* If we're in an interrupt, have no user context or are running
@@ -1161,11 +1161,11 @@ good_area:
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
regs, address);
} else {
tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d865c4aeec5..bbaaa005bf0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -28,6 +28,7 @@
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
+#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
@@ -895,8 +896,6 @@ const char *arch_vma_name(struct vm_area_struct *vma)
}
#ifdef CONFIG_X86_UV
-#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS)
-
unsigned long memory_block_size_bytes(void)
{
if (is_uv_system()) {
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 704a37ceddd..dab41876cdd 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_regs(&e->trace, regs);
+ save_stack_trace_regs(regs, &e->trace);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index e1d10690921..b0086567271 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -123,12 +123,11 @@ static int pageattr_test(void)
if (print)
printk(KERN_INFO "CPA self-test:\n");
- bm = vmalloc((max_pfn_mapped + 7) / 8);
+ bm = vzalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
return -ENOMEM;
}
- memset(bm, 0, (max_pfn_mapped + 7) / 8);
failed += print_split(&sa);
srandom32(100);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index a5b64ab4cd6..bff89dfe361 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -11,10 +11,11 @@
#include <linux/oprofile.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
#include <asm/ptrace.h>
-#include <asm/uaccess.h>
#include <asm/stacktrace.h>
-#include <linux/compat.h>
static int backtrace_stack(void *data, char *name)
{
@@ -40,13 +41,13 @@ static struct stacktrace_ops backtrace_ops = {
static struct stack_frame_ia32 *
dump_user_backtrace_32(struct stack_frame_ia32 *head)
{
+ /* Also check accessibility of one struct frame_head beyond: */
struct stack_frame_ia32 bufhead[2];
struct stack_frame_ia32 *fp;
+ unsigned long bytes;
- /* Also check accessibility of one struct frame_head beyond */
- if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
- return NULL;
- if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != sizeof(bufhead))
return NULL;
fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
@@ -87,12 +88,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
{
+ /* Also check accessibility of one struct frame_head beyond: */
struct stack_frame bufhead[2];
+ unsigned long bytes;
- /* Also check accessibility of one struct stack_frame beyond */
- if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
- return NULL;
- if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != sizeof(bufhead))
return NULL;
oprofile_add_trace(bufhead[0].return_address);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index f567965c062..1017c7bee38 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -1,8 +1,13 @@
/*
- * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
- * x86 PCI core to support the Xen PCI Frontend
+ * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and
+ * initial domain support. We also handle the DSDT _PRT callbacks for GSI's
+ * used in HVM and initial domain mode (PV does not parse ACPI, so it has no
+ * concept of GSIs). Under PV we hook under the pcibios API for IRQs and
+ * 0xcf8 PCI configuration read/write.
*
* Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Stefano Stabellini <stefano.stabellini@eu.citrix.com>
*/
#include <linux/module.h>
#include <linux/init.h>
@@ -19,22 +24,53 @@
#include <xen/events.h>
#include <asm/xen/pci.h>
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+ int rc;
+ int share = 1;
+ int pirq;
+ u8 gsi;
+
+ rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+ rc);
+ return rc;
+ }
+ /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
+ pirq = gsi;
+
+ if (gsi < NR_IRQS_LEGACY)
+ share = 0;
+
+ rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
+ gsi, pirq, rc);
+ return rc;
+ }
+
+ dev->irq = rc;
+ dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
+ return 0;
+}
+
#ifdef CONFIG_ACPI
-static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
- int trigger, int polarity)
+static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
+ bool set_pirq)
{
- int rc, irq;
+ int rc, pirq = -1, irq = -1;
struct physdev_map_pirq map_irq;
int shareable = 0;
char *name;
- if (!xen_hvm_domain())
- return -1;
+ if (set_pirq)
+ pirq = gsi;
map_irq.domid = DOMID_SELF;
map_irq.type = MAP_PIRQ_TYPE_GSI;
map_irq.index = gsi;
- map_irq.pirq = -1;
+ map_irq.pirq = pirq;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
if (rc) {
@@ -42,7 +78,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
return -1;
}
- if (trigger == ACPI_EDGE_SENSITIVE) {
+ if (triggering == ACPI_EDGE_SENSITIVE) {
shareable = 0;
name = "ioapic-edge";
} else {
@@ -50,12 +86,63 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
name = "ioapic-level";
}
+ if (gsi_override >= 0)
+ gsi = gsi_override;
+
irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
+ if (irq < 0)
+ goto out;
+
+ printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi);
+out:
+ return irq;
+}
+
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ if (!xen_hvm_domain())
+ return -1;
- printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+ return xen_register_pirq(gsi, -1 /* no GSI override */, trigger,
+ false /* no mapping of GSI to PIRQ */);
+}
+
+#ifdef CONFIG_XEN_DOM0
+static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
+{
+ int rc, irq;
+ struct physdev_setup_gsi setup_gsi;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+ gsi, triggering, polarity);
+
+ irq = xen_register_pirq(gsi, gsi_override, triggering, true);
+
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (rc == -EEXIST)
+ printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+ else if (rc) {
+ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+ gsi, rc);
+ }
return irq;
}
+
+static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
+}
+#endif
#endif
#if defined(CONFIG_PCI_MSI)
@@ -65,6 +152,43 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
struct xen_pci_frontend_ops *xen_pci_frontend;
EXPORT_SYMBOL_GPL(xen_pci_frontend);
+static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret, i;
+ struct msi_desc *msidesc;
+ int *v;
+
+ v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ if (type == PCI_CAP_ID_MSIX)
+ ret = xen_pci_frontend_enable_msix(dev, v, nvec);
+ else
+ ret = xen_pci_frontend_enable_msi(dev, v);
+ if (ret)
+ goto error;
+ i = 0;
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
+ (type == PCI_CAP_ID_MSIX) ?
+ "pcifront-msi-x" :
+ "pcifront-msi",
+ DOMID_SELF);
+ if (irq < 0)
+ goto free;
+ i++;
+ }
+ kfree(v);
+ return 0;
+
+error:
+ dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+free:
+ kfree(v);
+ return ret;
+}
+
#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
@@ -123,67 +247,6 @@ error:
return -ENODEV;
}
-/*
- * For MSI interrupts we have to use drivers/xen/event.s functions to
- * allocate an irq_desc and setup the right */
-
-
-static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
- int irq, ret, i;
- struct msi_desc *msidesc;
- int *v;
-
- v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
- if (!v)
- return -ENOMEM;
-
- if (type == PCI_CAP_ID_MSIX)
- ret = xen_pci_frontend_enable_msix(dev, v, nvec);
- else
- ret = xen_pci_frontend_enable_msi(dev, v);
- if (ret)
- goto error;
- i = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
- (type == PCI_CAP_ID_MSIX) ?
- "pcifront-msi-x" :
- "pcifront-msi",
- DOMID_SELF);
- if (irq < 0)
- goto free;
- i++;
- }
- kfree(v);
- return 0;
-
-error:
- dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
-free:
- kfree(v);
- return ret;
-}
-
-static void xen_teardown_msi_irqs(struct pci_dev *dev)
-{
- struct msi_desc *msidesc;
-
- msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
- if (msidesc->msi_attrib.is_msix)
- xen_pci_frontend_disable_msix(dev);
- else
- xen_pci_frontend_disable_msi(dev);
-
- /* Free the IRQ's and the msidesc using the generic code. */
- default_teardown_msi_irqs(dev);
-}
-
-static void xen_teardown_msi_irq(unsigned int irq)
-{
- xen_destroy_irq(irq);
-}
-
#ifdef CONFIG_XEN_DOM0
static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
@@ -242,45 +305,28 @@ out:
return ret;
}
#endif
-#endif
-static int xen_pcifront_enable_irq(struct pci_dev *dev)
+static void xen_teardown_msi_irqs(struct pci_dev *dev)
{
- int rc;
- int share = 1;
- int pirq;
- u8 gsi;
-
- rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
- rc);
- return rc;
- }
-
- rc = xen_allocate_pirq_gsi(gsi);
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
- gsi, rc);
- return rc;
- }
- pirq = rc;
+ struct msi_desc *msidesc;
- if (gsi < NR_IRQS_LEGACY)
- share = 0;
+ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ if (msidesc->msi_attrib.is_msix)
+ xen_pci_frontend_disable_msix(dev);
+ else
+ xen_pci_frontend_disable_msi(dev);
- rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
- gsi, pirq, rc);
- return rc;
- }
+ /* Free the IRQ's and the msidesc using the generic code. */
+ default_teardown_msi_irqs(dev);
+}
- dev->irq = rc;
- dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
- return 0;
+static void xen_teardown_msi_irq(unsigned int irq)
+{
+ xen_destroy_irq(irq);
}
+#endif
+
int __init pci_xen_init(void)
{
if (!xen_pv_domain() || xen_initial_domain())
@@ -327,79 +373,6 @@ int __init pci_xen_hvm_init(void)
}
#ifdef CONFIG_XEN_DOM0
-static int xen_register_pirq(u32 gsi, int gsi_override, int triggering)
-{
- int rc, pirq, irq = -1;
- struct physdev_map_pirq map_irq;
- int shareable = 0;
- char *name;
-
- if (!xen_pv_domain())
- return -1;
-
- if (triggering == ACPI_EDGE_SENSITIVE) {
- shareable = 0;
- name = "ioapic-edge";
- } else {
- shareable = 1;
- name = "ioapic-level";
- }
- pirq = xen_allocate_pirq_gsi(gsi);
- if (pirq < 0)
- goto out;
-
- if (gsi_override >= 0)
- irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name);
- else
- irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
- if (irq < 0)
- goto out;
-
- printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi);
-
- map_irq.domid = DOMID_SELF;
- map_irq.type = MAP_PIRQ_TYPE_GSI;
- map_irq.index = gsi;
- map_irq.pirq = pirq;
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
- if (rc) {
- printk(KERN_WARNING "xen map irq failed %d\n", rc);
- return -1;
- }
-
-out:
- return irq;
-}
-
-static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
-{
- int rc, irq;
- struct physdev_setup_gsi setup_gsi;
-
- if (!xen_pv_domain())
- return -1;
-
- printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
- gsi, triggering, polarity);
-
- irq = xen_register_pirq(gsi, gsi_override, triggering);
-
- setup_gsi.gsi = gsi;
- setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
- setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
- if (rc == -EEXIST)
- printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
- else if (rc) {
- printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
- gsi, rc);
- }
-
- return irq;
-}
-
static __init void xen_setup_acpi_sci(void)
{
int rc;
@@ -419,7 +392,7 @@ static __init void xen_setup_acpi_sci(void)
}
trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
-
+
printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
"polarity=%d\n", gsi, trigger, polarity);
@@ -434,10 +407,9 @@ static __init void xen_setup_acpi_sci(void)
* the ACPI interpreter and keels over since IRQ 9 has not been
* setup as we had setup IRQ 20 for it).
*/
- /* Check whether the GSI != IRQ */
if (acpi_gsi_to_irq(gsi, &irq) == 0) {
- if (irq >= 0 && irq != gsi)
- /* Bugger, we MUST have that IRQ. */
+ /* Use the provided value if it's valid. */
+ if (irq >= 0)
gsi_override = irq;
}
@@ -447,41 +419,16 @@ static __init void xen_setup_acpi_sci(void)
return;
}
-static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
- int trigger, int polarity)
+int __init pci_xen_initial_domain(void)
{
- return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
-}
+ int irq;
-static int __init pci_xen_initial_domain(void)
-{
#ifdef CONFIG_PCI_MSI
x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
#endif
xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
-
- return 0;
-}
-
-void __init xen_setup_pirqs(void)
-{
- int pirq, irq;
-
- pci_xen_initial_domain();
-
- if (0 == nr_ioapics) {
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
- pirq = xen_allocate_pirq_gsi(irq);
- if (WARN(pirq < 0,
- "Could not allocate PIRQ for legacy interrupt\n"))
- break;
- irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
- }
- return;
- }
-
/* Pre-allocate legacy irqs */
for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
int trigger, polarity;
@@ -490,12 +437,16 @@ void __init xen_setup_pirqs(void)
continue;
xen_register_pirq(irq, -1 /* no GSI override */,
- trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
+ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
+ true /* Map GSI to PIRQ */);
}
+ if (0 == nr_ioapics) {
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+ xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
+ }
+ return 0;
}
-#endif
-#ifdef CONFIG_XEN_DOM0
struct xen_device_domain_owner {
domid_t domain;
struct pci_dev *dev;
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 899e393d8e7..3ae4128013e 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -51,7 +51,17 @@
int efi_enabled;
EXPORT_SYMBOL(efi_enabled);
-struct efi efi;
+struct efi __read_mostly efi = {
+ .mps = EFI_INVALID_TABLE_ADDR,
+ .acpi = EFI_INVALID_TABLE_ADDR,
+ .acpi20 = EFI_INVALID_TABLE_ADDR,
+ .smbios = EFI_INVALID_TABLE_ADDR,
+ .sal_systab = EFI_INVALID_TABLE_ADDR,
+ .boot_info = EFI_INVALID_TABLE_ADDR,
+ .hcdp = EFI_INVALID_TABLE_ADDR,
+ .uga = EFI_INVALID_TABLE_ADDR,
+ .uv_systab = EFI_INVALID_TABLE_ADDR,
+};
EXPORT_SYMBOL(efi);
struct efi_memory_map memmap;
@@ -79,26 +89,50 @@ early_param("add_efi_memmap", setup_add_efi_memmap);
static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
- return efi_call_virt2(get_time, tm, tc);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt2(get_time, tm, tc);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_set_time(efi_time_t *tm)
{
- return efi_call_virt1(set_time, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt1(set_time, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
efi_bool_t *pending,
efi_time_t *tm)
{
- return efi_call_virt3(get_wakeup_time,
- enabled, pending, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt3(get_wakeup_time,
+ enabled, pending, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
{
- return efi_call_virt2(set_wakeup_time,
- enabled, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt2(set_wakeup_time,
+ enabled, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_get_variable(efi_char16_t *name,
@@ -122,7 +156,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
static efi_status_t virt_efi_set_variable(efi_char16_t *name,
efi_guid_t *vendor,
- unsigned long attr,
+ u32 attr,
unsigned long data_size,
void *data)
{
@@ -131,6 +165,18 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
data_size, data);
}
+static efi_status_t virt_efi_query_variable_info(u32 attr,
+ u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt4(query_variable_info, attr, storage_space,
+ remaining_space, max_variable_size);
+}
+
static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
{
return efi_call_virt1(get_next_high_mono_count, count);
@@ -145,6 +191,28 @@ static void virt_efi_reset_system(int reset_type,
data_size, data);
}
+static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
+ unsigned long count,
+ unsigned long sg_list)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt3(update_capsule, capsules, count, sg_list);
+}
+
+static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+ unsigned long count,
+ u64 *max_size,
+ int *reset_type)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
+ reset_type);
+}
+
static efi_status_t __init phys_efi_set_virtual_address_map(
unsigned long memory_map_size,
unsigned long descriptor_size,
@@ -164,11 +232,14 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
efi_time_cap_t *tc)
{
+ unsigned long flags; /* filled by spin_lock_irqsave(), consumed by the matching irqrestore */
efi_status_t status;
+ spin_lock_irqsave(&rtc_lock, flags); /* taken before efi_call_phys_prelog(), so the whole prelog/call/epilog sequence is under rtc_lock */
efi_call_phys_prelog();
status = efi_call_phys2(efi_phys.get_time, tm, tc);
efi_call_phys_epilog();
+ spin_unlock_irqrestore(&rtc_lock, flags); /* released only after efi_call_phys_epilog() has run */
return status;
}
@@ -669,6 +740,9 @@ void __init efi_enter_virtual_mode(void)
efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
efi.reset_system = virt_efi_reset_system;
efi.set_virtual_address_map = NULL;
+ efi.query_variable_info = virt_efi_query_variable_info;
+ efi.update_capsule = virt_efi_update_capsule;
+ efi.query_capsule_caps = virt_efi_query_capsule_caps;
if (__supported_pte_mask & _PAGE_NX)
runtime_code_page_mkexec();
early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 17c565de3d6..a6575b949b1 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -18,5 +18,5 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5525163a039..53257421082 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1248,6 +1248,14 @@ asmlinkage void __init xen_start_kernel(void)
if (pci_xen)
x86_init.pci.arch_init = pci_xen_init;
} else {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+
/* Make sure ACS will be enabled */
pci_request_acs();
}
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 25c52f94a27..ffcf2615640 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
#ifdef CONFIG_XEN_PVHVM
static int xen_emul_unplug;
-static int __init check_platform_magic(void)
+static int check_platform_magic(void)
{
short magic;
char protocol;
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 00000000000..1cd7f4d11e2
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,67 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{ /* Fills boot_params.screen_info from *info; size bounds which fields of *info are read. */
+ struct screen_info *screen_info = &boot_params.screen_info;
+
+ /* Baseline values, written before info is examined. They are drawn
+ * from a dump from vgacon:startup in standard Linux. */
+ screen_info->orig_video_mode = 3;
+ screen_info->orig_video_isVGA = 1;
+ screen_info->orig_video_lines = 25;
+ screen_info->orig_video_cols = 80;
+ screen_info->orig_video_ega_bx = 3;
+ screen_info->orig_video_points = 16;
+ screen_info->orig_y = screen_info->orig_video_lines - 1; /* last of the 25 baseline lines */
+
+ switch (info->video_type) { /* no default case: other video types leave the baseline values in place */
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break; /* info does not cover all of u.text_mode_3: keep the baseline values */
+ screen_info->orig_video_lines = info->u.text_mode_3.rows;
+ screen_info->orig_video_cols = info->u.text_mode_3.columns;
+ screen_info->orig_x = info->u.text_mode_3.cursor_x;
+ screen_info->orig_y = info->u.text_mode_3.cursor_y;
+ screen_info->orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break; /* info ends before the offset of gbl_caps: keep the baseline values */
+ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB; /* replaces the baseline value 1 */
+ screen_info->lfb_width = info->u.vesa_lfb.width;
+ screen_info->lfb_height = info->u.vesa_lfb.height;
+ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info->red_size = info->u.vesa_lfb.red_size;
+ screen_info->red_pos = info->u.vesa_lfb.red_pos;
+ screen_info->green_size = info->u.vesa_lfb.green_size;
+ screen_info->green_pos = info->u.vesa_lfb.green_pos;
+ screen_info->blue_size = info->u.vesa_lfb.blue_size;
+ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info->capabilities = info->u.vesa_lfb.gbl_caps; /* copied only when size covers gbl_caps */
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; /* copied only when size covers mode_attrs */
+ break;
+ }
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 97dfdc8757b..b095739ccd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu)
}
#endif
+struct dom0_vga_console_info; /* forward declaration: the prototypes below only take a pointer to it */
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else /* !CONFIG_XEN_DOM0 */
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+ size_t size)
+{ /* empty stub: callers need no #ifdef of their own */
+}
+#endif /* CONFIG_XEN_DOM0 */
+
+
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \