diff options
497 files changed, 10926 insertions, 4893 deletions
diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX new file mode 100644 index 00000000000..641ec922017 --- /dev/null +++ b/Documentation/virtual/kvm/00-INDEX @@ -0,0 +1,24 @@ +00-INDEX + - this file. +api.txt + - KVM userspace API. +cpuid.txt + - KVM-specific cpuid leaves (x86). +devices/ + - KVM_CAP_DEVICE_CTRL userspace API. +hypercalls.txt + - KVM hypercalls. +locking.txt + - notes on KVM locks. +mmu.txt + - the x86 kvm shadow mmu. +msr.txt + - KVM-specific MSRs (x86). +nested-vmx.txt + - notes on nested virtualization for Intel x86 processors. +ppc-pv.txt + - the paravirtualization interface on PowerPC. +review-checklist.txt + - review checklist for KVM patches. +timekeeping.txt + - timekeeping virtualization for x86-based architectures. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 858aecf21db..a30035dd4c2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1122,9 +1122,9 @@ struct kvm_cpuid2 { struct kvm_cpuid_entry2 entries[0]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) struct kvm_cpuid_entry2 { __u32 function; @@ -1810,6 +1810,50 @@ registers, find a list below: PPC | KVM_REG_PPC_TLB3PS | 32 PPC | KVM_REG_PPC_EPTCFG | 32 PPC | KVM_REG_PPC_ICP_STATE | 64 + PPC | KVM_REG_PPC_TB_OFFSET | 64 + PPC | KVM_REG_PPC_SPMC1 | 32 + PPC | KVM_REG_PPC_SPMC2 | 32 + PPC | KVM_REG_PPC_IAMR | 64 + PPC | KVM_REG_PPC_TFHAR | 64 + PPC | KVM_REG_PPC_TFIAR | 64 + PPC | KVM_REG_PPC_TEXASR | 64 + PPC | KVM_REG_PPC_FSCR | 64 + PPC | KVM_REG_PPC_PSPB | 32 + PPC | KVM_REG_PPC_EBBHR | 64 + PPC | KVM_REG_PPC_EBBRR | 64 + PPC | KVM_REG_PPC_BESCR | 64 + PPC | KVM_REG_PPC_TAR | 64 + PPC | KVM_REG_PPC_DPDES | 64 + PPC | KVM_REG_PPC_DAWR | 64 + PPC | KVM_REG_PPC_DAWRX | 64 + PPC | KVM_REG_PPC_CIABR | 64 + PPC | KVM_REG_PPC_IC | 64 + PPC | KVM_REG_PPC_VTB | 64 + PPC | KVM_REG_PPC_CSIGR | 64 + PPC | KVM_REG_PPC_TACR | 64 + PPC | KVM_REG_PPC_TCSCR | 64 + PPC | KVM_REG_PPC_PID | 64 + PPC | KVM_REG_PPC_ACOP | 64 + PPC | KVM_REG_PPC_VRSAVE | 32 + PPC | KVM_REG_PPC_LPCR | 64 + PPC | KVM_REG_PPC_PPR | 64 + PPC | KVM_REG_PPC_ARCH_COMPAT 32 + PPC | KVM_REG_PPC_TM_GPR0 | 64 + ... + PPC | KVM_REG_PPC_TM_GPR31 | 64 + PPC | KVM_REG_PPC_TM_VSR0 | 128 + ... + PPC | KVM_REG_PPC_TM_VSR63 | 128 + PPC | KVM_REG_PPC_TM_CR | 64 + PPC | KVM_REG_PPC_TM_LR | 64 + PPC | KVM_REG_PPC_TM_CTR | 64 + PPC | KVM_REG_PPC_TM_FPSCR | 64 + PPC | KVM_REG_PPC_TM_AMR | 64 + PPC | KVM_REG_PPC_TM_PPR | 64 + PPC | KVM_REG_PPC_TM_VRSAVE | 64 + PPC | KVM_REG_PPC_TM_VSCR | 32 + PPC | KVM_REG_PPC_TM_DSCR | 64 + PPC | KVM_REG_PPC_TM_TAR | 64 ARM registers are mapped using the lower 32 bits. The upper 16 of that is the register group type, or coprocessor number: @@ -2304,7 +2348,31 @@ Possible features: Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). -4.83 KVM_GET_REG_LIST +4.83 KVM_ARM_PREFERRED_TARGET + +Capability: basic +Architectures: arm, arm64 +Type: vm ioctl +Parameters: struct struct kvm_vcpu_init (out) +Returns: 0 on success; -1 on error +Errors: + ENODEV: no preferred target available for the host + +This queries KVM for preferred CPU target type which can be emulated +by KVM on underlying host. + +The ioctl returns struct kvm_vcpu_init instance containing information +about preferred CPU target type and recommended features for it. The +kvm_vcpu_init->features bitmap returned will have feature bits set if +the preferred target recommends setting these features, but this is +not mandatory. + +The information returned by this ioctl can be used to prepare an instance +of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in +in VCPU matching underlying host. + + +4.84 KVM_GET_REG_LIST Capability: basic Architectures: arm, arm64 @@ -2323,8 +2391,7 @@ struct kvm_reg_list { This ioctl returns the guest registers that are supported for the KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. - -4.84 KVM_ARM_SET_DEVICE_ADDR +4.85 KVM_ARM_SET_DEVICE_ADDR Capability: KVM_CAP_ARM_SET_DEVICE_ADDR Architectures: arm, arm64 @@ -2362,7 +2429,7 @@ must be called after calling KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the base addresses will return -EEXIST. -4.85 KVM_PPC_RTAS_DEFINE_TOKEN +4.86 KVM_PPC_RTAS_DEFINE_TOKEN Capability: KVM_CAP_PPC_RTAS Architectures: ppc @@ -2661,6 +2728,77 @@ and usually define the validity of a groups of registers. (e.g. one bit }; +4.81 KVM_GET_EMULATED_CPUID + +Capability: KVM_CAP_EXT_EMUL_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 flags; + struct kvm_cpuid_entry2 entries[0]; +}; + +The member 'flags' is used for passing flags from userspace. + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are emulated by +kvm.Userspace can use the information returned by this ioctl to query +which features are emulated by kvm instead of being present natively. + +Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 +structure with the 'nent' field indicating the number of entries in +the variable-size array 'entries'. If the number of entries is too low +to describe the cpu capabilities, an error (E2BIG) is returned. If the +number is too high, the 'nent' field is adjusted and an error (ENOMEM) +is returned. If the number is just right, the 'nent' field is adjusted +to the number of valid entries in the 'entries' array, which is then +filled. + +The entries returned are the set CPUID bits of the respective features +which kvm emulates, as returned by the CPUID instruction, with unknown +or unsupported feature bits cleared. + +Features like x2apic, for example, may not be present in the host cpu +but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be +emulated efficiently and thus not included here. + +The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + + 6. Capabilities that can be enabled ----------------------------------- diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 22ff659bc0f..3c65feb8301 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -43,6 +43,13 @@ KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by || || writing to msr 0x4b564d02 ------------------------------------------------------------------------------ +KVM_FEATURE_STEAL_TIME || 5 || steal time can be enabled by + || || writing to msr 0x4b564d03. +------------------------------------------------------------------------------ +KVM_FEATURE_PV_EOI || 6 || paravirtualized end of interrupt + || || handler can be enabled by writing + || || to msr 0x4b564d04. +------------------------------------------------------------------------------ KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit || || before enabling paravirtualized || || spinlock support. diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt new file mode 100644 index 00000000000..ef51740c67c --- /dev/null +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -0,0 +1,22 @@ +VFIO virtual device +=================== + +Device types supported: + KVM_DEV_TYPE_VFIO + +Only one VFIO instance may be created per VM. The created device +tracks VFIO groups in use by the VM and features of those groups +important to the correctness and acceleration of the VM. As groups +are enabled and disabled for use by the VM, KVM should be updated +about their presence. When registered with KVM, a reference to the +VFIO-group is held by KVM. + +Groups: + KVM_DEV_VFIO_GROUP + +KVM_DEV_VFIO_GROUP attributes: + KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + +For each, kvm_device_attr.addr points to an int32_t file descriptor +for the VFIO group. diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 41b7ac9884b..f8869410d40 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -132,10 +132,14 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update(). ------------ Name: kvm_lock -Type: raw_spinlock +Type: spinlock_t Arch: any Protects: - vm_list - - hardware virtualization enable/disable + +Name: kvm_count_lock +Type: raw_spinlock_t +Arch: any +Protects: - hardware virtualization enable/disable Comment: 'raw' because hardware enabling/disabling must be atomic /wrt migration. @@ -151,3 +155,14 @@ Type: spinlock_t Arch: any Protects: -shadow page/shadow tlb entry Comment: it is a spinlock since it is used in mmu notifier. + +Name: kvm->srcu +Type: srcu lock +Arch: any +Protects: - kvm->memslots + - kvm->buses +Comment: The srcu read lock must be held while accessing memslots (e.g. + when using gfn_to_* functions) and while accessing in-kernel + MMIO/PIO address->device structure mapping (kvm->buses). + The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu + if it is needed by multiple functions. diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock new file mode 100644 index 00000000000..7521d367f21 --- /dev/null +++ b/Documentation/vm/split_page_table_lock @@ -0,0 +1,94 @@ +Split page table lock +===================== + +Originally, mm->page_table_lock spinlock protected all page tables of the +mm_struct. But this approach leads to poor page fault scalability of +multi-threaded applications due high contention on the lock. To improve +scalability, split page table lock was introduced. + +With split page table lock we have separate per-table lock to serialize +access to the table. At the moment we use split lock for PTE and PMD +tables. Access to higher level tables protected by mm->page_table_lock. + +There are helpers to lock/unlock a table and other accessor functions: + - pte_offset_map_lock() + maps pte and takes PTE table lock, returns pointer to the taken + lock; + - pte_unmap_unlock() + unlocks and unmaps PTE table; + - pte_alloc_map_lock() + allocates PTE table if needed and take the lock, returns pointer + to taken lock or NULL if allocation failed; + - pte_lockptr() + returns pointer to PTE table lock; + - pmd_lock() + takes PMD table lock, returns pointer to taken lock; + - pmd_lockptr() + returns pointer to PMD table lock; + +Split page table lock for PTE tables is enabled compile-time if +CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS. +If split lock is disabled, all tables guaded by mm->page_table_lock. + +Split page table lock for PMD tables is enabled, if it's enabled for PTE +tables and the architecture supports it (see below). + +Hugetlb and split page table lock +--------------------------------- + +Hugetlb can support several page sizes. We use split lock only for PMD +level, but not for PUD. + +Hugetlb-specific helpers: + - huge_pte_lock() + takes pmd split lock for PMD_SIZE page, mm->page_table_lock + otherwise; + - huge_pte_lockptr() + returns pointer to table lock; + +Support of split page table lock by an architecture +--------------------------------------------------- + +There's no need in special enabling of PTE split page table lock: +everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), +which must be called on PTE table allocation / freeing. + +Make sure the architecture doesn't use slab allocator for page table +allocation: slab uses page->slab_cache and page->first_page for its pages. +These fields share storage with page->ptl. + +PMD split lock only makes sense if you have more than two page table +levels. + +PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table +allocation and pgtable_pmd_page_dtor() on freeing. + +Allocation usually happens in pmd_alloc_one(), freeing in pmd_free(), but +make sure you cover all PMD table allocation / freeing paths: i.e X86_PAE +preallocate few PMDs on pgd_alloc(). + +With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. + +NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +be handled properly. + +page->ptl +--------- + +page->ptl is used to access split page table lock, where 'page' is struct +page of page containing the table. It shares storage with page->private +(and few other fields in union). + +To avoid increasing size of struct page and have best performance, we use a +trick: + - if spinlock_t fits into long, we use page->ptr as spinlock, so we + can avoid indirect access and save a cache line. + - if size of spinlock_t is bigger then size of long, we use page->ptl as + pointer to spinlock_t and allocate it dynamically. This allows to use + split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs + one more cache line for indirect access; + +The spinlock_t allocated in pgtable_page_ctor() for PTE table and in +pgtable_pmd_page_ctor() for PMD table. + +Please, never access page->ptl directly -- use appropriate helper. diff --git a/MAINTAINERS b/MAINTAINERS index f3ef1d1f602..583af4b72ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4871,7 +4871,8 @@ KERNEL VIRTUAL MACHINE (KVM) M: Gleb Natapov <gleb@redhat.com> M: Paolo Bonzini <pbonzini@redhat.com> L: kvm@vger.kernel.org -W: http://linux-kvm.org +W: http://www.linux-kvm.org +T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git S: Supported F: Documentation/*/kvm*.txt F: Documentation/virtual/kvm/ diff --git a/arch/Kconfig b/arch/Kconfig index ded747c7b74..f1cf895c040 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -207,9 +207,6 @@ config HAVE_DMA_ATTRS config HAVE_DMA_CONTIGUOUS bool -config USE_GENERIC_SMP_HELPERS - bool - config GENERIC_SMP_IDLE_THREAD bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 35a300d4a9f..8d2a4833acd 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -522,7 +522,6 @@ config ARCH_MAY_HAVE_PC_FDC config SMP bool "Symmetric multi-processing support" depends on ALPHA_SABLE || ALPHA_LYNX || ALPHA_RAWHIDE || ALPHA_DP264 || ALPHA_WILDFIRE || ALPHA_TITAN || ALPHA_GENERIC || ALPHA_SHARK || ALPHA_MARVEL - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index bc2a0daf2d9..aab14a019c2 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -72,7 +72,10 @@ pte_alloc_one(struct mm_struct *mm, unsigned long address) if (!pte) return NULL; page = virt_to_page(pte); - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } return page; } diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 5ede5460c80..2ee0c9bfd03 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -125,7 +125,6 @@ config ARC_PLAT_NEEDS_CPU_TO_DMA config SMP bool "Symmetric Multi-Processing (Incomplete)" default n - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 36a9f20c21a..81208bfd9dc 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -105,11 +105,16 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { pgtable_t pte_pg; + struct page *page; pte_pg = __get_free_pages(GFP_KERNEL | __GFP_REPEAT, __get_order_pte()); - if (pte_pg) { - memzero((void *)pte_pg, PTRS_PER_PTE * 4); - pgtable_page_ctor(virt_to_page(pte_pg)); + if (!pte_pg) + return 0; + memzero((void *)pte_pg, PTRS_PER_PTE * 4); + page = virt_to_page(pte_pg); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return 0; } return pte_pg; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 603d661b445..e089e622be7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1432,7 +1432,6 @@ config SMP depends on GENERIC_CLOCKEVENTS depends on HAVE_SMP depends on MMU || ARM_MPU - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If @@ -1863,6 +1862,12 @@ config CC_STACKPROTECTOR neutralized via a kernel panic. This feature requires gcc version 4.2 or above. +config SWIOTLB + def_bool y + +config IOMMU_HELPER + def_bool SWIOTLB + config XEN_DOM0 def_bool y depends on XEN @@ -1873,6 +1878,7 @@ config XEN depends on CPU_V7 && !CPU_V6 depends on !GENERIC_ATOMIC64 select ARM_PSCI + select SWIOTLB_XEN help Say Y if you want to run Linux in a Virtual Machine on Xen on ARM. diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 863cd84eb1a..e701a4d9aa5 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -11,17 +11,28 @@ #include <asm-generic/dma-coherent.h> #include <asm/memory.h> +#include <xen/xen.h> +#include <asm/xen/hypervisor.h> + #define DMA_ERROR_CODE (~0) extern struct dma_map_ops arm_dma_ops; extern struct dma_map_ops arm_coherent_dma_ops; -static inline struct dma_map_ops *get_dma_ops(struct device *dev) +static inline struct dma_map_ops *__generic_dma_ops(struct device *dev) { if (dev && dev->archdata.dma_ops) return dev->archdata.dma_ops; return &arm_dma_ops; } +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + if (xen_initial_domain()) + return xen_dma_ops; + else + return __generic_dma_ops(dev); +} + static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) { BUG_ON(!dev); @@ -94,6 +105,39 @@ static inline unsigned long dma_max_pfn(struct device *dev) } #define dma_max_pfn(dev) dma_max_pfn(dev) +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + unsigned int offset = paddr & ~PAGE_MASK; + return pfn_to_dma(dev, __phys_to_pfn(paddr)) + offset; +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) +{ + unsigned int offset = dev_addr & ~PAGE_MASK; + return __pfn_to_phys(dma_to_pfn(dev, dev_addr)) + offset; +} + +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + u64 limit, mask; + + if (!dev->dma_mask) + return 0; + + mask = *dev->dma_mask; + + limit = (mask + 1) & ~mask; + if (limit && size > limit) + return 0; + + if ((addr | (addr + size - 1)) & ~mask) + return 0; + + return 1; +} + +static inline void dma_mark_clean(void *addr, size_t size) { } + /* * DMA errors are defined by all-bits-set in the DMA address. */ diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index d070741b2b3..3c597c222ef 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -24,9 +24,11 @@ #ifdef __KERNEL__ #include <linux/types.h> +#include <linux/blk_types.h> #include <asm/byteorder.h> #include <asm/memory.h> #include <asm-generic/pci_iomap.h> +#include <xen/xen.h> /* * ISA I/O bus memory addresses are 1:1 with the physical address. @@ -372,6 +374,13 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr); #define BIOVEC_MERGEABLE(vec1, vec2) \ ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) +struct bio_vec; +extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, + const struct bio_vec *vec2); +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ + (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) + #ifdef CONFIG_MMU #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range(phys_addr_t addr, size_t size); diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 64e96960de2..1d3153c7eb4 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -57,6 +57,7 @@ * TSC: Trap SMC * TSW: Trap cache operations by set/way * TWI: Trap WFI + * TWE: Trap WFE * TIDCP: Trap L2CTLR/L2ECTLR * BSU_IS: Upgrade barriers to the inner shareable domain * FB: Force broadcast of all maintainance operations @@ -67,7 +68,7 @@ */ #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ - HCR_SWIO | HCR_TIDCP) + HCR_TWE | HCR_SWIO | HCR_TIDCP) #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) /* System Control Register (SCTLR) bits */ @@ -95,12 +96,12 @@ #define TTBCR_IRGN1 (3 << 24) #define TTBCR_EPD1 (1 << 23) #define TTBCR_A1 (1 << 22) -#define TTBCR_T1SZ (3 << 16) +#define TTBCR_T1SZ (7 << 16) #define TTBCR_SH0 (3 << 12) #define TTBCR_ORGN0 (3 << 10) #define TTBCR_IRGN0 (3 << 8) #define TTBCR_EPD0 (1 << 7) -#define TTBCR_T0SZ 3 +#define TTBCR_T0SZ (7 << 0) #define HTCR_MASK (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0) /* Hyp System Trap Register */ @@ -208,6 +209,8 @@ #define HSR_EC_DABT (0x24) #define HSR_EC_DABT_HYP (0x25) +#define HSR_WFI_IS_WFE (1U << 0) + #define HSR_HVC_IMM_MASK ((1UL << 16) - 1) #define HSR_DABT_S1PTW (1U << 7) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index a2f43ddcc30..661da11f76f 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -39,7 +39,7 @@ #define c6_IFAR 17 /* Instruction Fault Address Register */ #define c7_PAR 18 /* Physical Address Register */ #define c7_PAR_high 19 /* PAR top 32 bits */ -#define c9_L2CTLR 20 /* Cortex A15 L2 Control Register */ +#define c9_L2CTLR 20 /* Cortex A15/A7 L2 Control Register */ #define c10_PRRR 21 /* Primary Region Remap Register */ #define c10_NMRR 22 /* Normal Memory Remap Register */ #define c12_VBAR 23 /* Vector Base Address Register */ diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index a464e8d7b6c..0fa90c962ac 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -157,4 +157,55 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; } +static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cp15[c0_MPIDR]; +} + +static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) +{ + *vcpu_cpsr(vcpu) |= PSR_E_BIT; +} + +static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) +{ + return !!(*vcpu_cpsr(vcpu) & PSR_E_BIT); +} + +static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return be16_to_cpu(data & 0xffff); + default: + return be32_to_cpu(data); + } + } + + return data; /* Leave LE untouched */ +} + +static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return cpu_to_be16(data & 0xffff); + default: + return cpu_to_be32(data); + } + } + + return data; /* Leave LE untouched */ +} + #endif /* __ARM_KVM_EMULATE_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 7d22517d807..8a6f6db14ee 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -38,11 +38,6 @@ #define KVM_VCPU_MAX_FEATURES 1 -/* We don't currently support large pages. */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) - #include <kvm/arm_vgic.h> struct kvm_vcpu; @@ -154,6 +149,7 @@ struct kvm_vcpu_stat { struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); struct kvm_one_reg; diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 9b28c41f4ba..77de4a41cc5 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); +static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) +{ + *pmd = new_pmd; + flush_pmd_entry(pmd); +} + static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) { *pte = new_pte; @@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= L_PTE_S2_RDWR; } +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +{ + pmd_val(*pmd) |= L_PMD_S2_RDWR; +} + struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, + unsigned long size) { /* * If we are going to insert an instruction page and the icache is @@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) * need any kind of flushing (DDI 0406C.b - Page B3-1392). */ if (icache_is_pipt()) { - unsigned long hva = gfn_to_hva(kvm, gfn); - __cpuc_coherent_user_range(hva, hva + PAGE_SIZE); + __cpuc_coherent_user_range(hva, hva + size); } else if (!icache_is_vivt_asid_tagged()) { /* any kind of VIPT cache */ __flush_icache_all(); diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 943504f53f5..78a77936168 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -102,12 +102,14 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr) #else pte = alloc_pages(PGALLOC_GFP, 0); #endif - if (pte) { - if (!PageHighMem(pte)) - clean_pte_table(page_address(pte)); - pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!PageHighMem(pte)) + clean_pte_table(page_address(pte)); + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; } - return pte; } diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 39c54cfa03e..4f9503908dc 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -126,6 +126,8 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ + /* * Hyp-mode PL2 PTE definitions for LPAE. */ diff --git a/arch/arm/include/asm/xen/hypervisor.h b/arch/arm/include/asm/xen/hypervisor.h index d7ab99a0c9e..1317ee40f4d 100644 --- a/arch/arm/include/asm/xen/hypervisor.h +++ b/arch/arm/include/asm/xen/hypervisor.h @@ -16,4 +16,6 @@ static inline enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return PARAVIRT_LAZY_NONE; } +extern struct dma_map_ops *xen_dma_ops; + #endif /* _ASM_ARM_XEN_HYPERVISOR_H */ diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h new file mode 100644 index 00000000000..1109017499e --- /dev/null +++ b/arch/arm/include/asm/xen/page-coherent.h @@ -0,0 +1,50 @@ +#ifndef _ASM_ARM_XEN_PAGE_COHERENT_H +#define _ASM_ARM_XEN_PAGE_COHERENT_H + +#include <asm/page.h> +#include <linux/dma-attrs.h> +#include <linux/dma-mapping.h> + +static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) +{ + return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); +} + +static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); +} + +static inline void xen_dma_map_page(struct device *hwdev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); +} + +static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + if (__generic_dma_ops(hwdev)->unmap_page) + __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); +} + +static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) +{ + if (__generic_dma_ops(hwdev)->sync_single_for_cpu) + __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); +} + +static inline void xen_dma_sync_single_for_device(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) +{ + if (__generic_dma_ops(hwdev)->sync_single_for_device) + __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); +} +#endif /* _ASM_ARM_XEN_PAGE_COHERENT_H */ diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 359a7b50b15..75579a9d6f7 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -6,12 +6,12 @@ #include <linux/pfn.h> #include <linux/types.h> +#include <linux/dma-mapping.h> +#include <xen/xen.h> #include <xen/interface/grant_table.h> -#define pfn_to_mfn(pfn) (pfn) #define phys_to_machine_mapping_valid(pfn) (1) -#define mfn_to_pfn(mfn) (mfn) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) #define pte_mfn pte_pfn @@ -32,6 +32,38 @@ typedef struct xpaddr { #define INVALID_P2M_ENTRY (~0UL) +unsigned long __pfn_to_mfn(unsigned long pfn); +unsigned long __mfn_to_pfn(unsigned long mfn); +extern struct rb_root phys_to_mach; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + unsigned long mfn; + + if (phys_to_mach.rb_node != NULL) { + mfn = __pfn_to_mfn(pfn); + if (mfn != INVALID_P2M_ENTRY) + return mfn; + } + + return pfn; +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (phys_to_mach.rb_node != NULL) { + pfn = __mfn_to_pfn(mfn); + if (pfn != INVALID_P2M_ENTRY) + return pfn; + } + + return mfn; +} + +#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn) + static inline xmaddr_t phys_to_machine(xpaddr_t phys) { unsigned offset = phys.paddr & ~PAGE_MASK; @@ -76,11 +108,9 @@ static inline int m2p_remove_override(struct page *page, bool clear_pte) return 0; } -static inline bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; -} +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn, + unsigned long nr_pages); static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index c1ee007523d..c498b60c050 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -63,7 +63,8 @@ struct kvm_regs { /* Supported Processor Types */ #define KVM_ARM_TARGET_CORTEX_A15 0 -#define KVM_ARM_NUM_TARGETS 1 +#define KVM_ARM_TARGET_CORTEX_A7 1 +#define KVM_ARM_NUM_TARGETS 2 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index ebf5015508b..466bd299b1a 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_ARM_HOST depends on ARM_VIRT_EXT && ARM_LPAE diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index d99bee4950e..789bca9e64a 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -19,6 +19,6 @@ kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o -obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o +obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index aea7ccb8d39..2a700e00528 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -152,12 +152,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } @@ -797,6 +798,19 @@ long kvm_arch_vm_ioctl(struct file *filp, return -EFAULT; return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); } + case KVM_ARM_PREFERRED_TARGET: { + int err; + struct kvm_vcpu_init init; + + err = kvm_vcpu_preferred_target(&init); + if (err) + return err; + + if (copy_to_user(argp, &init, sizeof(init))) + return -EFAULT; + + return 0; + } default: return -EINVAL; } diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index db9cf692d4d..78c0885d650 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -71,6 +71,98 @@ int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) +{ + /* + * Compute guest MPIDR. We build a virtual cluster out of the + * vcpu_id, but we read the 'U' bit from the underlying + * hardware directly. + */ + vcpu->arch.cp15[c0_MPIDR] = ((read_cpuid_mpidr() & MPIDR_SMP_BITMASK) | + ((vcpu->vcpu_id >> 2) << MPIDR_LEVEL_BITS) | + (vcpu->vcpu_id & 3)); +} + +/* TRM entries A7:4.3.31 A15:4.3.28 - RO WI */ +static bool access_actlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c1_ACTLR]; + return true; +} + +/* TRM entries A7:4.3.56, A15:4.3.60 - R/O. */ +static bool access_cbar(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p); + return read_zero(vcpu, p); +} + +/* TRM entries A7:4.3.49, A15:4.3.48 - R/O WI */ +static bool access_l2ctlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c9_L2CTLR]; + return true; +} + +static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) +{ + u32 l2ctlr, ncores; + + asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr)); + l2ctlr &= ~(3 << 24); + ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1; + /* How many cores in the current cluster and the next ones */ + ncores -= (vcpu->vcpu_id & ~3); + /* Cap it to the maximum number of cores in a single cluster */ + ncores = min(ncores, 3U); + l2ctlr |= (ncores & 3) << 24; + + vcpu->arch.cp15[c9_L2CTLR] = l2ctlr; +} + +static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) +{ + u32 actlr; + + /* ACTLR contains SMP bit: make sure you create all cpus first! */ + asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr)); + /* Make the SMP bit consistent with the guest configuration */ + if (atomic_read(&vcpu->kvm->online_vcpus) > 1) + actlr |= 1U << 6; + else + actlr &= ~(1U << 6); + + vcpu->arch.cp15[c1_ACTLR] = actlr; +} + +/* + * TRM entries: A7:4.3.50, A15:4.3.49 + * R/O WI (even if NSACR.NS_L2ERR, a write of 1 is ignored). + */ +static bool access_l2ectlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = 0; + return true; +} + /* See note at ARM ARM B1.14.4 */ static bool access_dcsw(struct kvm_vcpu *vcpu, const struct coproc_params *p, @@ -153,10 +245,22 @@ static bool pm_fake(struct kvm_vcpu *vcpu, * registers preceding 32-bit ones. */ static const struct coproc_reg cp15_regs[] = { + /* MPIDR: we use VMPIDR for guest access. */ + { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32, + NULL, reset_mpidr, c0_MPIDR }, + /* CSSELR: swapped by interrupt.S. */ { CRn( 0), CRm( 0), Op1( 2), Op2( 0), is32, NULL, reset_unknown, c0_CSSELR }, + /* ACTLR: trapped by HCR.TAC bit. */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32, + access_actlr, reset_actlr, c1_ACTLR }, + + /* CPACR: swapped by interrupt.S. */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32, + NULL, reset_val, c1_CPACR, 0x00000000 }, + /* TTBR0/TTBR1: swapped by interrupt.S. */ { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 }, { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 }, @@ -195,6 +299,13 @@ static const struct coproc_reg cp15_regs[] = { { CRn( 7), CRm(10), Op1( 0), Op2( 2), is32, access_dcsw}, { CRn( 7), CRm(14), Op1( 0), Op2( 2), is32, access_dcsw}, /* + * L2CTLR access (guest wants to know #CPUs). + */ + { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32, + access_l2ctlr, reset_l2ctlr, c9_L2CTLR }, + { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, access_l2ectlr}, + + /* * Dummy performance monitor implementation. */ { CRn( 9), CRm(12), Op1( 0), Op2( 0), is32, access_pmcr}, @@ -234,6 +345,9 @@ static const struct coproc_reg cp15_regs[] = { /* CNTKCTL: swapped by interrupt.S. */ { CRn(14), CRm( 1), Op1( 0), Op2( 0), is32, NULL, reset_val, c14_CNTKCTL, 0x00000000 }, + + /* The Configuration Base Address Register. */ + { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar}, }; /* Target specific emulation tables */ @@ -241,6 +355,12 @@ static struct kvm_coproc_target_table *target_tables[KVM_ARM_NUM_TARGETS]; void kvm_register_target_coproc_table(struct kvm_coproc_target_table *table) { + unsigned int i; + + for (i = 1; i < table->num; i++) + BUG_ON(cmp_reg(&table->table[i-1], + &table->table[i]) >= 0); + target_tables[table->target] = table; } diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c index cf93472b9dd..bb0cac1410c 100644 --- a/arch/arm/kvm/coproc_a15.c +++ b/arch/arm/kvm/coproc_a15.c @@ -17,101 +17,12 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include <linux/kvm_host.h> -#include <asm/cputype.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_host.h> -#include <asm/kvm_emulate.h> #include <asm/kvm_coproc.h> +#include <asm/kvm_emulate.h> #include <linux/init.h> -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - /* - * Compute guest MPIDR: - * (Even if we present only one VCPU to the guest on an SMP - * host we don't set the U bit in the MPIDR, or vice versa, as - * revealing the underlying hardware properties is likely to - * be the best choice). - */ - vcpu->arch.cp15[c0_MPIDR] = (read_cpuid_mpidr() & ~MPIDR_LEVEL_MASK) - | (vcpu->vcpu_id & MPIDR_LEVEL_MASK); -} - #include "coproc.h" -/* A15 TRM 4.3.28: RO WI */ -static bool access_actlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c1_ACTLR]; - return true; -} - -/* A15 TRM 4.3.60: R/O. */ -static bool access_cbar(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return write_to_read_only(vcpu, p); - return read_zero(vcpu, p); -} - -/* A15 TRM 4.3.48: R/O WI. */ -static bool access_l2ctlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c9_L2CTLR]; - return true; -} - -static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - u32 l2ctlr, ncores; - - asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr)); - l2ctlr &= ~(3 << 24); - ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1; - l2ctlr |= (ncores & 3) << 24; - - vcpu->arch.cp15[c9_L2CTLR] = l2ctlr; -} - -static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - u32 actlr; - - /* ACTLR contains SMP bit: make sure you create all cpus first! */ - asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr)); - /* Make the SMP bit consistent with the guest configuration */ - if (atomic_read(&vcpu->kvm->online_vcpus) > 1) - actlr |= 1U << 6; - else - actlr &= ~(1U << 6); - - vcpu->arch.cp15[c1_ACTLR] = actlr; -} - -/* A15 TRM 4.3.49: R/O WI (even if NSACR.NS_L2ERR, a write of 1 is ignored). */ -static bool access_l2ectlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = 0; - return true; -} - /* * A15-specific CP15 registers. * CRn denotes the primary register number, but is copied to the CRm in the @@ -121,29 +32,9 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu, * registers preceding 32-bit ones. */ static const struct coproc_reg a15_regs[] = { - /* MPIDR: we use VMPIDR for guest access. */ - { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32, - NULL, reset_mpidr, c0_MPIDR }, - /* SCTLR: swapped by interrupt.S. */ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, NULL, reset_val, c1_SCTLR, 0x00C50078 }, - /* ACTLR: trapped by HCR.TAC bit. */ - { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32, - access_actlr, reset_actlr, c1_ACTLR }, - /* CPACR: swapped by interrupt.S. */ - { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32, - NULL, reset_val, c1_CPACR, 0x00000000 }, - - /* - * L2CTLR access (guest wants to know #CPUs). - */ - { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32, - access_l2ctlr, reset_l2ctlr, c9_L2CTLR }, - { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, access_l2ectlr}, - - /* The Configuration Base Address Register. */ - { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar}, }; static struct kvm_coproc_target_table a15_target_table = { @@ -154,12 +45,6 @@ static struct kvm_coproc_target_table a15_target_table = { static int __init coproc_a15_init(void) { - unsigned int i; - - for (i = 1; i < ARRAY_SIZE(a15_regs); i++) - BUG_ON(cmp_reg(&a15_regs[i-1], - &a15_regs[i]) >= 0); - kvm_register_target_coproc_table(&a15_target_table); return 0; } diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c new file mode 100644 index 00000000000..1df76733158 --- /dev/null +++ b/arch/arm/kvm/coproc_a7.c @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Copyright (C) 2013 - ARM Ltd + * + * Authors: Rusty Russell <rusty@rustcorp.au> + * Christoffer Dall <c.dall@virtualopensystems.com> + * Jonathan Austin <jonathan.austin@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include <linux/kvm_host.h> +#include <asm/kvm_coproc.h> +#include <asm/kvm_emulate.h> +#include <linux/init.h> + +#include "coproc.h" + +/* + * Cortex-A7 specific CP15 registers. + * CRn denotes the primary register number, but is copied to the CRm in the + * user space API for 64-bit register access in line with the terminology used + * in the ARM ARM. + * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 64-bit + * registers preceding 32-bit ones. + */ +static const struct coproc_reg a7_regs[] = { + /* SCTLR: swapped by interrupt.S. */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, + NULL, reset_val, c1_SCTLR, 0x00C50878 }, +}; + +static struct kvm_coproc_target_table a7_target_table = { + .target = KVM_ARM_TARGET_CORTEX_A7, + .table = a7_regs, + .num = ARRAY_SIZE(a7_regs), +}; + +static int __init coproc_a7_init(void) +{ + kvm_register_target_coproc_table(&a7_target_table); + return 0; +} +late_initcall(coproc_a7_init); diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index bdede9e7da5..d6c00528367 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -354,7 +354,7 @@ static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr) *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; if (is_pabt) { - /* Set DFAR and DFSR */ + /* Set IFAR and IFSR */ vcpu->arch.cp15[c6_IFAR] = addr; is_lpae = (vcpu->arch.cp15[c2_TTBCR] >> 31); /* Always give debug fault for now - should give guest a clue */ diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 152d0361218..20f8d97904a 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -190,6 +190,8 @@ int __attribute_const__ kvm_target_cpu(void) return -EINVAL; switch (part_number) { + case ARM_CPU_PART_CORTEX_A7: + return KVM_ARM_TARGET_CORTEX_A7; case ARM_CPU_PART_CORTEX_A15: return KVM_ARM_TARGET_CORTEX_A15; default: @@ -202,7 +204,7 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, { unsigned int i; - /* We can only do a cortex A15 for now. */ + /* We can only cope with guest==host and only on A15/A7 (for now). */ if (init->target != kvm_target_cpu()) return -EINVAL; @@ -222,6 +224,26 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, return kvm_reset_vcpu(vcpu); } +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) +{ + int target = kvm_target_cpu(); + + if (target < 0) + return -ENODEV; + + memset(init, 0, sizeof(*init)); + + /* + * For now, we don't return any features. + * In future, we might use features to return target + * specific features available for the preferred + * target type. + */ + init->target = (__u32)target; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index df4c82d47ad..a92079011a8 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -73,23 +73,29 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) } /** - * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest + * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests * @vcpu: the vcpu pointer * @run: the kvm_run structure pointer * - * Simply sets the wait_for_interrupts flag on the vcpu structure, which will - * halt execution of world-switches and schedule other host processes until - * there is an incoming IRQ or FIQ to the VM. + * WFE: Yield the CPU and come back to this vcpu when the scheduler + * decides to. + * WFI: Simply call kvm_vcpu_block(), which will halt execution of + * world-switches and schedule other host processes until there is an + * incoming IRQ or FIQ to the VM. */ -static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { trace_kvm_wfi(*vcpu_pc(vcpu)); - kvm_vcpu_block(vcpu); + if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) + kvm_vcpu_on_spin(vcpu); + else + kvm_vcpu_block(vcpu); + return 1; } static exit_handle_fn arm_exit_handlers[] = { - [HSR_EC_WFI] = kvm_handle_wfi, + [HSR_EC_WFI] = kvm_handle_wfx, [HSR_EC_CP15_32] = kvm_handle_cp15_32, [HSR_EC_CP15_64] = kvm_handle_cp15_64, [HSR_EC_CP14_MR] = kvm_handle_cp14_access, diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 0c25d9487d5..4cb5a93182e 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -23,6 +23,68 @@ #include "trace.h" +static void mmio_write_buf(char *buf, unsigned int len, unsigned long data) +{ + void *datap = NULL; + union { + u8 byte; + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + tmp.byte = data; + datap = &tmp.byte; + break; + case 2: + tmp.hword = data; + datap = &tmp.hword; + break; + case 4: + tmp.word = data; + datap = &tmp.word; + break; + case 8: + tmp.dword = data; + datap = &tmp.dword; + break; + } + + memcpy(buf, datap, len); +} + +static unsigned long mmio_read_buf(char *buf, unsigned int len) +{ + unsigned long data = 0; + union { + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + data = buf[0]; + break; + case 2: + memcpy(&tmp.hword, buf, len); + data = tmp.hword; + break; + case 4: + memcpy(&tmp.word, buf, len); + data = tmp.word; + break; + case 8: + memcpy(&tmp.dword, buf, len); + data = tmp.dword; + break; + } + + return data; +} + /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * @vcpu: The VCPU pointer @@ -33,28 +95,27 @@ */ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) { - unsigned long *dest; + unsigned long data; unsigned int len; int mask; if (!run->mmio.is_write) { - dest = vcpu_reg(vcpu, vcpu->arch.mmio_decode.rt); - *dest = 0; - len = run->mmio.len; if (len > sizeof(unsigned long)) return -EINVAL; - memcpy(dest, run->mmio.data, len); - - trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - *((u64 *)run->mmio.data)); + data = mmio_read_buf(run->mmio.data, len); if (vcpu->arch.mmio_decode.sign_extend && len < sizeof(unsigned long)) { mask = 1U << ((len * 8) - 1); - *dest = (*dest ^ mask) - mask; + data = (data ^ mask) - mask; } + + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, + data); + data = vcpu_data_host_to_guest(vcpu, data, len); + *vcpu_reg(vcpu, vcpu->arch.mmio_decode.rt) = data; } return 0; @@ -105,6 +166,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa) { struct kvm_exit_mmio mmio; + unsigned long data; unsigned long rt; int ret; @@ -125,13 +187,15 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, } rt = vcpu->arch.mmio_decode.rt; + data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), mmio.len); + trace_kvm_mmio((mmio.is_write) ? KVM_TRACE_MMIO_WRITE : KVM_TRACE_MMIO_READ_UNSATISFIED, mmio.len, fault_ipa, - (mmio.is_write) ? *vcpu_reg(vcpu, rt) : 0); + (mmio.is_write) ? data : 0); if (mmio.is_write) - memcpy(mmio.data, vcpu_reg(vcpu, rt), mmio.len); + mmio_write_buf(mmio.data, mmio.len, data); if (vgic_handle_mmio(vcpu, run, &mmio)) return 1; diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b0de86b56c1..371958370de 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -19,6 +19,7 @@ #include <linux/mman.h> #include <linux/kvm_host.h> #include <linux/io.h> +#include <linux/hugetlb.h> #include <trace/events/kvm.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> @@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; +#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) + static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { /* @@ -93,19 +96,29 @@ static bool page_empty(void *ptr) static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table = pmd_offset(pud, 0); - pud_clear(pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd_free(NULL, pmd_table); + if (pud_huge(*pud)) { + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + pmd_t *pmd_table = pmd_offset(pud, 0); + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pmd_free(NULL, pmd_table); + } put_page(virt_to_page(pud)); } static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { - pte_t *pte_table = pte_offset_kernel(pmd, 0); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pte_free_kernel(NULL, pte_table); + if (kvm_pmd_huge(*pmd)) { + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + pte_t *pte_table = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pte_free_kernel(NULL, pte_table); + } put_page(virt_to_page(pmd)); } @@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, continue; } + if (pud_huge(*pud)) { + /* + * If we are dealing with a huge pud, just clear it and + * move on. + */ + clear_pud_entry(kvm, pud, addr); + addr = pud_addr_end(addr, end); + continue; + } + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { addr = pmd_addr_end(addr, end); continue; } - pte = pte_offset_kernel(pmd, addr); - clear_pte_entry(kvm, pte, addr); - next = addr + PAGE_SIZE; + if (!kvm_pmd_huge(*pmd)) { + pte = pte_offset_kernel(pmd, addr); + clear_pte_entry(kvm, pte, addr); + next = addr + PAGE_SIZE; + } - /* If we emptied the pte, walk back up the ladder */ - if (page_empty(pte)) { + /* + * If the pmd entry is to be cleared, walk back up the ladder + */ + if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); next = pmd_addr_end(addr, end); if (page_empty(pmd) && !page_empty(pud)) { @@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm) kvm->arch.pgd = NULL; } - -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte, old_pte; - /* Create 2nd stage page table mapping - Level 1 */ pgd = kvm->arch.pgd + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ + return NULL; pmd = mmu_memory_cache_alloc(cache); pud_populate(NULL, pud, pmd); get_page(virt_to_page(pud)); } - pmd = pmd_offset(pud, addr); + return pmd_offset(pud, addr); +} + +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache + *cache, phys_addr_t addr, const pmd_t *new_pmd) +{ + pmd_t *pmd, old_pmd; + + pmd = stage2_get_pmd(kvm, cache, addr); + VM_BUG_ON(!pmd); + + /* + * Mapping in huge pages should only happen through a fault. If a + * page is merged into a transparent huge page, the individual + * subpages of that huge page should be unmapped through MMU + * notifiers before we get here. + * + * Merging of CompoundPages is not supported; they should become + * splitting first, unmapped, merged, and mapped back in on-demand. + */ + VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); + + old_pmd = *pmd; + kvm_set_pmd(pmd, *new_pmd); + if (pmd_present(old_pmd)) + kvm_tlb_flush_vmid_ipa(kvm, addr); + else + get_page(virt_to_page(pmd)); + return 0; +} + +static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pte_t *new_pte, bool iomap) +{ + pmd_t *pmd; + pte_t *pte, old_pte; - /* Create 2nd stage page table mapping - Level 2 */ + /* Create stage-2 page table mapping - Level 1 */ + pmd = stage2_get_pmd(kvm, cache, addr); + if (!pmd) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ @@ -507,16 +576,60 @@ out: return ret; } +static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *ipap >> PAGE_SHIFT; + + if (PageTransCompound(pfn_to_page(pfn))) { + unsigned long mask; + /* + * The address we faulted on is backed by a transparent huge + * page. However, because we map the compound huge page and + * not the individual tail page, we need to transfer the + * refcount to the head page. We have to be careful that the + * THP doesn't start to split while we are adjusting the + * refcounts. + * + * We are sure this doesn't happen, because mmu_notifier_retry + * was successful and we are holding the mmu_lock, so if this + * THP is trying to split, it will be blocked in the mmu + * notifier before touching any of the pages, specifically + * before being able to call __split_huge_page_refcount(). + * + * We can therefore safely transfer the refcount from PG_tail + * to PG_head and switch the pfn from a tail page to the head + * page accordingly. + */ + mask = PTRS_PER_PMD - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + *ipap &= PMD_MASK; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); + *pfnp = pfn; + } + + return true; + } + + return false; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - gfn_t gfn, struct kvm_memory_slot *memslot, + struct kvm_memory_slot *memslot, unsigned long fault_status) { - pte_t new_pte; - pfn_t pfn; int ret; - bool write_fault, writable; + bool write_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; + gfn_t gfn = fault_ipa >> PAGE_SHIFT; + unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); + struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + struct vm_area_struct *vma; + pfn_t pfn; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM && !write_fault) { @@ -524,6 +637,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + /* Let's check if we will get back a huge page backed by hugetlbfs */ + down_read(¤t->mm->mmap_sem); + vma = find_vma_intersection(current->mm, hva, hva + 1); + if (is_vm_hugetlb_page(vma)) { + hugetlb = true; + gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; + } else { + /* + * Pages belonging to VMAs not aligned to the PMD mapping + * granularity cannot be mapped using block descriptors even + * if the pages belong to a THP for the process, because the + * stage-2 block descriptor will cover more than a single THP + * and we loose atomicity for unmapping, updates, and splits + * of the THP or other pages in the stage-2 block range. + */ + if (vma->vm_start & ~PMD_MASK) + force_pte = true; + } + up_read(¤t->mm->mmap_sem); + /* We need minimum second+third level pages */ ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); if (ret) @@ -541,26 +674,40 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); if (is_error_pfn(pfn)) return -EFAULT; - new_pte = pfn_pte(pfn, PAGE_S2); - coherent_icache_guest_page(vcpu->kvm, gfn); - - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (writable) { - kvm_set_s2pte_writable(&new_pte); - kvm_set_pfn_dirty(pfn); + if (!hugetlb && !force_pte) + hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); + + if (hugetlb) { + pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); + new_pmd = pmd_mkhuge(new_pmd); + if (writable) { + kvm_set_s2pmd_writable(&new_pmd); + kvm_set_pfn_dirty(pfn); + } + coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE); + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); + } else { + pte_t new_pte = pfn_pte(pfn, PAGE_S2); + if (writable) { + kvm_set_s2pte_writable(&new_pte); + kvm_set_pfn_dirty(pfn); + } + coherent_icache_guest_page(kvm, hva, PAGE_SIZE); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); } - stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false); + out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + spin_unlock(&kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return ret; } /** @@ -629,7 +776,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) memslot = gfn_to_memslot(vcpu->kvm, gfn); - ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); if (ret == 0) ret = 1; out_unlock: diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 86a693a02ba..0881bf169fb 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -18,6 +18,7 @@ #include <linux/kvm_host.h> #include <linux/wait.h> +#include <asm/cputype.h> #include <asm/kvm_emulate.h> #include <asm/kvm_psci.h> @@ -34,22 +35,30 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu = NULL, *tmp; wait_queue_head_t *wq; unsigned long cpu_id; + unsigned long mpidr; phys_addr_t target_pc; + int i; cpu_id = *vcpu_reg(source_vcpu, 1); if (vcpu_mode_is_32bit(source_vcpu)) cpu_id &= ~((u32) 0); - if (cpu_id >= atomic_read(&kvm->online_vcpus)) + kvm_for_each_vcpu(i, tmp, kvm) { + mpidr = kvm_vcpu_get_mpidr(tmp); + if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) { + vcpu = tmp; + break; + } + } + + if (!vcpu) return KVM_PSCI_RET_INVAL; target_pc = *vcpu_reg(source_vcpu, 2); - vcpu = kvm_get_vcpu(kvm, cpu_id); - wq = kvm_arch_vcpu_wq(vcpu); if (!waitqueue_active(wq)) return KVM_PSCI_RET_INVAL; @@ -62,6 +71,10 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) vcpu_set_thumb(vcpu); } + /* Propagate caller endianness */ + if (kvm_vcpu_is_be(source_vcpu)) + kvm_vcpu_set_be(vcpu); + *vcpu_pc(vcpu) = target_pc; vcpu->arch.pause = false; smp_mb(); /* Make sure the above is visible */ diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index c02ba4af599..f558c073c02 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -30,16 +30,14 @@ #include <kvm/arm_arch_timer.h> /****************************************************************************** - * Cortex-A15 Reset Values + * Cortex-A15 and Cortex-A7 Reset Values */ -static const int a15_max_cpu_idx = 3; - -static struct kvm_regs a15_regs_reset = { +static struct kvm_regs cortexa_regs_reset = { .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, }; -static const struct kvm_irq_level a15_vtimer_irq = { +static const struct kvm_irq_level cortexa_vtimer_irq = { { .irq = 27 }, .level = 1, }; @@ -62,12 +60,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) const struct kvm_irq_level *cpu_vtimer_irq; switch (vcpu->arch.target) { + case KVM_ARM_TARGET_CORTEX_A7: case KVM_ARM_TARGET_CORTEX_A15: - if (vcpu->vcpu_id > a15_max_cpu_idx) - return -EINVAL; - reset_regs = &a15_regs_reset; + reset_regs = &cortexa_regs_reset; vcpu->arch.midr = read_cpuid_id(); - cpu_vtimer_irq = &a15_vtimer_irq; + cpu_vtimer_irq = &cortexa_vtimer_irq; break; default: return -ENODEV; diff --git a/arch/arm/mach-tegra/apbio.c b/arch/arm/mach-tegra/apbio.c index d7aa52ea6cf..bc471973cf0 100644 --- a/arch/arm/mach-tegra/apbio.c +++ b/arch/arm/mach-tegra/apbio.c @@ -114,7 +114,7 @@ static int do_dma_transfer(unsigned long apb_add, dma_desc->callback = apb_dma_complete; dma_desc->callback_param = NULL; - INIT_COMPLETION(tegra_apb_wait); + reinit_completion(&tegra_apb_wait); dmaengine_submit(dma_desc); dma_async_issue_pending(tegra_apb_dma_chan); diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 2a5907b5c8d..ff379ac115d 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -65,7 +65,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS /* * If we are using split PTE locks, then we need to take the page * lock here. Otherwise we are using shared mm->page_table_lock @@ -84,10 +84,10 @@ static inline void do_pte_unlock(spinlock_t *ptl) { spin_unlock(ptl); } -#else /* !USE_SPLIT_PTLOCKS */ +#else /* !USE_SPLIT_PTE_PTLOCKS */ static inline void do_pte_lock(spinlock_t *ptl) {} static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, unsigned long pfn) diff --git a/arch/arm/xen/Makefile b/arch/arm/xen/Makefile index 43841033afd..12969523414 100644 --- a/arch/arm/xen/Makefile +++ b/arch/arm/xen/Makefile @@ -1 +1 @@ -obj-y := enlighten.o hypercall.o grant-table.o +obj-y := enlighten.o hypercall.o grant-table.o p2m.o mm.o diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c new file mode 100644 index 00000000000..b0e77de9914 --- /dev/null +++ b/arch/arm/xen/mm.c @@ -0,0 +1,65 @@ +#include <linux/bootmem.h> +#include <linux/gfp.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/dma-mapping.h> +#include <linux/vmalloc.h> +#include <linux/swiotlb.h> + +#include <xen/xen.h> +#include <xen/interface/memory.h> +#include <xen/swiotlb-xen.h> + +#include <asm/cacheflush.h> +#include <asm/xen/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/interface.h> + +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle) +{ + if (!xen_initial_domain()) + return -EINVAL; + + /* we assume that dom0 is mapped 1:1 for now */ + *dma_handle = pstart; + return 0; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) +{ + return; +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + +struct dma_map_ops *xen_dma_ops; +EXPORT_SYMBOL_GPL(xen_dma_ops); + +static struct dma_map_ops xen_swiotlb_dma_ops = { + .mapping_error = xen_swiotlb_dma_mapping_error, + .alloc = xen_swiotlb_alloc_coherent, + .free = xen_swiotlb_free_coherent, + .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, + .sync_single_for_device = xen_swiotlb_sync_single_for_device, + .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, + .map_sg = xen_swiotlb_map_sg_attrs, + .unmap_sg = xen_swiotlb_unmap_sg_attrs, + .map_page = xen_swiotlb_map_page, + .unmap_page = xen_swiotlb_unmap_page, + .dma_supported = xen_swiotlb_dma_supported, + .set_dma_mask = xen_swiotlb_set_dma_mask, +}; + +int __init xen_mm_init(void) +{ + if (!xen_initial_domain()) + return 0; + xen_swiotlb_init(1, false); + xen_dma_ops = &xen_swiotlb_dma_ops; + return 0; +} +arch_initcall(xen_mm_init); diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c new file mode 100644 index 00000000000..23732cdff55 --- /dev/null +++ b/arch/arm/xen/p2m.c @@ -0,0 +1,208 @@ +#include <linux/bootmem.h> +#include <linux/gfp.h> +#include <linux/export.h> +#include <linux/rwlock.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/dma-mapping.h> +#include <linux/vmalloc.h> +#include <linux/swiotlb.h> + +#include <xen/xen.h> +#include <xen/interface/memory.h> +#include <xen/swiotlb-xen.h> + +#include <asm/cacheflush.h> +#include <asm/xen/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/interface.h> + +struct xen_p2m_entry { + unsigned long pfn; + unsigned long mfn; + unsigned long nr_pages; + struct rb_node rbnode_mach; + struct rb_node rbnode_phys; +}; + +rwlock_t p2m_lock; +struct rb_root phys_to_mach = RB_ROOT; +static struct rb_root mach_to_phys = RB_ROOT; + +static int xen_add_phys_to_mach_entry(struct xen_p2m_entry *new) +{ + struct rb_node **link = &phys_to_mach.rb_node; + struct rb_node *parent = NULL; + struct xen_p2m_entry *entry; + int rc = 0; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct xen_p2m_entry, rbnode_phys); + + if (new->mfn == entry->mfn) + goto err_out; + if (new->pfn == entry->pfn) + goto err_out; + + if (new->pfn < entry->pfn) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + rb_link_node(&new->rbnode_phys, parent, link); + rb_insert_color(&new->rbnode_phys, &phys_to_mach); + goto out; + +err_out: + rc = -EINVAL; + pr_warn("%s: cannot add pfn=%pa -> mfn=%pa: pfn=%pa -> mfn=%pa already exists\n", + __func__, &new->pfn, &new->mfn, &entry->pfn, &entry->mfn); +out: + return rc; +} + +unsigned long __pfn_to_mfn(unsigned long pfn) +{ + struct rb_node *n = phys_to_mach.rb_node; + struct xen_p2m_entry *entry; + unsigned long irqflags; + + read_lock_irqsave(&p2m_lock, irqflags); + while (n) { + entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys); + if (entry->pfn <= pfn && + entry->pfn + entry->nr_pages > pfn) { + read_unlock_irqrestore(&p2m_lock, irqflags); + return entry->mfn + (pfn - entry->pfn); + } + if (pfn < entry->pfn) + n = n->rb_left; + else + n = n->rb_right; + } + read_unlock_irqrestore(&p2m_lock, irqflags); + + return INVALID_P2M_ENTRY; +} +EXPORT_SYMBOL_GPL(__pfn_to_mfn); + +static int xen_add_mach_to_phys_entry(struct xen_p2m_entry *new) +{ + struct rb_node **link = &mach_to_phys.rb_node; + struct rb_node *parent = NULL; + struct xen_p2m_entry *entry; + int rc = 0; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct xen_p2m_entry, rbnode_mach); + + if (new->mfn == entry->mfn) + goto err_out; + if (new->pfn == entry->pfn) + goto err_out; + + if (new->mfn < entry->mfn) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + rb_link_node(&new->rbnode_mach, parent, link); + rb_insert_color(&new->rbnode_mach, &mach_to_phys); + goto out; + +err_out: + rc = -EINVAL; + pr_warn("%s: cannot add pfn=%pa -> mfn=%pa: pfn=%pa -> mfn=%pa already exists\n", + __func__, &new->pfn, &new->mfn, &entry->pfn, &entry->mfn); +out: + return rc; +} + +unsigned long __mfn_to_pfn(unsigned long mfn) +{ + struct rb_node *n = mach_to_phys.rb_node; + struct xen_p2m_entry *entry; + unsigned long irqflags; + + read_lock_irqsave(&p2m_lock, irqflags); + while (n) { + entry = rb_entry(n, struct xen_p2m_entry, rbnode_mach); + if (entry->mfn <= mfn && + entry->mfn + entry->nr_pages > mfn) { + read_unlock_irqrestore(&p2m_lock, irqflags); + return entry->pfn + (mfn - entry->mfn); + } + if (mfn < entry->mfn) + n = n->rb_left; + else + n = n->rb_right; + } + read_unlock_irqrestore(&p2m_lock, irqflags); + + return INVALID_P2M_ENTRY; +} +EXPORT_SYMBOL_GPL(__mfn_to_pfn); + +bool __set_phys_to_machine_multi(unsigned long pfn, + unsigned long mfn, unsigned long nr_pages) +{ + int rc; + unsigned long irqflags; + struct xen_p2m_entry *p2m_entry; + struct rb_node *n = phys_to_mach.rb_node; + + if (mfn == INVALID_P2M_ENTRY) { + write_lock_irqsave(&p2m_lock, irqflags); + while (n) { + p2m_entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys); + if (p2m_entry->pfn <= pfn && + p2m_entry->pfn + p2m_entry->nr_pages > pfn) { + rb_erase(&p2m_entry->rbnode_mach, &mach_to_phys); + rb_erase(&p2m_entry->rbnode_phys, &phys_to_mach); + write_unlock_irqrestore(&p2m_lock, irqflags); + kfree(p2m_entry); + return true; + } + if (pfn < p2m_entry->pfn) + n = n->rb_left; + else + n = n->rb_right; + } + write_unlock_irqrestore(&p2m_lock, irqflags); + return true; + } + + p2m_entry = kzalloc(sizeof(struct xen_p2m_entry), GFP_NOWAIT); + if (!p2m_entry) { + pr_warn("cannot allocate xen_p2m_entry\n"); + return false; + } + p2m_entry->pfn = pfn; + p2m_entry->nr_pages = nr_pages; + p2m_entry->mfn = mfn; + + write_lock_irqsave(&p2m_lock, irqflags); + if ((rc = xen_add_phys_to_mach_entry(p2m_entry) < 0) || + (rc = xen_add_mach_to_phys_entry(p2m_entry) < 0)) { + write_unlock_irqrestore(&p2m_lock, irqflags); + return false; + } + write_unlock_irqrestore(&p2m_lock, irqflags); + return true; +} +EXPORT_SYMBOL_GPL(__set_phys_to_machine_multi); + +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + return __set_phys_to_machine_multi(pfn, mfn, 1); +} +EXPORT_SYMBOL_GPL(__set_phys_to_machine); + +int p2m_init(void) +{ + rwlock_init(&p2m_lock); + return 0; +} +arch_initcall(p2m_init); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bb0bf1bfc05..88c8b6c1341 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -143,7 +143,6 @@ config CPU_BIG_ENDIAN config SMP bool "Symmetric Multi-Processing" - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you say N here, the kernel will run on single and @@ -221,6 +220,7 @@ config XEN_DOM0 config XEN bool "Xen guest support on ARM64 (EXPERIMENTAL)" depends on ARM64 && OF + select SWIOTLB_XEN help Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64. diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 8d1810001ae..fd0c0c0e447 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -23,11 +23,15 @@ #include <asm-generic/dma-coherent.h> +#include <xen/xen.h> +#include <asm/xen/hypervisor.h> + #define ARCH_HAS_DMA_GET_REQUIRED_MASK +#define DMA_ERROR_CODE (~(dma_addr_t)0) extern struct dma_map_ops *dma_ops; -static inline struct dma_map_ops *get_dma_ops(struct device *dev) +static inline struct dma_map_ops *__generic_dma_ops(struct device *dev) { if (unlikely(!dev) || !dev->archdata.dma_ops) return dma_ops; @@ -35,6 +39,14 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return dev->archdata.dma_ops; } +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + if (xen_initial_domain()) + return xen_dma_ops; + else + return __generic_dma_ops(dev); +} + #include <asm-generic/dma-mapping-common.h> static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index b56e5b5df88..4cc813eddac 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -22,11 +22,14 @@ #ifdef __KERNEL__ #include <linux/types.h> +#include <linux/blk_types.h> #include <asm/byteorder.h> #include <asm/barrier.h> #include <asm/pgtable.h> +#include <xen/xen.h> + /* * Generic IO read/write. These perform native-endian accesses. */ @@ -263,5 +266,12 @@ extern int devmem_is_allowed(unsigned long pfn); */ #define xlate_dev_kmem_ptr(p) p +struct bio_vec; +extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, + const struct bio_vec *vec2); +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ + (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) + #endif /* __KERNEL__ */ #endif /* __ASM_IO_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index a5f28e2720c..c98ef4771c7 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -63,6 +63,7 @@ * TAC: Trap ACTLR * TSC: Trap SMC * TSW: Trap cache operations by set/way + * TWE: Trap WFE * TWI: Trap WFI * TIDCP: Trap L2CTLR/L2ECTLR * BSU_IS: Upgrade barriers to the inner shareable domain @@ -72,8 +73,9 @@ * FMO: Override CPSR.F and enable signaling with VF * SWIO: Turn set/way invalidates into set/way clean+invalidate */ -#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ - HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ +#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ + HCR_BSU_IS | HCR_FB | HCR_TAC | \ + HCR_AMO | HCR_IMO | HCR_FMO | \ HCR_SWIO | HCR_TIDCP | HCR_RW) #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) @@ -242,4 +244,6 @@ #define ESR_EL2_EC_xABT_xFSR_EXTABT 0x10 +#define ESR_EL2_EC_WFI_ISS_WFE (1 << 0) + #endif /* __ARM64_KVM_ARM_H__ */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index eec07387521..dd8ecfc3f99 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -177,4 +177,65 @@ static inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & ESR_EL2_FSC_TYPE; } +static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) +{ + return vcpu_sys_reg(vcpu, MPIDR_EL1); +} + +static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) +{ + if (vcpu_mode_is_32bit(vcpu)) + *vcpu_cpsr(vcpu) |= COMPAT_PSR_E_BIT; + else + vcpu_sys_reg(vcpu, SCTLR_EL1) |= (1 << 25); +} + +static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) +{ + if (vcpu_mode_is_32bit(vcpu)) + return !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_E_BIT); + + return !!(vcpu_sys_reg(vcpu, SCTLR_EL1) & (1 << 25)); +} + +static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return be16_to_cpu(data & 0xffff); + case 4: + return be32_to_cpu(data & 0xffffffff); + default: + return be64_to_cpu(data); + } + } + + return data; /* Leave LE untouched */ +} + +static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return cpu_to_be16(data & 0xffff); + case 4: + return cpu_to_be32(data & 0xffffffff); + default: + return cpu_to_be64(data); + } + } + + return data; /* Leave LE untouched */ +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0859a4ddd1e..5d85a02d123 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -36,11 +36,6 @@ #define KVM_VCPU_MAX_FEATURES 2 -/* We don't currently support large pages. */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) - struct kvm_vcpu; int kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -151,6 +146,7 @@ struct kvm_vcpu_stat { struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); struct kvm_one_reg; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index efe609c6a3c..680f74e6749 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -91,6 +91,7 @@ int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) +#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) static inline bool kvm_is_write_fault(unsigned long esr) { @@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= PTE_S2_RDWR; } +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +{ + pmd_val(*pmd) |= PMD_S2_RDWR; +} + struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, + unsigned long size) { if (!icache_is_aliasing()) { /* PIPT */ - unsigned long hva = gfn_to_hva(kvm, gfn); - flush_icache_range(hva, hva + PAGE_SIZE); + flush_icache_range(hva, hva + size); } else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */ /* any kind of VIPT cache */ __flush_icache_all(); diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index f214069ec5d..9bea6e74a00 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -63,9 +63,12 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr) struct page *pte; pte = alloc_pages(PGALLOC_GFP, 0); - if (pte) - pgtable_page_ctor(pte); - + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d57e66845c8..755f8614332 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -85,6 +85,8 @@ #define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ #define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ + /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h new file mode 100644 index 00000000000..2820f1a6eeb --- /dev/null +++ b/arch/arm64/include/asm/xen/page-coherent.h @@ -0,0 +1,47 @@ +#ifndef _ASM_ARM64_XEN_PAGE_COHERENT_H +#define _ASM_ARM64_XEN_PAGE_COHERENT_H + +#include <asm/page.h> +#include <linux/dma-attrs.h> +#include <linux/dma-mapping.h> + +static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) +{ + return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); +} + +static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); +} + +static inline void xen_dma_map_page(struct device *hwdev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); +} + +static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); +} + +static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) +{ + __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); +} + +static inline void xen_dma_sync_single_for_device(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) +{ + __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); +} +#endif /* _ASM_ARM64_XEN_PAGE_COHERENT_H */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 21e90820bd2..4480ab339a0 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM select MMU_NOTIFIER select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_ARM_HOST select KVM_ARM_VGIC diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2c3ff67a8ec..3f0731e5327 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -248,6 +248,26 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, return kvm_reset_vcpu(vcpu); } +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) +{ + int target = kvm_target_cpu(); + + if (target < 0) + return -ENODEV; + + memset(init, 0, sizeof(*init)); + + /* + * For now, we don't return any features. + * In future, we might use features to return target + * specific features available for the preferred + * target type. + */ + init->target = (__u32)target; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 9beaca03343..8da56067c30 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -47,21 +47,29 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) } /** - * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest + * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event + * instruction executed by a guest + * * @vcpu: the vcpu pointer * - * Simply call kvm_vcpu_block(), which will halt execution of + * WFE: Yield the CPU and come back to this vcpu when the scheduler + * decides to. + * WFI: Simply call kvm_vcpu_block(), which will halt execution of * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. */ -static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { - kvm_vcpu_block(vcpu); + if (kvm_vcpu_get_hsr(vcpu) & ESR_EL2_EC_WFI_ISS_WFE) + kvm_vcpu_on_spin(vcpu); + else + kvm_vcpu_block(vcpu); + return 1; } static exit_handle_fn arm_exit_handlers[] = { - [ESR_EL2_EC_WFI] = kvm_handle_wfi, + [ESR_EL2_EC_WFI] = kvm_handle_wfx, [ESR_EL2_EC_CP15_32] = kvm_handle_cp15_32, [ESR_EL2_EC_CP15_64] = kvm_handle_cp15_64, [ESR_EL2_EC_CP14_MR] = kvm_handle_cp14_access, diff --git a/arch/arm64/xen/Makefile b/arch/arm64/xen/Makefile index be240404ba9..74a8d87e542 100644 --- a/arch/arm64/xen/Makefile +++ b/arch/arm64/xen/Makefile @@ -1,2 +1,2 @@ -xen-arm-y += $(addprefix ../../arm/xen/, enlighten.o grant-table.o) +xen-arm-y += $(addprefix ../../arm/xen/, enlighten.o grant-table.o p2m.o mm.o) obj-y := xen-arm.o hypercall.o diff --git a/arch/avr32/include/asm/pgalloc.h b/arch/avr32/include/asm/pgalloc.h index bc7e8ae479e..1aba19d68c5 100644 --- a/arch/avr32/include/asm/pgalloc.h +++ b/arch/avr32/include/asm/pgalloc.h @@ -68,7 +68,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, return NULL; page = virt_to_page(pg); - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + quicklist_free(QUICK_PT, NULL, pg); + return NULL; + } return page; } diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index e887b57c317..9ceccef9c64 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -34,7 +34,6 @@ config BLACKFIN select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_ATOMIC64 select GENERIC_IRQ_PROBE - select USE_GENERIC_SMP_HELPERS if SMP select HAVE_NMI_WATCHDOG if NMI_WATCHDOG select GENERIC_SMP_IDLE_THREAD select ARCH_USES_GETTIMEOFFSET if !GENERIC_CLOCKEVENTS diff --git a/arch/cris/include/asm/pgalloc.h b/arch/cris/include/asm/pgalloc.h index 6da975db112..235ece437dd 100644 --- a/arch/cris/include/asm/pgalloc.h +++ b/arch/cris/include/asm/pgalloc.h @@ -32,7 +32,12 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addres { struct page *pte; pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); - pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c index f6084bc524e..41907d25ed3 100644 --- a/arch/frv/mm/pgalloc.c +++ b/arch/frv/mm/pgalloc.c @@ -37,11 +37,15 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) #else page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); #endif - if (page) { - clear_highpage(page); - pgtable_page_ctor(page); - flush_dcache_page(page); + if (!page) + return NULL; + + clear_highpage(page); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; } + flush_dcache_page(page); return page; } diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 99041b07e61..09df2608f40 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -4,7 +4,6 @@ comment "Linux Kernel Configuration for Hexagon" config HEXAGON def_bool y select HAVE_OPROFILE - select USE_GENERIC_SMP_HELPERS if SMP # Other pending projects/to-do items. # select HAVE_REGS_AND_STACK_ACCESS_API # select HAVE_HW_BREAKPOINT if PERF_EVENTS diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 679bf6d6648..4c9d382d779 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -65,10 +65,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, struct page *pte; pte = alloc_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); - - if (pte) - pgtable_page_ctor(pte); - + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 7740ab10a17..dfe85e92ca2 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -343,7 +343,6 @@ config FORCE_MAX_ZONEORDER config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, say N. If you have a system with more diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 989dd3fe8de..db95f570705 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -234,10 +234,6 @@ struct kvm_vm_data { #define KVM_REQ_PTC_G 32 #define KVM_REQ_RESUME 33 -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) 1 - struct kvm; struct kvm_vcpu; @@ -480,7 +476,7 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; - int iommu_flags; + bool iommu_noncoherent; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index 96a8d927db2..5767cdfc08d 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -91,7 +91,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) if (!pg) return NULL; page = virt_to_page(pg); - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + quicklist_free(0, NULL, pg); + return NULL; + } return page; } diff --git a/arch/ia64/include/asm/xen/page-coherent.h b/arch/ia64/include/asm/xen/page-coherent.h new file mode 100644 index 00000000000..96e42f97fa1 --- /dev/null +++ b/arch/ia64/include/asm/xen/page-coherent.h @@ -0,0 +1,38 @@ +#ifndef _ASM_IA64_XEN_PAGE_COHERENT_H +#define _ASM_IA64_XEN_PAGE_COHERENT_H + +#include <asm/page.h> +#include <linux/dma-attrs.h> +#include <linux/dma-mapping.h> + +static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) +{ + void *vstart = (void*)__get_free_pages(flags, get_order(size)); + *dma_handle = virt_to_phys(vstart); + return vstart; +} + +static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +static inline void xen_dma_map_page(struct device *hwdev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { } + +static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { } + +static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) { } + +static inline void xen_dma_sync_single_for_device(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) { } + +#endif /* _ASM_IA64_XEN_PAGE_COHERENT_H */ diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bdfd8789b37..985bf80c622 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1550,12 +1550,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 75661fbf452..09ef94a8a7c 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -275,7 +275,6 @@ source "kernel/Kconfig.preempt" config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/m32r/include/asm/pgalloc.h b/arch/m32r/include/asm/pgalloc.h index 0fc73619897..2d55a064cca 100644 --- a/arch/m32r/include/asm/pgalloc.h +++ b/arch/m32r/include/asm/pgalloc.h @@ -43,7 +43,12 @@ static __inline__ pgtable_t pte_alloc_one(struct mm_struct *mm, { struct page *pte = alloc_page(GFP_KERNEL|__GFP_ZERO); - pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 313f3dd23cd..f9924fbcfe4 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -56,6 +56,10 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, if (!page) return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } pte = kmap(page); if (pte) { diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index 2f02f264e69..24bcba496c7 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -29,18 +29,22 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + struct page *page; pte_t *pte; + page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); if(!page) return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } pte = kmap(page); __flush_page_to_ram(pte); flush_tlb_kernel_page(pte); nocache_page(pte); kunmap(page); - pgtable_page_ctor(page); return page; } diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 48d80d5a666..f868506e335 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -59,7 +59,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, return NULL; clear_highpage(page); - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } return page; } diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index 36368eb07e1..e56abd2c1b4 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -111,7 +111,6 @@ config METAG_META21 config SMP bool "Symmetric multi-processing support" depends on METAG_META21 && METAG_META21_MMU - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one thread running Linux. If you have a system with only one thread running Linux, diff --git a/arch/metag/include/asm/pgalloc.h b/arch/metag/include/asm/pgalloc.h index 275d9285141..3104df0a482 100644 --- a/arch/metag/include/asm/pgalloc.h +++ b/arch/metag/include/asm/pgalloc.h @@ -52,8 +52,12 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, { struct page *pte; pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0); - if (pte) - pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index ebd35792482..7fdf7fabc7d 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -122,8 +122,13 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, #endif ptepage = alloc_pages(flags, 0); - if (ptepage) - clear_highpage(ptepage); + if (!ptepage) + return NULL; + clear_highpage(ptepage); + if (!pgtable_page_ctor(ptepage)) { + __free_page(ptepage); + return NULL; + } return ptepage; } @@ -158,8 +163,9 @@ extern inline void pte_free_slow(struct page *ptepage) __free_page(ptepage); } -extern inline void pte_free(struct mm_struct *mm, struct page *ptepage) +static inline void pte_free(struct mm_struct *mm, struct page *ptepage) { + pgtable_page_dtor(ptepage); __free_page(ptepage); } diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 17cc7ff8458..867d7db1158 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2125,7 +2125,6 @@ source "mm/Kconfig" config SMP bool "Multi-Processing support" depends on SYS_SUPPORTS_SMP - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 4d6fa0bf130..32966969f2f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -27,13 +27,6 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 -/* Don't support huge pages */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 - -/* We don't currently support large pages. */ -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) 1 - /* Special address that contains the comm page, used for reducing # of traps */ diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 881d18b4e29..b336037e876 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -80,9 +80,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, struct page *pte; pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER); - if (pte) { - clear_highpage(pte); - pgtable_page_ctor(pte); + if (!pte) + return NULL; + clear_highpage(pte); + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; } return pte; } diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c index a7b044536de..73b34827826 100644 --- a/arch/mips/kvm/kvm_mips.c +++ b/arch/mips/kvm/kvm_mips.c @@ -198,12 +198,13 @@ kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return -ENOIOCTLCMD; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 6aaa1607001..8bde9237d13 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -181,7 +181,6 @@ endmenu config SMP bool "Symmetric multi-processing support" default y - select USE_GENERIC_SMP_HELPERS depends on MN10300_PROC_MN2WS0038 || MN10300_PROC_MN2WS0050 ---help--- This enables support for systems with more than one CPU. If you have diff --git a/arch/mn10300/include/asm/pgalloc.h b/arch/mn10300/include/asm/pgalloc.h index 146bacf193e..0f25d5fa86f 100644 --- a/arch/mn10300/include/asm/pgalloc.h +++ b/arch/mn10300/include/asm/pgalloc.h @@ -46,6 +46,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, struct page *pte) { + pgtable_page_dtor(pte); __free_page(pte); } diff --git a/arch/mn10300/mm/pgtable.c b/arch/mn10300/mm/pgtable.c index bd9ada693f9..e77a7c72808 100644 --- a/arch/mn10300/mm/pgtable.c +++ b/arch/mn10300/mm/pgtable.c @@ -78,8 +78,13 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) #else pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); #endif - if (pte) - clear_highpage(pte); + if (!pte) + return NULL; + clear_highpage(pte); + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 05c39ecd2ef..21484e5b9e9 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -78,8 +78,13 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, { struct page *pte; pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); - if (pte) - clear_page(page_address(pte)); + if (!pte) + return NULL; + clear_page(page_address(pte)); + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } @@ -90,6 +95,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, struct page *pte) { + pgtable_page_dtor(pte); __free_page(pte); } diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 7dcde539d61..c03567a9a91 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -226,7 +226,6 @@ endchoice config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index fc987a1c12a..f213f5b4c42 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -121,8 +121,12 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (page) - pgtable_page_ctor(page); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } return page; } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2f898d63eb9..4740b0a15fa 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -106,7 +106,6 @@ config PPC select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG - select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK select GENERIC_ATOMIC64 if PPC32 diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 9b198d1b3b2..856f8deb557 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst) return inst & 0xffff; } +static inline unsigned int get_oc(u32 inst) +{ + return (inst >> 11) & 0x7fff; +} #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index cca12f08484..894662a5d4d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -198,12 +198,27 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) cmpwi r10,0; \ bne do_kvm_##n +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +/* + * If hv is possible, interrupts come into to the hv version + * of the kvmppc_interrupt code, which then jumps to the PR handler, + * kvmppc_interrupt_pr, if the guest is a PR guest. + */ +#define kvmppc_interrupt kvmppc_interrupt_hv +#else +#define kvmppc_interrupt kvmppc_interrupt_pr +#endif + #define __KVM_HANDLER(area, h, n) \ do_kvm_##n: \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ std r10,HSTATE_CFAR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947); \ + BEGIN_FTR_SECTION_NESTED(948) \ + ld r10,area+EX_PPR(r13); \ + std r10,HSTATE_PPR(r13); \ + END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r10,area+EX_R10(r13); \ stw r9,HSTATE_SCRATCH1(r13); \ ld r9,area+EX_R9(r13); \ @@ -217,6 +232,10 @@ do_kvm_##n: \ ld r10,area+EX_R10(r13); \ beq 89f; \ stw r9,HSTATE_SCRATCH1(r13); \ + BEGIN_FTR_SECTION_NESTED(948) \ + ld r9,area+EX_PPR(r13); \ + std r9,HSTATE_PPR(r13); \ + END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r9,area+EX_R9(r13); \ std r12,HSTATE_SCRATCH0(r13); \ li r12,n; \ @@ -236,7 +255,7 @@ do_kvm_##n: \ #define KVM_HANDLER_SKIP(area, h, n) #endif -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE #define KVMTEST_PR(n) __KVMTEST(n) #define KVM_HANDLER_PR(area, h, n) __KVM_HANDLER(area, h, n) #define KVM_HANDLER_PR_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 851bac7afa4..1bd92fd43cf 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -123,6 +123,8 @@ #define BOOK3S_HFLAG_SLB 0x2 #define BOOK3S_HFLAG_PAIRED_SINGLE 0x4 #define BOOK3S_HFLAG_NATIVE_PS 0x8 +#define BOOK3S_HFLAG_MULTI_PGSIZE 0x10 +#define BOOK3S_HFLAG_NEW_TLBIE 0x20 #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ @@ -136,6 +138,8 @@ #define KVM_GUEST_MODE_NONE 0 #define KVM_GUEST_MODE_GUEST 1 #define KVM_GUEST_MODE_SKIP 2 +#define KVM_GUEST_MODE_GUEST_HV 3 +#define KVM_GUEST_MODE_HOST_HV 4 #define KVM_INST_FETCH_FAILED -1 diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index fa19e2f1a87..4a594b76674 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -58,16 +58,18 @@ struct hpte_cache { struct hlist_node list_pte_long; struct hlist_node list_vpte; struct hlist_node list_vpte_long; +#ifdef CONFIG_PPC_BOOK3S_64 + struct hlist_node list_vpte_64k; +#endif struct rcu_head rcu_head; u64 host_vpn; u64 pfn; ulong slot; struct kvmppc_pte pte; + int pagesize; }; struct kvmppc_vcpu_book3s { - struct kvm_vcpu vcpu; - struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct { u64 esid; @@ -99,6 +101,9 @@ struct kvmppc_vcpu_book3s { struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; +#ifdef CONFIG_PPC_BOOK3S_64 + struct hlist_head hpte_hash_vpte_64k[HPTEG_HASH_NUM_VPTE_64K]; +#endif int hpte_cache_count; spinlock_t mmu_lock; }; @@ -107,8 +112,9 @@ struct kvmppc_vcpu_book3s { #define CONTEXT_GUEST 1 #define CONTEXT_GUEST_END 2 -#define VSID_REAL 0x0fffffffffc00000ULL -#define VSID_BAT 0x0fffffffffb00000ULL +#define VSID_REAL 0x07ffffffffc00000ULL +#define VSID_BAT 0x07ffffffffb00000ULL +#define VSID_64K 0x0800000000000000ULL #define VSID_1T 0x1000000000000000ULL #define VSID_REAL_DR 0x2000000000000000ULL #define VSID_REAL_IR 0x4000000000000000ULL @@ -118,11 +124,12 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask) extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask); extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end); extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr); -extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr); extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu); -extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); +extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte, + bool iswrite); +extern void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size); extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); @@ -134,6 +141,7 @@ extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); +extern void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte); extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu); extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte); @@ -151,7 +159,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper, u32 val); extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); -extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, + bool *writable); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, @@ -172,6 +181,8 @@ extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, unsigned long *hpret); extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, + unsigned long mask); extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); @@ -184,11 +195,9 @@ extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd); static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { - return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu); + return vcpu->arch.book3s; } -extern void kvm_return_point(void); - /* Also add subarch specific defines */ #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -198,203 +207,6 @@ extern void kvm_return_point(void); #include <asm/kvm_book3s_64.h> #endif -#ifdef CONFIG_KVM_BOOK3S_PR - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - return to_book3s(vcpu)->hior; -} - -static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, - unsigned long pending_now, unsigned long old_pending) -{ - if (pending_now) - vcpu->arch.shared->int_pending = 1; - else if (old_pending) - vcpu->arch.shared->int_pending = 0; -} - -static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) -{ - if ( num < 14 ) { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->gpr[num] = val; - svcpu_put(svcpu); - to_book3s(vcpu)->shadow_vcpu->gpr[num] = val; - } else - vcpu->arch.gpr[num] = val; -} - -static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) -{ - if ( num < 14 ) { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r = svcpu->gpr[num]; - svcpu_put(svcpu); - return r; - } else - return vcpu->arch.gpr[num]; -} - -static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->cr = val; - svcpu_put(svcpu); - to_book3s(vcpu)->shadow_vcpu->cr = val; -} - -static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - r = svcpu->cr; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->xer = val; - to_book3s(vcpu)->shadow_vcpu->xer = val; - svcpu_put(svcpu); -} - -static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - r = svcpu->xer; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->ctr = val; - svcpu_put(svcpu); -} - -static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->ctr; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->lr = val; - svcpu_put(svcpu); -} - -static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->lr; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->pc = val; - svcpu_put(svcpu); -} - -static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->pc; - svcpu_put(svcpu); - return r; -} - -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - ulong pc = kvmppc_get_pc(vcpu); - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - - /* Load the instruction manually if it failed to do so in the - * exit path */ - if (svcpu->last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); - - r = svcpu->last_inst; - svcpu_put(svcpu); - return r; -} - -/* - * Like kvmppc_get_last_inst(), but for fetching a sc instruction. - * Because the sc instruction sets SRR0 to point to the following - * instruction, we have to fetch from pc - 4. - */ -static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) -{ - ulong pc = kvmppc_get_pc(vcpu) - 4; - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - - /* Load the instruction manually if it failed to do so in the - * exit path */ - if (svcpu->last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); - - r = svcpu->last_inst; - svcpu_put(svcpu); - return r; -} - -static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->fault_dar; - svcpu_put(svcpu); - return r; -} - -static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) -{ - ulong crit_raw = vcpu->arch.shared->critical; - ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); - bool crit; - - /* Truncate crit indicators in 32 bit mode */ - if (!(vcpu->arch.shared->msr & MSR_SF)) { - crit_raw &= 0xffffffff; - crit_r1 &= 0xffffffff; - } - - /* Critical section when crit == r1 */ - crit = (crit_raw == crit_r1); - /* ... and we're in supervisor mode */ - crit = crit && !(vcpu->arch.shared->msr & MSR_PR); - - return crit; -} -#else /* CONFIG_KVM_BOOK3S_PR */ - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, - unsigned long pending_now, unsigned long old_pending) -{ -} - static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { vcpu->arch.gpr[num] = val; @@ -489,12 +301,6 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) return vcpu->arch.fault_dar; } -static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) -{ - return false; -} -#endif - /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/include/asm/kvm_book3s_32.h b/arch/powerpc/include/asm/kvm_book3s_32.h index ce0ef6ce8f8..c720e0b3238 100644 --- a/arch/powerpc/include/asm/kvm_book3s_32.h +++ b/arch/powerpc/include/asm/kvm_book3s_32.h @@ -22,7 +22,7 @@ static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { - return to_book3s(vcpu)->shadow_vcpu; + return vcpu->arch.shadow_vcpu; } static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 86d638a3b35..bf0fa8b0a88 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -20,7 +20,7 @@ #ifndef __ASM_KVM_BOOK3S_64_H__ #define __ASM_KVM_BOOK3S_64_H__ -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { preempt_disable(); @@ -35,7 +35,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #define SPAPR_TCE_SHIFT 12 -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ extern unsigned long kvm_rma_pages; #endif @@ -278,7 +278,7 @@ static inline int is_vrma_hpte(unsigned long hpte_v) (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))); } -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* * Note modification of an HPTE; set the HPTE modified bit * if anyone is interested. @@ -289,6 +289,6 @@ static inline void note_hpte_modification(struct kvm *kvm, if (atomic_read(&kvm->arch.hpte_mod_interest)) rev->guest_rpte |= HPTE_GR_MODIFIED; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 9039d3c97ee..0bd9348a4db 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -83,7 +83,7 @@ struct kvmppc_host_state { u8 restore_hid5; u8 napping; -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE u8 hwthread_req; u8 hwthread_state; u8 host_ipi; @@ -101,6 +101,7 @@ struct kvmppc_host_state { #endif #ifdef CONFIG_PPC_BOOK3S_64 u64 cfar; + u64 ppr; #endif }; @@ -108,14 +109,14 @@ struct kvmppc_book3s_shadow_vcpu { ulong gpr[14]; u32 cr; u32 xer; - - u32 fault_dsisr; - u32 last_inst; ulong ctr; ulong lr; ulong pc; + ulong shadow_srr1; ulong fault_dar; + u32 fault_dsisr; + u32 last_inst; #ifdef CONFIG_PPC_BOOK3S_32 u32 sr[16]; /* Guest SRs */ diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index d3c1eb34c98..dd8f61510df 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -26,7 +26,12 @@ /* LPIDs we support with this build -- runtime limit may be lower */ #define KVMPPC_NR_LPIDS 64 -#define KVMPPC_INST_EHPRIV 0x7c00021c +#define KVMPPC_INST_EHPRIV 0x7c00021c +#define EHPRIV_OC_SHIFT 11 +/* "ehpriv 1" : ehpriv with OC = 1 is used for debug emulation */ +#define EHPRIV_OC_DEBUG 1 +#define KVMPPC_INST_EHPRIV_DEBUG (KVMPPC_INST_EHPRIV | \ + (EHPRIV_OC_DEBUG << EHPRIV_OC_SHIFT)) static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 33283532e9d..237d1d25b44 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -63,20 +63,17 @@ extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); #endif -/* We don't currently support large pages. */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 #define HPTEG_HASH_BITS_VPTE 13 #define HPTEG_HASH_BITS_VPTE_LONG 5 +#define HPTEG_HASH_BITS_VPTE_64K 11 #define HPTEG_HASH_NUM_PTE (1 << HPTEG_HASH_BITS_PTE) #define HPTEG_HASH_NUM_PTE_LONG (1 << HPTEG_HASH_BITS_PTE_LONG) #define HPTEG_HASH_NUM_VPTE (1 << HPTEG_HASH_BITS_VPTE) #define HPTEG_HASH_NUM_VPTE_LONG (1 << HPTEG_HASH_BITS_VPTE_LONG) +#define HPTEG_HASH_NUM_VPTE_64K (1 << HPTEG_HASH_BITS_VPTE_64K) /* Physical Address Mask - allowed range of real mode RAM access */ #define KVM_PAM 0x0fffffffffffffffULL @@ -89,6 +86,9 @@ struct lppaca; struct slb_shadow; struct dtl_entry; +struct kvmppc_vcpu_book3s; +struct kvmppc_book3s_shadow_vcpu; + struct kvm_vm_stat { u32 remote_tlb_flush; }; @@ -224,15 +224,15 @@ struct revmap_entry { #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch_memory_slot { -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned long *rmap; unsigned long *slot_phys; -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ }; struct kvm_arch { unsigned int lpid; -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned long hpt_virt; struct revmap_entry *revmap; unsigned int host_lpid; @@ -256,7 +256,10 @@ struct kvm_arch { cpumask_t need_tlb_flush; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; int hpt_cma_alloc; -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + struct mutex hpt_mutex; +#endif #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; struct list_head rtas_tokens; @@ -267,6 +270,7 @@ struct kvm_arch { #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; #endif + struct kvmppc_ops *kvm_ops; }; /* @@ -294,6 +298,10 @@ struct kvmppc_vcore { u64 stolen_tb; u64 preempt_tb; struct kvm_vcpu *runner; + u64 tb_offset; /* guest timebase - host timebase */ + ulong lpcr; + u32 arch_compat; + ulong pcr; }; #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) @@ -328,6 +336,7 @@ struct kvmppc_pte { bool may_read : 1; bool may_write : 1; bool may_execute : 1; + u8 page_size; /* MMU_PAGE_xxx */ }; struct kvmppc_mmu { @@ -340,7 +349,8 @@ struct kvmppc_mmu { /* book3s */ void (*mtsrin)(struct kvm_vcpu *vcpu, u32 srnum, ulong value); u32 (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum); - int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data); + int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data, bool iswrite); void (*reset_msr)(struct kvm_vcpu *vcpu); void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large); int (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); @@ -360,6 +370,7 @@ struct kvmppc_slb { bool large : 1; /* PTEs are 16MB */ bool tb : 1; /* 1TB segment */ bool class : 1; + u8 base_page_size; /* MMU_PAGE_xxx */ }; # ifdef CONFIG_PPC_FSL_BOOK3E @@ -377,17 +388,6 @@ struct kvmppc_slb { #define KVMPPC_EPR_USER 1 /* exit to userspace to fill EPR */ #define KVMPPC_EPR_KERNEL 2 /* in-kernel irqchip */ -struct kvmppc_booke_debug_reg { - u32 dbcr0; - u32 dbcr1; - u32 dbcr2; -#ifdef CONFIG_KVM_E500MC - u32 dbcr4; -#endif - u64 iac[KVMPPC_BOOKE_MAX_IAC]; - u64 dac[KVMPPC_BOOKE_MAX_DAC]; -}; - #define KVMPPC_IRQ_DEFAULT 0 #define KVMPPC_IRQ_MPIC 1 #define KVMPPC_IRQ_XICS 2 @@ -402,6 +402,10 @@ struct kvm_vcpu_arch { int slb_max; /* 1 + index of last valid entry in slb[] */ int slb_nr; /* total number of entries in SLB */ struct kvmppc_mmu mmu; + struct kvmppc_vcpu_book3s *book3s; +#endif +#ifdef CONFIG_PPC_BOOK3S_32 + struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; #endif ulong gpr[32]; @@ -463,6 +467,8 @@ struct kvm_vcpu_arch { u32 ctrl; ulong dabr; ulong cfar; + ulong ppr; + ulong shadow_srr1; #endif u32 vrsave; /* also USPRG0 */ u32 mmucr; @@ -498,6 +504,8 @@ struct kvm_vcpu_arch { u64 mmcr[3]; u32 pmc[8]; + u64 siar; + u64 sdar; #ifdef CONFIG_KVM_EXIT_TIMING struct mutex exit_timing_lock; @@ -531,7 +539,10 @@ struct kvm_vcpu_arch { u32 eptcfg; u32 epr; u32 crit_save; - struct kvmppc_booke_debug_reg dbg_reg; + /* guest debug registers*/ + struct debug_reg dbg_reg; + /* hardware visible debug registers when in guest state */ + struct debug_reg shadow_dbg_reg; #endif gpa_t paddr_accessed; gva_t vaddr_accessed; @@ -582,7 +593,7 @@ struct kvm_vcpu_arch { struct kvmppc_icp *icp; /* XICS presentation controller */ #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE struct kvm_vcpu_arch_shared shregs; unsigned long pgfault_addr; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index b15554a26c2..c8317fbf92c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -106,13 +106,6 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu); extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu); - -extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int op, int *advance); -extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, - ulong val); -extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, - ulong *val); extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu); extern int kvmppc_booke_init(void); @@ -135,17 +128,17 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); -extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, - struct kvm_allocate_rma *rma); extern struct kvm_rma_info *kvm_alloc_rma(void); extern void kvm_release_rma(struct kvm_rma_info *ri); extern struct page *kvm_alloc_hpt(unsigned long nr_pages); extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); -extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free, +extern void kvmppc_core_free_memslot(struct kvm *kvm, + struct kvm_memory_slot *free, struct kvm_memory_slot *dont); -extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, +extern int kvmppc_core_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -177,6 +170,72 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq); extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq); +union kvmppc_one_reg { + u32 wval; + u64 dval; + vector128 vval; + u64 vsxval[2]; + struct { + u64 addr; + u64 length; + } vpaval; +}; + +struct kvmppc_ops { + struct module *owner; + int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); + int (*set_one_reg)(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr); + int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id); + void (*vcpu_free)(struct kvm_vcpu *vcpu); + int (*check_requests)(struct kvm_vcpu *vcpu); + int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log); + void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot); + int (*prepare_memory_region)(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem); + void (*commit_memory_region)(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old); + int (*unmap_hva)(struct kvm *kvm, unsigned long hva); + int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, + unsigned long end); + int (*age_hva)(struct kvm *kvm, unsigned long hva); + int (*test_age_hva)(struct kvm *kvm, unsigned long hva); + void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); + void (*mmu_destroy)(struct kvm_vcpu *vcpu); + void (*free_memslot)(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); + int (*create_memslot)(struct kvm_memory_slot *slot, + unsigned long npages); + int (*init_vm)(struct kvm *kvm); + void (*destroy_vm)(struct kvm *kvm); + int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info); + int (*emulate_op)(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); + int (*emulate_mtspr)(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); + int (*emulate_mfspr)(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); + void (*fast_vcpu_kick)(struct kvm_vcpu *vcpu); + long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl, + unsigned long arg); + +}; + +extern struct kvmppc_ops *kvmppc_hv_ops; +extern struct kvmppc_ops *kvmppc_pr_ops; + +static inline bool is_kvmppc_hv_enabled(struct kvm *kvm) +{ + return kvm->arch.kvm_ops == kvmppc_hv_ops; +} + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. @@ -210,17 +269,6 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value) return r; } -union kvmppc_one_reg { - u32 wval; - u64 dval; - vector128 vval; - u64 vsxval[2]; - struct { - u64 addr; - u64 length; - } vpaval; -}; - #define one_reg_size(id) \ (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -245,10 +293,10 @@ union kvmppc_one_reg { __v; \ }) -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); -void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); @@ -260,7 +308,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); struct openpic; -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE extern void kvm_cma_reserve(void) __init; static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) { @@ -269,10 +317,10 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) static inline u32 kvmppc_get_xics_latch(void) { - u32 xirr = get_paca()->kvm_hstate.saved_xirr; + u32 xirr; + xirr = get_paca()->kvm_hstate.saved_xirr; get_paca()->kvm_hstate.saved_xirr = 0; - return xirr; } @@ -281,7 +329,10 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) paca[cpu].kvm_hstate.host_ipi = host_ipi; } -extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu); +static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->fast_vcpu_kick(vcpu); +} #else static inline void __init kvm_cma_reserve(void) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index a5954cebbc5..b6ea9e068c1 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -166,7 +166,7 @@ struct paca_struct { struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ #ifdef CONFIG_KVM_BOOK3S_HANDLER -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE /* We use this to store guest state in */ struct kvmppc_book3s_shadow_vcpu shadow_vcpu; #endif diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b09bd..16cb92d215d 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -91,7 +91,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, if (!pte) return NULL; page = virt_to_page(pte); - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } return page; } diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 7794b2b04eb..fc14a38c7cc 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -208,6 +208,7 @@ struct debug_reg { struct thread_struct { unsigned long ksp; /* Kernel stack pointer */ + #ifdef CONFIG_PPC64 unsigned long ksp_vsid; #endif @@ -221,6 +222,7 @@ struct thread_struct { void *pgdir; /* root of page-table tree */ unsigned long ksp_limit; /* if ksp <= ksp_limit stack overflow */ #endif + /* Debug Registers */ struct debug_reg debug; struct thread_fp_state fp_state; struct thread_fp_state *fp_save_area; diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 0156702ba24..576ad88104c 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -40,7 +40,7 @@ #define _PAGE_U1 0x010000 #define _PAGE_U0 0x020000 #define _PAGE_ACCESSED 0x040000 -#define _PAGE_LENDIAN 0x080000 +#define _PAGE_ENDIAN 0x080000 #define _PAGE_GUARDED 0x100000 #define _PAGE_COHERENT 0x200000 /* M: enforce memory coherence */ #define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 126f6e98f84..5c45787d551 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -248,6 +248,7 @@ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_TBWL 0x11C /* Time Base Lower Register (super, R/W) */ #define SPRN_TBWU 0x11D /* Time Base Upper Register (super, R/W) */ +#define SPRN_TBU40 0x11E /* Timebase upper 40 bits (hyper, R/W) */ #define SPRN_SPURR 0x134 /* Scaled PURR */ #define SPRN_HSPRG0 0x130 /* Hypervisor Scratch 0 */ #define SPRN_HSPRG1 0x131 /* Hypervisor Scratch 1 */ @@ -288,6 +289,7 @@ #define LPCR_ISL (1ul << (63-2)) #define LPCR_VC_SH (63-2) #define LPCR_DPFD_SH (63-11) +#define LPCR_DPFD (7ul << LPCR_DPFD_SH) #define LPCR_VRMASD (0x1ful << (63-16)) #define LPCR_VRMA_L (1ul << (63-12)) #define LPCR_VRMA_LP0 (1ul << (63-15)) @@ -304,6 +306,7 @@ #define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */ #define LPCR_MER 0x00000800 /* Mediated External Exception */ #define LPCR_MER_SH 11 +#define LPCR_TC 0x00000200 /* Translation control */ #define LPCR_LPES 0x0000000c #define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */ #define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */ @@ -316,6 +319,10 @@ #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ #define SPRN_HMER 0x150 /* Hardware m? error recovery */ #define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ +#define SPRN_PCR 0x152 /* Processor compatibility register */ +#define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ +#define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ +#define PCR_ARCH_205 0x2 /* Architecture 2.05 */ #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ #define SPRN_TLBINDEXR 0x154 /* P7 TLB control register */ #define SPRN_TLBVPNR 0x155 /* P7 TLB control register */ @@ -425,6 +432,7 @@ #define HID4_RMLS2_SH (63 - 2) /* Real mode limit bottom 2 bits */ #define HID4_LPID5_SH (63 - 6) /* partition ID bottom 4 bits */ #define HID4_RMOR_SH (63 - 22) /* real mode offset (16 bits) */ +#define HID4_RMOR (0xFFFFul << HID4_RMOR_SH) #define HID4_LPES1 (1 << (63-57)) /* LPAR env. sel. bit 1 */ #define HID4_RMLS0_SH (63 - 58) /* Real mode limit top bit */ #define HID4_LPID1_SH 0 /* partition ID top 2 bits */ @@ -1107,6 +1115,13 @@ #define PVR_BE 0x0070 #define PVR_PA6T 0x0090 +/* "Logical" PVR values defined in PAPR, representing architecture levels */ +#define PVR_ARCH_204 0x0f000001 +#define PVR_ARCH_205 0x0f000002 +#define PVR_ARCH_206 0x0f000003 +#define PVR_ARCH_206p 0x0f100003 +#define PVR_ARCH_207 0x0f000004 + /* Macros for setting and retrieving special purpose registers */ #ifndef __ASSEMBLY__ #define mfmsr() ({unsigned long rval; \ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 0fb1a6e9ff9..6836ec79a83 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -27,6 +27,7 @@ #define __KVM_HAVE_PPC_SMT #define __KVM_HAVE_IRQCHIP #define __KVM_HAVE_IRQ_LINE +#define __KVM_HAVE_GUEST_DEBUG struct kvm_regs { __u64 pc; @@ -269,7 +270,24 @@ struct kvm_fpu { __u64 fpr[32]; }; +/* + * Defines for h/w breakpoint, watchpoint (read, write or both) and + * software breakpoint. + * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status" + * for KVM_DEBUG_EXIT. + */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) +#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) struct kvm_debug_exit_arch { + __u64 address; + /* + * exiting to userspace because of h/w breakpoint, watchpoint + * (read, write or both) and software breakpoint. + */ + __u32 status; + __u32 reserved; }; /* for KVM_SET_GUEST_DEBUG */ @@ -281,10 +299,6 @@ struct kvm_guest_debug_arch { * Type denotes h/w breakpoint, read watchpoint, write * watchpoint or watchpoint (both read and write). */ -#define KVMPPC_DEBUG_NONE 0x0 -#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) -#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) -#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) __u32 type; __u32 reserved; } bp[16]; @@ -429,6 +443,11 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10) #define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11) #define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12) +#define KVM_REG_PPC_MMCR2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13) +#define KVM_REG_PPC_MMCRS (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x14) +#define KVM_REG_PPC_SIAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x15) +#define KVM_REG_PPC_SDAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x16) +#define KVM_REG_PPC_SIER (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x17) #define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18) #define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19) @@ -499,6 +518,65 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a) #define KVM_REG_PPC_EPTCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b) +/* Timebase offset */ +#define KVM_REG_PPC_TB_OFFSET (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9c) + +/* POWER8 registers */ +#define KVM_REG_PPC_SPMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9d) +#define KVM_REG_PPC_SPMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9e) +#define KVM_REG_PPC_IAMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9f) +#define KVM_REG_PPC_TFHAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa0) +#define KVM_REG_PPC_TFIAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa1) +#define KVM_REG_PPC_TEXASR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa2) +#define KVM_REG_PPC_FSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa3) +#define KVM_REG_PPC_PSPB (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xa4) +#define KVM_REG_PPC_EBBHR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa5) +#define KVM_REG_PPC_EBBRR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa6) +#define KVM_REG_PPC_BESCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7) +#define KVM_REG_PPC_TAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8) +#define KVM_REG_PPC_DPDES (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9) +#define KVM_REG_PPC_DAWR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa) +#define KVM_REG_PPC_DAWRX (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab) +#define KVM_REG_PPC_CIABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac) +#define KVM_REG_PPC_IC (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad) +#define KVM_REG_PPC_VTB (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae) +#define KVM_REG_PPC_CSIGR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaf) +#define KVM_REG_PPC_TACR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb0) +#define KVM_REG_PPC_TCSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1) +#define KVM_REG_PPC_PID (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2) +#define KVM_REG_PPC_ACOP (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3) + +#define KVM_REG_PPC_VRSAVE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4) +#define KVM_REG_PPC_LPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5) +#define KVM_REG_PPC_PPR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb6) + +/* Architecture compatibility level */ +#define KVM_REG_PPC_ARCH_COMPAT (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7) + +/* Transactional Memory checkpointed state: + * This is all GPRs, all VSX regs and a subset of SPRs + */ +#define KVM_REG_PPC_TM (KVM_REG_PPC | 0x80000000) +/* TM GPRs */ +#define KVM_REG_PPC_TM_GPR0 (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0) +#define KVM_REG_PPC_TM_GPR(n) (KVM_REG_PPC_TM_GPR0 + (n)) +#define KVM_REG_PPC_TM_GPR31 (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x1f) +/* TM VSX */ +#define KVM_REG_PPC_TM_VSR0 (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x20) +#define KVM_REG_PPC_TM_VSR(n) (KVM_REG_PPC_TM_VSR0 + (n)) +#define KVM_REG_PPC_TM_VSR63 (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x5f) +/* TM SPRS */ +#define KVM_REG_PPC_TM_CR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x60) +#define KVM_REG_PPC_TM_LR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x61) +#define KVM_REG_PPC_TM_CTR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x62) +#define KVM_REG_PPC_TM_FPSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x63) +#define KVM_REG_PPC_TM_AMR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x64) +#define KVM_REG_PPC_TM_PPR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x65) +#define KVM_REG_PPC_TM_VRSAVE (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x66) +#define KVM_REG_PPC_TM_VSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67) +#define KVM_REG_PPC_TM_DSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68) +#define KVM_REG_PPC_TM_TAR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69) + /* PPC64 eXternal Interrupt Controller Specification */ #define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e60a3697932..2ea5cc033ec 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -439,7 +439,7 @@ int main(void) DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr)); DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0)); DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1)); @@ -470,7 +470,7 @@ int main(void) DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); /* book3s */ -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); @@ -502,6 +502,8 @@ int main(void) DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); + DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar)); + DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar)); DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); @@ -511,18 +513,22 @@ int main(void) DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid)); DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar)); + DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr)); + DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); - DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - - offsetof(struct kvmppc_vcpu_book3s, vcpu)); + DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset)); + DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr)); + DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv)); DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb)); #ifdef CONFIG_PPC_BOOK3S_64 -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + DEFINE(PACA_SVCPU, offsetof(struct paca_struct, shadow_vcpu)); # define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f)) #else # define SVCPU_FIELD(x, f) @@ -574,7 +580,7 @@ int main(void) HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); HSTATE_FIELD(HSTATE_NAPPING, napping); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req); HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state); HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); @@ -590,10 +596,11 @@ int main(void) HSTATE_FIELD(HSTATE_DABR, dabr); HSTATE_FIELD(HSTATE_DECEXP, dec_expires); DEFINE(IPI_PRIORITY, IPI_PRIORITY); -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 HSTATE_FIELD(HSTATE_CFAR, cfar); + HSTATE_FIELD(HSTATE_PPR, ppr); #endif /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3a9ed6ac224..9f905e40922 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -126,7 +126,7 @@ BEGIN_FTR_SECTION bgt cr1,. GET_PACA(r13) -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL stb r0,HSTATE_HWTHREAD_STATE(r13) /* Order setting hwthread_state vs. testing hwthread_req */ @@ -425,7 +425,7 @@ data_access_check_stab: mfspr r9,SPRN_DSISR srdi r10,r10,60 rlwimi r10,r9,16,0x20 -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE lbz r9,HSTATE_IN_GUEST(r13) rlwimi r10,r9,8,0x300 #endif @@ -650,6 +650,32 @@ slb_miss_user_pseries: b . /* prevent spec. execution */ #endif /* __DISABLED__ */ +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +kvmppc_skip_interrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_SRR0 + addi r13, r13, 4 + mtspr SPRN_SRR0, r13 + GET_SCRATCH0(r13) + rfid + b . + +kvmppc_skip_Hinterrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_HSRR0 + addi r13, r13, 4 + mtspr SPRN_HSRR0, r13 + GET_SCRATCH0(r13) + hrfid + b . +#endif + /* * Code from here down to __end_handlers is invoked from the * exception prologs above. Because the prologs assemble the diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index e11863f4e59..847e40e62fc 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -84,7 +84,7 @@ _GLOBAL(power7_nap) std r9,_MSR(r1) std r1,PACAR1(r13) -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* Tell KVM we're napping */ li r4,KVM_HWTHREAD_IN_NAP stb r4,HSTATE_HWTHREAD_STATE(r13) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 62c3dd8c69f..907a472f9a9 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1529,7 +1529,7 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status) * back on or not. */ if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, - current->thread.debug.dbcr1)) + current->thread.debug.dbcr1)) regs->msr |= MSR_DE; else /* Make sure the IDM flag is off */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 2f5c6b6d687..93221e87b91 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -31,13 +31,13 @@ #include "44x_tlb.h" #include "booke.h" -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu) { kvmppc_booke_vcpu_load(vcpu, cpu); kvmppc_44x_tlb_load(vcpu); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu) { kvmppc_44x_tlb_put(vcpu); kvmppc_booke_vcpu_put(vcpu); @@ -114,29 +114,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { - kvmppc_get_sregs_ivor(vcpu, sregs); + return kvmppc_get_sregs_ivor(vcpu, sregs); } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { return kvmppc_set_sregs_ivor(vcpu, sregs); } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { return -EINVAL; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { return -EINVAL; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_44x *vcpu_44x; struct kvm_vcpu *vcpu; @@ -167,7 +170,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); @@ -176,28 +179,53 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_44x); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_44x(struct kvm *kvm) { return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_44x(struct kvm *kvm) { } +static struct kvmppc_ops kvm_ops_44x = { + .get_sregs = kvmppc_core_get_sregs_44x, + .set_sregs = kvmppc_core_set_sregs_44x, + .get_one_reg = kvmppc_get_one_reg_44x, + .set_one_reg = kvmppc_set_one_reg_44x, + .vcpu_load = kvmppc_core_vcpu_load_44x, + .vcpu_put = kvmppc_core_vcpu_put_44x, + .vcpu_create = kvmppc_core_vcpu_create_44x, + .vcpu_free = kvmppc_core_vcpu_free_44x, + .mmu_destroy = kvmppc_mmu_destroy_44x, + .init_vm = kvmppc_core_init_vm_44x, + .destroy_vm = kvmppc_core_destroy_vm_44x, + .emulate_op = kvmppc_core_emulate_op_44x, + .emulate_mtspr = kvmppc_core_emulate_mtspr_44x, + .emulate_mfspr = kvmppc_core_emulate_mfspr_44x, +}; + static int __init kvmppc_44x_init(void) { int r; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; + + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_44x.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_44x; - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); +err_out: + return r; } static void __exit kvmppc_44x_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 35ec0a8547d..92c9ab4bcfe 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -91,8 +91,8 @@ static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) return EMULATE_DONE; } -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int dcrn = get_dcrn(inst); @@ -152,7 +152,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; @@ -172,7 +172,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index ed038544814..0deef1082e0 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -268,7 +268,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x, trace_kvm_stlb_inval(stlb_index); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); int i; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index e593ff257bd..141b2027189 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -35,17 +35,20 @@ config KVM_BOOK3S_64_HANDLER bool select KVM_BOOK3S_HANDLER -config KVM_BOOK3S_PR +config KVM_BOOK3S_PR_POSSIBLE bool select KVM_MMIO select MMU_NOTIFIER +config KVM_BOOK3S_HV_POSSIBLE + bool + config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT select KVM select KVM_BOOK3S_32_HANDLER - select KVM_BOOK3S_PR + select KVM_BOOK3S_PR_POSSIBLE ---help--- Support running unmodified book3s_32 guest kernels in virtual machines on book3s_32 host processors. @@ -60,6 +63,7 @@ config KVM_BOOK3S_64 depends on PPC_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM + select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. @@ -70,8 +74,9 @@ config KVM_BOOK3S_64 If unsure, say N. config KVM_BOOK3S_64_HV - bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" + tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER select CMA ---help--- @@ -90,9 +95,20 @@ config KVM_BOOK3S_64_HV If unsure, say N. config KVM_BOOK3S_64_PR - def_bool y - depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV - select KVM_BOOK3S_PR + tristate "KVM support without using hypervisor mode in host" + depends on KVM_BOOK3S_64 + select KVM_BOOK3S_PR_POSSIBLE + ---help--- + Support running guest kernels in virtual machines on processors + without using hypervisor mode in the host, by running the + guest in user mode (problem state) and emulating all + privileged instructions and registers. + + This is not as fast as using hypervisor mode, but works on + machines where hypervisor mode is not available or not usable, + and can emulate processors that are different from the host + processor, including emulating 32-bit processors on a 64-bit + host. config KVM_BOOKE_HV bool diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 6646c952c5e..ce569b6bf4d 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -53,41 +53,51 @@ kvm-e500mc-objs := \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ - $(KVM)/coalesced_mmio.o \ +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \ + book3s_64_vio_hv.o + +kvm-pr-y := \ fpu.o \ book3s_paired_singles.o \ book3s_pr.o \ book3s_pr_papr.o \ - book3s_64_vio_hv.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ book3s_64_mmu_host.o \ book3s_64_mmu.o \ book3s_32_mmu.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ + +ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +kvm-book3s_64-module-objs := \ + $(KVM)/coalesced_mmio.o + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_rmhandlers.o +endif -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ +kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o + kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ book3s_hv_rm_xics.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + +ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ - book3s_64_vio_hv.o \ book3s_hv_ras.o \ book3s_hv_builtin.o \ book3s_hv_cma.o \ $(kvm-book3s_64-builtin-xics-objs-y) +endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o -kvm-book3s_64-module-objs := \ +kvm-book3s_64-module-objs += \ $(KVM)/kvm_main.o \ $(KVM)/eventfd.o \ powerpc.o \ @@ -123,4 +133,7 @@ obj-$(CONFIG_KVM_E500MC) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o +obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o + obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 700df6f1d32..8912608b7e1 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -34,6 +34,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> +#include "book3s.h" #include "trace.h" #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -69,6 +70,50 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) +{ + if (!is_kvmppc_hv_enabled(vcpu->kvm)) + return to_book3s(vcpu)->hior; + return 0; +} + +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, + unsigned long pending_now, unsigned long old_pending) +{ + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return; + if (pending_now) + vcpu->arch.shared->int_pending = 1; + else if (old_pending) + vcpu->arch.shared->int_pending = 0; +} + +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) +{ + ulong crit_raw; + ulong crit_r1; + bool crit; + + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return false; + + crit_raw = vcpu->arch.shared->critical; + crit_r1 = kvmppc_get_gpr(vcpu, 1); + + /* Truncate crit indicators in 32 bit mode */ + if (!(vcpu->arch.shared->msr & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... and we're in supervisor mode */ + crit = crit && !(vcpu->arch.shared->msr & MSR_PR); + + return crit; +} + void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); @@ -126,28 +171,32 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) printk(KERN_INFO "Queueing interrupt %x\n", vec); #endif } - +EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { /* might as well deliver this straight away */ kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec); int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) { return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); } +EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec); void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) @@ -285,8 +334,10 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) return 0; } +EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); -pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) +pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, + bool *writable) { ulong mp_pa = vcpu->arch.magic_page_pa; @@ -302,20 +353,23 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; get_page(pfn_to_page(pfn)); + if (writable) + *writable = true; return pfn; } - return gfn_to_pfn(vcpu->kvm, gfn); + return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable); } +EXPORT_SYMBOL_GPL(kvmppc_gfn_to_pfn); static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, - struct kvmppc_pte *pte) + bool iswrite, struct kvmppc_pte *pte) { int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR)); int r; if (relocated) { - r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); + r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite); } else { pte->eaddr = eaddr; pte->raddr = eaddr & KVM_PAM; @@ -361,7 +415,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.st++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, true, &pte)) return -ENOENT; *eaddr = pte.raddr; @@ -374,6 +428,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, return EMULATE_DONE; } +EXPORT_SYMBOL_GPL(kvmppc_st); int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data) @@ -383,7 +438,7 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.ld++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, false, &pte)) goto nopte; *eaddr = pte.raddr; @@ -404,6 +459,7 @@ nopte: mmio: return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_ld); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { @@ -419,6 +475,18 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) { } +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -495,8 +563,7 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) if (size > sizeof(val)) return -EINVAL; - r = kvmppc_get_one_reg(vcpu, reg->id, &val); - + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); if (r == -EINVAL) { r = 0; switch (reg->id) { @@ -528,6 +595,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) } val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); + break; #endif /* CONFIG_ALTIVEC */ case KVM_REG_PPC_DEBUG_INST: { u32 opcode = INS_TW; @@ -572,8 +642,7 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) return -EFAULT; - r = kvmppc_set_one_reg(vcpu, reg->id, &val); - + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); if (r == -EINVAL) { r = 0; switch (reg->id) { @@ -605,6 +674,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) } vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); break; + case KVM_REG_PPC_VRSAVE: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_KVM_XICS case KVM_REG_PPC_ICP_STATE: @@ -625,6 +701,27 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr); + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu); +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { @@ -644,3 +741,141 @@ void kvmppc_decrementer_func(unsigned long data) kvmppc_core_queue_dec(vcpu); kvm_vcpu_kick(vcpu); } + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->check_requests(vcpu); +} + +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return kvm->arch.kvm_ops->get_dirty_log(kvm, log); +} + +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + kvm->arch.kvm_ops->free_memslot(free, dont); +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvm->arch.kvm_ops->create_memslot(slot, npages); +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + kvm->arch.kvm_ops->flush_memslot(kvm, memslot); +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->unmap_hva(kvm, hva); +} +EXPORT_SYMBOL_GPL(kvm_unmap_hva); + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->age_hva(kvm, hva); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->test_age_hva(kvm, hva); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + +#ifdef CONFIG_PPC64 + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); +#endif + + return kvm->arch.kvm_ops->init_vm(kvm); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); + +#ifdef CONFIG_PPC64 + kvmppc_rtas_tokens_free(kvm); + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif +} + +int kvmppc_core_check_processor_compat(void) +{ + /* + * We always return 0 for book3s. We check + * for compatability while loading the HV + * or PR module + */ + return 0; +} + +static int kvmppc_book3s_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) + return r; +#ifdef CONFIG_KVM_BOOK3S_32 + r = kvmppc_book3s_init_pr(); +#endif + return r; + +} + +static void kvmppc_book3s_exit(void) +{ +#ifdef CONFIG_KVM_BOOK3S_32 + kvmppc_book3s_exit_pr(); +#endif + kvm_exit(); +} + +module_init(kvmppc_book3s_init); +module_exit(kvmppc_book3s_exit); diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h new file mode 100644 index 00000000000..4bf956cf94d --- /dev/null +++ b/arch/powerpc/kvm/book3s.h @@ -0,0 +1,34 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ + +#ifndef __POWERPC_KVM_BOOK3S_H__ +#define __POWERPC_KVM_BOOK3S_H__ + +extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot); +extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, + unsigned long end); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); + +extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong spr_val); +extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong *spr_val); +extern int kvmppc_book3s_init_pr(void); +extern void kvmppc_book3s_exit_pr(void); + +#endif diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index c8cefdd15fd..76a64ce6a5b 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -84,7 +84,8 @@ static inline bool sr_nx(u32 sr_raw) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data); + struct kvmppc_pte *pte, bool data, + bool iswrite); static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); @@ -99,7 +100,7 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, u64 vsid; struct kvmppc_pte pte; - if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data)) + if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false)) return pte.vpage; kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); @@ -111,10 +112,11 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, 0); } -static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu, u32 sre, gva_t eaddr, bool primary) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 page, hash, pteg, htabmask; hva_t r; @@ -132,7 +134,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3 kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg, sr_vsid(sre)); - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; return r | (pteg & ~PAGE_MASK); @@ -145,7 +147,8 @@ static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_bat *bat; @@ -186,8 +189,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, printk(KERN_INFO "BAT is not readable!\n"); continue; } - if (!pte->may_write) { - /* let's treat r/o BATs as not-readable for now */ + if (iswrite && !pte->may_write) { dprintk_pte("BAT is read-only!\n"); continue; } @@ -201,9 +203,8 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data, - bool primary) + bool iswrite, bool primary) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 sre; hva_t ptegp; u32 pteg[16]; @@ -218,7 +219,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); - ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary); + ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary); if (kvm_is_error_hva(ptegp)) { printk(KERN_INFO "KVM: Invalid PTEG!\n"); goto no_page_found; @@ -258,9 +259,6 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, break; } - if ( !pte->may_read ) - continue; - dprintk_pte("MMU: Found PTE -> %x %x - %x\n", pteg[i], pteg[i+1], pp); found = 1; @@ -271,19 +269,23 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, /* Update PTE C and A bits, so the guest's swapper knows we used the page */ if (found) { - u32 oldpte = pteg[i+1]; - - if (pte->may_read) - pteg[i+1] |= PTEG_FLAG_ACCESSED; - if (pte->may_write) - pteg[i+1] |= PTEG_FLAG_DIRTY; - else - dprintk_pte("KVM: Mapping read-only page!\n"); - - /* Write back into the PTEG */ - if (pteg[i+1] != oldpte) - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); - + u32 pte_r = pteg[i+1]; + char __user *addr = (char __user *) &pteg[i+1]; + + /* + * Use single-byte writes to update the HPTE, to + * conform to what real hardware does. + */ + if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) { + pte_r |= PTEG_FLAG_ACCESSED; + put_user(pte_r >> 8, addr + 2); + } + if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) { + pte_r |= PTEG_FLAG_DIRTY; + put_user(pte_r, addr + 3); + } + if (!pte->may_read || (iswrite && !pte->may_write)) + return -EPERM; return 0; } @@ -302,12 +304,14 @@ no_page_found: } static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { int r; ulong mp_ea = vcpu->arch.magic_page_ea; pte->eaddr = eaddr; + pte->page_size = MMU_PAGE_4K; /* Magic page override */ if (unlikely(mp_ea) && @@ -323,11 +327,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } - r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); + r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, true); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, false); return r; } @@ -347,7 +353,12 @@ static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large) { - kvmppc_mmu_pte_flush(vcpu, ea, 0x0FFFF000); + int i; + struct kvm_vcpu *v; + + /* flush this VA on all cpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000); } static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 00e619bf608..3a0abd2e5a1 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -138,7 +138,8 @@ static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr, extern char etext[]; -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { pfn_t hpaddr; u64 vpn; @@ -152,9 +153,11 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) bool evict = false; struct hpte_cache *pte; int r = 0; + bool writable; /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); + hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, + iswrite, &writable); if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); @@ -204,7 +207,7 @@ next_pteg: (primary ? 0 : PTE_SEC); pteg1 = hpaddr | PTE_M | PTE_R | PTE_C; - if (orig_pte->may_write) { + if (orig_pte->may_write && writable) { pteg1 |= PP_RWRW; mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); } else { @@ -259,6 +262,11 @@ out: return r; } +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL); +} + static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { struct kvmppc_sid_map *map; @@ -341,7 +349,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { int i; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 7e345e00661..83da1f868fd 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -107,9 +107,20 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, return kvmppc_slb_calc_vpn(slb, eaddr); } +static int mmu_pagesize(int mmu_pg) +{ + switch (mmu_pg) { + case MMU_PAGE_64K: + return 16; + case MMU_PAGE_16M: + return 24; + } + return 12; +} + static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) { - return slbe->large ? 24 : 12; + return mmu_pagesize(slbe->base_page_size); } static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) @@ -119,11 +130,11 @@ static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p); } -static hva_t kvmppc_mmu_book3s_64_get_pteg( - struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu, struct kvmppc_slb *slbe, gva_t eaddr, bool second) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u64 hash, pteg, htabsize; u32 ssize; hva_t r; @@ -148,10 +159,10 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg( /* When running a PAPR guest, SDR1 contains a HVA address instead of a GPA */ - if (vcpu_book3s->vcpu.arch.papr_enabled) + if (vcpu->arch.papr_enabled) r = pteg; else - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; @@ -166,18 +177,38 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr) avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p); - if (p < 24) - avpn >>= ((80 - p) - 56) - 8; + if (p < 16) + avpn >>= ((80 - p) - 56) - 8; /* 16 - p */ else - avpn <<= 8; + avpn <<= p - 16; return avpn; } +/* + * Return page size encoded in the second word of a HPTE, or + * -1 for an invalid encoding for the base page size indicated by + * the SLB entry. This doesn't handle mixed pagesize segments yet. + */ +static int decode_pagesize(struct kvmppc_slb *slbe, u64 r) +{ + switch (slbe->base_page_size) { + case MMU_PAGE_64K: + if ((r & 0xf000) == 0x1000) + return MMU_PAGE_64K; + break; + case MMU_PAGE_16M: + if ((r & 0xff000) == 0) + return MMU_PAGE_16M; + break; + } + return -1; +} + static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, + bool iswrite) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_slb *slbe; hva_t ptegp; u64 pteg[16]; @@ -189,6 +220,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, u8 pp, key = 0; bool found = false; bool second = false; + int pgsize; ulong mp_ea = vcpu->arch.magic_page_ea; /* Magic page override */ @@ -202,6 +234,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_execute = true; gpte->may_read = true; gpte->may_write = true; + gpte->page_size = MMU_PAGE_4K; return 0; } @@ -222,8 +255,12 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID | HPTE_V_SECONDARY; + pgsize = slbe->large ? MMU_PAGE_16M : MMU_PAGE_4K; + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + do_second: - ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); + ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second); if (kvm_is_error_hva(ptegp)) goto no_page_found; @@ -240,6 +277,13 @@ do_second: for (i=0; i<16; i+=2) { /* Check all relevant fields of 1st dword */ if ((pteg[i] & v_mask) == v_val) { + /* If large page bit is set, check pgsize encoding */ + if (slbe->large && + (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + pgsize = decode_pagesize(slbe, pteg[i+1]); + if (pgsize < 0) + continue; + } found = true; break; } @@ -256,13 +300,15 @@ do_second: v = pteg[i]; r = pteg[i+1]; pp = (r & HPTE_R_PP) | key; - eaddr_mask = 0xFFF; + if (r & HPTE_R_PP0) + pp |= 8; gpte->eaddr = eaddr; gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); - if (slbe->large) - eaddr_mask = 0xFFFFFF; + + eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1; gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); + gpte->page_size = pgsize; gpte->may_execute = ((r & HPTE_R_N) ? false : true); gpte->may_read = false; gpte->may_write = false; @@ -277,6 +323,7 @@ do_second: case 3: case 5: case 7: + case 10: gpte->may_read = true; break; } @@ -287,30 +334,37 @@ do_second: /* Update PTE R and C bits, so the guest's swapper knows we used the * page */ - if (gpte->may_read) { - /* Set the accessed flag */ + if (gpte->may_read && !(r & HPTE_R_R)) { + /* + * Set the accessed flag. + * We have to write this back with a single byte write + * because another vcpu may be accessing this on + * non-PAPR platforms such as mac99, and this is + * what real hardware does. + */ + char __user *addr = (char __user *) &pteg[i+1]; r |= HPTE_R_R; + put_user(r >> 8, addr + 6); } - if (data && gpte->may_write) { - /* Set the dirty flag -- XXX even if not writing */ + if (iswrite && gpte->may_write && !(r & HPTE_R_C)) { + /* Set the dirty flag */ + /* Use a single byte write */ + char __user *addr = (char __user *) &pteg[i+1]; r |= HPTE_R_C; + put_user(r, addr + 7); } - /* Write back into the PTEG */ - if (pteg[i+1] != r) { - pteg[i+1] = r; - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); - } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); - if (!gpte->may_read) + if (!gpte->may_read || (iswrite && !gpte->may_write)) return -EPERM; return 0; no_page_found: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); return -ENOENT; no_seg_found: - dprintk("KVM MMU: Trigger segment fault\n"); return -EINVAL; } @@ -345,6 +399,21 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe->nx = (rs & SLB_VSID_N) ? 1 : 0; slbe->class = (rs & SLB_VSID_C) ? 1 : 0; + slbe->base_page_size = MMU_PAGE_4K; + if (slbe->large) { + if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) { + switch (rs & SLB_VSID_LP) { + case SLB_VSID_LP_00: + slbe->base_page_size = MMU_PAGE_16M; + break; + case SLB_VSID_LP_01: + slbe->base_page_size = MMU_PAGE_64K; + break; + } + } else + slbe->base_page_size = MMU_PAGE_16M; + } + slbe->orige = rb & (ESID_MASK | SLB_ESID_V); slbe->origv = rs; @@ -460,14 +529,45 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va, bool large) { u64 mask = 0xFFFFFFFFFULL; + long i; + struct kvm_vcpu *v; dprintk("KVM MMU: tlbie(0x%lx)\n", va); - if (large) - mask = 0xFFFFFF000ULL; - kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask); + /* + * The tlbie instruction changed behaviour starting with + * POWER6. POWER6 and later don't have the large page flag + * in the instruction but in the RB value, along with bits + * indicating page and segment sizes. + */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) { + /* POWER6 or later */ + if (va & 1) { /* L bit */ + if ((va & 0xf000) == 0x1000) + mask = 0xFFFFFFFF0ULL; /* 64k page */ + else + mask = 0xFFFFFF000ULL; /* 16M page */ + } + } else { + /* older processors, e.g. PPC970 */ + if (large) + mask = 0xFFFFFF000ULL; + } + /* flush this VA on all vcpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_vflush(v, va >> 12, mask); } +#ifdef CONFIG_PPC_64K_PAGES +static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid) +{ + ulong mp_ea = vcpu->arch.magic_page_ea; + + return mp_ea && !(vcpu->arch.shared->msr & MSR_PR) && + (mp_ea >> SID_SHIFT) == esid; +} +#endif + static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid) { @@ -475,11 +575,13 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, struct kvmppc_slb *slb; u64 gvsid = esid; ulong mp_ea = vcpu->arch.magic_page_ea; + int pagesize = MMU_PAGE_64K; if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); if (slb) { gvsid = slb->vsid; + pagesize = slb->base_page_size; if (slb->tb) { gvsid <<= SID_SHIFT_1T - SID_SHIFT; gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1); @@ -490,28 +592,41 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { case 0: - *vsid = VSID_REAL | esid; + gvsid = VSID_REAL | esid; break; case MSR_IR: - *vsid = VSID_REAL_IR | gvsid; + gvsid |= VSID_REAL_IR; break; case MSR_DR: - *vsid = VSID_REAL_DR | gvsid; + gvsid |= VSID_REAL_DR; break; case MSR_DR|MSR_IR: if (!slb) goto no_slb; - *vsid = gvsid; break; default: BUG(); break; } +#ifdef CONFIG_PPC_64K_PAGES + /* + * Mark this as a 64k segment if the host is using + * 64k pages, the host MMU supports 64k pages and + * the guest segment page size is >= 64k, + * but not if this segment contains the magic page. + */ + if (pagesize >= MMU_PAGE_64K && + mmu_psize_defs[MMU_PAGE_64K].shift && + !segment_contains_magic_page(vcpu, esid)) + gvsid |= VSID_64K; +#endif + if (vcpu->arch.shared->msr & MSR_PR) - *vsid |= VSID_PR; + gvsid |= VSID_PR; + *vsid = gvsid; return 0; no_slb: diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index e5240524bf6..0d513af62bb 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -27,14 +27,14 @@ #include <asm/machdep.h> #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, - MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M, + pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M, false); } @@ -78,7 +78,8 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) return NULL; } -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { unsigned long vpn; pfn_t hpaddr; @@ -90,16 +91,26 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) int attempt = 0; struct kvmppc_sid_map *map; int r = 0; + int hpsize = MMU_PAGE_4K; + bool writable; + unsigned long mmu_seq; + struct kvm *kvm = vcpu->kvm; + struct hpte_cache *cpte; + unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; + unsigned long pfn; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_noslot_pfn(hpaddr)) { - printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); + pfn = kvmppc_gfn_to_pfn(vcpu, gfn, iswrite, &writable); + if (is_error_noslot_pfn(pfn)) { + printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", gfn); r = -EINVAL; goto out; } - hpaddr <<= PAGE_SHIFT; - hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + hpaddr = pfn << PAGE_SHIFT; /* and write the mapping ea -> hpa into the pt */ vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); @@ -117,20 +128,39 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) goto out; } - vsid = map->host_vsid; - vpn = hpt_vpn(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); + vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); - if (!orig_pte->may_write) - rflags |= HPTE_R_PP; - else - mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); + kvm_set_pfn_accessed(pfn); + if (!orig_pte->may_write || !writable) + rflags |= PP_RXRX; + else { + mark_page_dirty(vcpu->kvm, gfn); + kvm_set_pfn_dirty(pfn); + } if (!orig_pte->may_execute) rflags |= HPTE_R_N; else - kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT); + kvmppc_mmu_flush_icache(pfn); + + /* + * Use 64K pages if possible; otherwise, on 64K page kernels, + * we need to transfer 4 more bits from guest real to host real addr. + */ + if (vsid & VSID_64K) + hpsize = MMU_PAGE_64K; + else + hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + + hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); - hash = hpt_hash(vpn, PTE_SIZE, MMU_SEGSIZE_256M); + cpte = kvmppc_mmu_hpte_cache_next(vcpu); + + spin_lock(&kvm->mmu_lock); + if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { + r = -EAGAIN; + goto out_unlock; + } map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); @@ -139,11 +169,11 @@ map_again: if (attempt > 1) if (ppc_md.hpte_remove(hpteg) < 0) { r = -1; - goto out; + goto out_unlock; } ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, - MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M); + hpsize, hpsize, MMU_SEGSIZE_256M); if (ret < 0) { /* If we couldn't map a primary PTE, try a secondary */ @@ -152,8 +182,6 @@ map_again: attempt++; goto map_again; } else { - struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); - trace_kvm_book3s_64_mmu_map(rflags, hpteg, vpn, hpaddr, orig_pte); @@ -164,19 +192,37 @@ map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); } - pte->slot = hpteg + (ret & 7); - pte->host_vpn = vpn; - pte->pte = *orig_pte; - pte->pfn = hpaddr >> PAGE_SHIFT; + cpte->slot = hpteg + (ret & 7); + cpte->host_vpn = vpn; + cpte->pte = *orig_pte; + cpte->pfn = pfn; + cpte->pagesize = hpsize; - kvmppc_mmu_hpte_cache_map(vcpu, pte); + kvmppc_mmu_hpte_cache_map(vcpu, cpte); + cpte = NULL; } - kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + if (cpte) + kvmppc_mmu_hpte_cache_free(cpte); out: return r; } +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + u64 mask = 0xfffffffffULL; + u64 vsid; + + vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid); + if (vsid & VSID_64K) + mask = 0xffffffff0ULL; + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask); +} + static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { struct kvmppc_sid_map *map; @@ -291,6 +337,12 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) slb_vsid &= ~SLB_VSID_KP; slb_esid |= slb_index; +#ifdef CONFIG_PPC_64K_PAGES + /* Set host segment base page size to 64K if possible */ + if (gvsid & VSID_64K) + slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp; +#endif + svcpu->slb[slb_index].esid = slb_esid; svcpu->slb[slb_index].vsid = slb_vsid; @@ -326,7 +378,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { kvmppc_mmu_hpte_destroy(vcpu); __destroy_context(to_book3s(vcpu)->context_id[0]); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 043eec8461e..f3ff587a8b7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -260,10 +260,6 @@ int kvmppc_mmu_hv_init(void) return 0; } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) -{ -} - static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) { kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); @@ -451,7 +447,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, } static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, bool iswrite) { struct kvm *kvm = vcpu->kvm; struct kvmppc_slb *slbe; @@ -906,21 +902,22 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) { if (kvm->arch.using_mmu_notifiers) kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { if (kvm->arch.using_mmu_notifiers) kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); return 0; } -void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot) { unsigned long *rmapp; unsigned long gfn; @@ -994,7 +991,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva) { if (!kvm->arch.using_mmu_notifiers) return 0; @@ -1032,14 +1029,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) { if (!kvm->arch.using_mmu_notifiers) return 0; return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) { if (!kvm->arch.using_mmu_notifiers) return; @@ -1512,9 +1509,8 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; - lpcr |= senc << (LPCR_VRMASD_SH - 4); - kvm->arch.lpcr = lpcr; + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); rma_setup = 1; } ++i; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 30c2f3b134c..2c25f5412bd 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -74,3 +74,4 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, /* Didn't find the liobn, punt it to userspace */ return H_TOO_HARD; } +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 360ce68c980..99d40f8977e 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -86,8 +86,8 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) return true; } -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int rt = get_rt(inst); @@ -172,7 +172,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } -#ifdef CONFIG_KVM_BOOK3S_64_PR +#ifdef CONFIG_PPC_BOOK3S_64 case OP_31_XOP_FAKE_SC1: { /* SC 1 papr hypercalls */ @@ -267,12 +267,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { - struct kvmppc_book3s_shadow_vcpu *svcpu; - - svcpu = svcpu_get(vcpu); *advance = 0; vcpu->arch.shared->dar = vaddr; - svcpu->fault_dar = vaddr; + vcpu->arch.fault_dar = vaddr; dsisr = DSISR_ISSTORE; if (r == -ENOENT) @@ -281,8 +278,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->dsisr = dsisr; - svcpu->fault_dsisr = dsisr; - svcpu_put(svcpu); + vcpu->arch.fault_dsisr = dsisr; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); @@ -349,7 +345,7 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn) return bat; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; @@ -472,7 +468,7 @@ unprivileged: return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 7057a02f090..852989a9bad 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -20,9 +20,10 @@ #include <linux/export.h> #include <asm/kvm_book3s.h> -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); -#else +#endif +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); #ifdef CONFIG_ALTIVEC diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 62a2b5ab08e..072287f1c3b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -52,6 +52,9 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/hugetlb.h> +#include <linux/module.h> + +#include "book3s.h" /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ @@ -66,7 +69,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); -void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) +static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { int me; int cpu = vcpu->cpu; @@ -125,7 +128,7 @@ void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) * purely defensive; they should never fail.) */ -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -143,7 +146,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) spin_unlock(&vcpu->arch.tbacct_lock); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -155,17 +158,46 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.tbacct_lock); } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { vcpu->arch.shregs.msr = msr; kvmppc_end_cede(vcpu); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) { vcpu->arch.pvr = pvr; } +int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) +{ + unsigned long pcr = 0; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (arch_compat) { + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return -EINVAL; /* 970 has no compat mode support */ + + switch (arch_compat) { + case PVR_ARCH_205: + pcr = PCR_ARCH_205; + break; + case PVR_ARCH_206: + case PVR_ARCH_206p: + break; + default: + return -EINVAL; + } + } + + spin_lock(&vc->lock); + vc->arch_compat = arch_compat; + vc->pcr = pcr; + spin_unlock(&vc->lock); + + return 0; +} + void kvmppc_dump_regs(struct kvm_vcpu *vcpu) { int r; @@ -195,7 +227,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu) pr_err(" ESID = %.16llx VSID = %.16llx\n", vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", - vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1, + vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, vcpu->arch.last_inst); } @@ -489,7 +521,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = now; + dt->timebase = now + vc->tb_offset; dt->enqueue_to_dispatch_time = stolen; dt->srr0 = kvmppc_get_pc(vcpu); dt->srr1 = vcpu->arch.shregs.msr; @@ -538,6 +570,15 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } break; case H_CONFER: + target = kvmppc_get_gpr(vcpu, 4); + if (target == -1) + break; + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + kvm_vcpu_yield_to(tvcpu); break; case H_REGISTER_VPA: ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), @@ -576,8 +617,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - struct task_struct *tsk) +static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + struct task_struct *tsk) { int r = RESUME_HOST; @@ -671,16 +712,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.shregs.msr); + run->hw.hardware_exit_reason = vcpu->arch.trap; r = RESUME_HOST; - BUG(); break; } return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i; @@ -694,12 +735,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i, j; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_hv(vcpu, sregs->pvr); j = 0; for (i = 0; i < vcpu->arch.slb_nr; i++) { @@ -714,7 +755,23 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + spin_lock(&vc->lock); + /* + * Userspace can only modify DPFD (default prefetch depth), + * ILE (interrupt little-endian) and TC (translation control). + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; + vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); + spin_unlock(&vc->lock); +} + +static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; long int i; @@ -749,6 +806,12 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; *val = get_reg_val(id, vcpu->arch.pmc[i]); break; + case KVM_REG_PPC_SIAR: + *val = get_reg_val(id, vcpu->arch.siar); + break; + case KVM_REG_PPC_SDAR: + *val = get_reg_val(id, vcpu->arch.sdar); + break; #ifdef CONFIG_VSX case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: if (cpu_has_feature(CPU_FTR_VSX)) { @@ -787,6 +850,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) val->vpaval.length = vcpu->arch.dtl.len; spin_unlock(&vcpu->arch.vpa_update_lock); break; + case KVM_REG_PPC_TB_OFFSET: + *val = get_reg_val(id, vcpu->arch.vcore->tb_offset); + break; + case KVM_REG_PPC_LPCR: + *val = get_reg_val(id, vcpu->arch.vcore->lpcr); + break; + case KVM_REG_PPC_PPR: + *val = get_reg_val(id, vcpu->arch.ppr); + break; + case KVM_REG_PPC_ARCH_COMPAT: + *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); + break; default: r = -EINVAL; break; @@ -795,7 +870,8 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; long int i; @@ -833,6 +909,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; vcpu->arch.pmc[i] = set_reg_val(id, *val); break; + case KVM_REG_PPC_SIAR: + vcpu->arch.siar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SDAR: + vcpu->arch.sdar = set_reg_val(id, *val); + break; #ifdef CONFIG_VSX case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: if (cpu_has_feature(CPU_FTR_VSX)) { @@ -880,6 +962,20 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) len -= len % sizeof(struct dtl_entry); r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; + case KVM_REG_PPC_TB_OFFSET: + /* round up to multiple of 2^24 */ + vcpu->arch.vcore->tb_offset = + ALIGN(set_reg_val(id, *val), 1UL << 24); + break; + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr(vcpu, set_reg_val(id, *val)); + break; + case KVM_REG_PPC_PPR: + vcpu->arch.ppr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_ARCH_COMPAT: + r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); + break; default: r = -EINVAL; break; @@ -888,14 +984,8 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_core_check_processor_compat(void) -{ - if (cpu_has_feature(CPU_FTR_HVMODE)) - return 0; - return -EIO; -} - -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, + unsigned int id) { struct kvm_vcpu *vcpu; int err = -EINVAL; @@ -919,8 +1009,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.mmcr[0] = MMCR0_FC; vcpu->arch.ctrl = CTRL_RUNLATCH; /* default to host PVR, since we can't spoof it */ - vcpu->arch.pvr = mfspr(SPRN_PVR); - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); spin_lock_init(&vcpu->arch.vpa_update_lock); spin_lock_init(&vcpu->arch.tbacct_lock); vcpu->arch.busy_preempt = TB_NIL; @@ -940,6 +1029,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; + vcore->lpcr = kvm->arch.lpcr; } kvm->arch.vcores[core] = vcore; kvm->arch.online_vcores++; @@ -972,7 +1062,7 @@ static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) vpa->dirty); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->arch.vpa_update_lock); unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); @@ -983,6 +1073,12 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu); } +static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) +{ + /* Indicate we want to get back into the guest */ + return 1; +} + static void kvmppc_set_timer(struct kvm_vcpu *vcpu) { unsigned long dec_nsec, now; @@ -1264,8 +1360,8 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) ret = RESUME_GUEST; if (vcpu->arch.trap) - ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); vcpu->arch.ret = ret; vcpu->arch.trap = 0; @@ -1424,7 +1520,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } -int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; int srcu_idx; @@ -1546,7 +1642,8 @@ static const struct file_operations kvm_rma_fops = { .release = kvm_rma_release, }; -long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) +static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, + struct kvm_allocate_rma *ret) { long fd; struct kvm_rma_info *ri; @@ -1592,7 +1689,8 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, (*sps)++; } -int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { struct kvm_ppc_one_seg_page_size *sps; @@ -1613,7 +1711,8 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) /* * Get (and clear) the dirty memory log for a memory slot. */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, + struct kvm_dirty_log *log) { struct kvm_memory_slot *memslot; int r; @@ -1667,8 +1766,8 @@ static void unpin_slot(struct kvm_memory_slot *memslot) } } -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) { if (!dont || free->arch.rmap != dont->arch.rmap) { vfree(free->arch.rmap); @@ -1681,8 +1780,8 @@ void kvmppc_core_free_memslot(struct kvm_memory_slot *free, } } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) +static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, + unsigned long npages) { slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) @@ -1692,9 +1791,9 @@ int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, return 0; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) +static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) { unsigned long *phys; @@ -1710,9 +1809,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, return 0; } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) +static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; struct kvm_memory_slot *memslot; @@ -1729,6 +1828,37 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, } } +/* + * Update LPCR values in kvm->arch and in vcores. + * Caller must hold kvm->lock. + */ +void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) +{ + long int i; + u32 cores_done = 0; + + if ((kvm->arch.lpcr & mask) == lpcr) + return; + + kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr; + + for (i = 0; i < KVM_MAX_VCORES; ++i) { + struct kvmppc_vcore *vc = kvm->arch.vcores[i]; + if (!vc) + continue; + spin_lock(&vc->lock); + vc->lpcr = (vc->lpcr & ~mask) | lpcr; + spin_unlock(&vc->lock); + if (++cores_done >= kvm->arch.online_vcores) + break; + } +} + +static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) +{ + return; +} + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -1737,7 +1867,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) unsigned long hva; struct kvm_memory_slot *memslot; struct vm_area_struct *vma; - unsigned long lpcr, senc; + unsigned long lpcr = 0, senc; + unsigned long lpcr_mask = 0; unsigned long psize, porder; unsigned long rma_size; unsigned long rmls; @@ -1802,9 +1933,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) senc = slb_pgsize_encoding(psize); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; - lpcr |= senc << (LPCR_VRMASD_SH - 4); - kvm->arch.lpcr = lpcr; + lpcr_mask = LPCR_VRMASD; + /* the -4 is to account for senc values starting at 0x10 */ + lpcr = senc << (LPCR_VRMASD_SH - 4); /* Create HPTEs in the hash page table for the VRMA */ kvmppc_map_vrma(vcpu, memslot, porder); @@ -1825,23 +1956,21 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) kvm->arch.rma = ri; /* Update LPCR and RMOR */ - lpcr = kvm->arch.lpcr; if (cpu_has_feature(CPU_FTR_ARCH_201)) { /* PPC970; insert RMLS value (split field) in HID4 */ - lpcr &= ~((1ul << HID4_RMLS0_SH) | - (3ul << HID4_RMLS2_SH)); - lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) | + lpcr_mask = (1ul << HID4_RMLS0_SH) | + (3ul << HID4_RMLS2_SH) | HID4_RMOR; + lpcr = ((rmls >> 2) << HID4_RMLS0_SH) | ((rmls & 3) << HID4_RMLS2_SH); /* RMOR is also in HID4 */ lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff) << HID4_RMOR_SH; } else { /* POWER7 */ - lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); - lpcr |= rmls << LPCR_RMLS_SH; + lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS; + lpcr = rmls << LPCR_RMLS_SH; kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT; } - kvm->arch.lpcr = lpcr; pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); @@ -1860,6 +1989,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } } + kvmppc_update_lpcr(kvm, lpcr, lpcr_mask); + /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ smp_wmb(); kvm->arch.rma_setup_done = 1; @@ -1875,7 +2006,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; @@ -1893,9 +2024,6 @@ int kvmppc_core_init_vm(struct kvm *kvm) */ cpumask_setall(&kvm->arch.need_tlb_flush); - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - INIT_LIST_HEAD(&kvm->arch.rtas_tokens); - kvm->arch.rma = NULL; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); @@ -1931,61 +2059,162 @@ int kvmppc_core_init_vm(struct kvm *kvm) return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_free_vcores(struct kvm *kvm) +{ + long int i; + + for (i = 0; i < KVM_MAX_VCORES; ++i) + kfree(kvm->arch.vcores[i]); + kvm->arch.online_vcores = 0; +} + +static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { uninhibit_secondary_onlining(); + kvmppc_free_vcores(kvm); if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; } - kvmppc_rtas_tokens_free(kvm); - kvmppc_free_hpt(kvm); - WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); } -/* These are stubs for now */ -void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) +/* We don't need to emulate any privileged instructions or dcbz */ +static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { + return EMULATE_FAIL; } -/* We don't need to emulate any privileged instructions or dcbz */ -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +static int kvmppc_core_check_processor_compat_hv(void) { - return EMULATE_FAIL; + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EIO; + return 0; } -static int kvmppc_book3s_hv_init(void) +static long kvm_arch_vm_ioctl_hv(struct file *filp, + unsigned int ioctl, unsigned long arg) { - int r; + struct kvm *kvm __maybe_unused = filp->private_data; + void __user *argp = (void __user *)arg; + long r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + switch (ioctl) { - if (r) + case KVM_ALLOCATE_RMA: { + struct kvm_allocate_rma rma; + struct kvm *kvm = filp->private_data; + + r = kvm_vm_ioctl_allocate_rma(kvm, &rma); + if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + r = -EFAULT; + break; + } + + case KVM_PPC_ALLOCATE_HTAB: { + u32 htab_order; + + r = -EFAULT; + if (get_user(htab_order, (u32 __user *)argp)) + break; + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + if (r) + break; + r = -EFAULT; + if (put_user(htab_order, (u32 __user *)argp)) + break; + r = 0; + break; + } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm_get_htab_fd ghf; + + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } + + default: + r = -ENOTTY; + } + + return r; +} + +static struct kvmppc_ops kvm_ops_hv = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, + .get_one_reg = kvmppc_get_one_reg_hv, + .set_one_reg = kvmppc_set_one_reg_hv, + .vcpu_load = kvmppc_core_vcpu_load_hv, + .vcpu_put = kvmppc_core_vcpu_put_hv, + .set_msr = kvmppc_set_msr_hv, + .vcpu_run = kvmppc_vcpu_run_hv, + .vcpu_create = kvmppc_core_vcpu_create_hv, + .vcpu_free = kvmppc_core_vcpu_free_hv, + .check_requests = kvmppc_core_check_requests_hv, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv, + .flush_memslot = kvmppc_core_flush_memslot_hv, + .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, + .commit_memory_region = kvmppc_core_commit_memory_region_hv, + .unmap_hva = kvm_unmap_hva_hv, + .unmap_hva_range = kvm_unmap_hva_range_hv, + .age_hva = kvm_age_hva_hv, + .test_age_hva = kvm_test_age_hva_hv, + .set_spte_hva = kvm_set_spte_hva_hv, + .mmu_destroy = kvmppc_mmu_destroy_hv, + .free_memslot = kvmppc_core_free_memslot_hv, + .create_memslot = kvmppc_core_create_memslot_hv, + .init_vm = kvmppc_core_init_vm_hv, + .destroy_vm = kvmppc_core_destroy_vm_hv, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, + .emulate_op = kvmppc_core_emulate_op_hv, + .emulate_mtspr = kvmppc_core_emulate_mtspr_hv, + .emulate_mfspr = kvmppc_core_emulate_mfspr_hv, + .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, + .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, +}; + +static int kvmppc_book3s_init_hv(void) +{ + int r; + /* + * FIXME!! Do we need to check on all cpus ? + */ + r = kvmppc_core_check_processor_compat_hv(); + if (r < 0) return r; - r = kvmppc_mmu_hv_init(); + kvm_ops_hv.owner = THIS_MODULE; + kvmppc_hv_ops = &kvm_ops_hv; + r = kvmppc_mmu_hv_init(); return r; } -static void kvmppc_book3s_hv_exit(void) +static void kvmppc_book3s_exit_hv(void) { - kvm_exit(); + kvmppc_hv_ops = NULL; } -module_init(kvmppc_book3s_hv_init); -module_exit(kvmppc_book3s_hv_exit); +module_init(kvmppc_book3s_init_hv); +module_exit(kvmppc_book3s_exit_hv); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 37f1cc417ca..928142c64cb 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -158,9 +158,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * Interrupts are enabled again at this point. */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c71103b8a74..bc8de75b192 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -33,30 +33,6 @@ #error Need to fix lppaca and SLB shadow accesses in little endian mode #endif -/***************************************************************************** - * * - * Real Mode handlers that need to be in the linear mapping * - * * - ****************************************************************************/ - - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - mfspr r13,SPRN_SRR0 - addi r13,r13,4 - mtspr SPRN_SRR0,r13 - GET_SCRATCH0(r13) - rfid - b . - - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - mfspr r13,SPRN_HSRR0 - addi r13,r13,4 - mtspr SPRN_HSRR0,r13 - GET_SCRATCH0(r13) - hrfid - b . - /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. @@ -66,8 +42,11 @@ kvmppc_skip_Hinterrupt: * LR = return address to continue at after eventually re-enabling MMU */ _GLOBAL(kvmppc_hv_entry_trampoline) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) mfmsr r10 - LOAD_REG_ADDR(r5, kvmppc_hv_entry) + LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) li r0,MSR_RI andc r0,r10,r0 li r6,MSR_IR | MSR_DR @@ -77,11 +56,103 @@ _GLOBAL(kvmppc_hv_entry_trampoline) mtsrr1 r6 RFI -/****************************************************************************** - * * - * Entry code * - * * - *****************************************************************************/ +kvmppc_call_hv_entry: + bl kvmppc_hv_entry + + /* Back from guest - restore host state and return to caller */ + + /* Restore host DABR and DABRX */ + ld r5,HSTATE_DABR(r13) + li r6,7 + mtspr SPRN_DABR,r5 + mtspr SPRN_DABRX,r6 + + /* Restore SPRG3 */ + ld r3,PACA_SPRG3(r13) + mtspr SPRN_SPRG3,r3 + + /* + * Reload DEC. HDEC interrupts were disabled when + * we reloaded the host's LPCR value. + */ + ld r3, HSTATE_DECEXP(r13) + mftb r4 + subf r4, r4, r3 + mtspr SPRN_DEC, r4 + + /* Reload the host's PMU registers */ + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ + lbz r4, LPPACA_PMCINUSE(r3) + cmpwi r4, 0 + beq 23f /* skip if not */ + lwz r3, HSTATE_PMC(r13) + lwz r4, HSTATE_PMC + 4(r13) + lwz r5, HSTATE_PMC + 8(r13) + lwz r6, HSTATE_PMC + 12(r13) + lwz r8, HSTATE_PMC + 16(r13) + lwz r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + lwz r10, HSTATE_PMC + 24(r13) + lwz r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, HSTATE_MMCR(r13) + ld r4, HSTATE_MMCR + 8(r13) + ld r5, HSTATE_MMCR + 16(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_MMCR0, r3 + isync +23: + + /* + * For external and machine check interrupts, we need + * to call the Linux handler to process the interrupt. + * We do that by jumping to absolute address 0x500 for + * external interrupts, or the machine_check_fwnmi label + * for machine checks (since firmware might have patched + * the vector area at 0x200). The [h]rfid at the end of the + * handler will return to the book3s_hv_interrupts.S code. + * For other interrupts we do the rfid to get back + * to the book3s_hv_interrupts.S code here. + */ + ld r8, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + ld r7, HSTATE_HOST_MSR(r13) + + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL +BEGIN_FTR_SECTION + beq 11f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* RFI into the highmem handler, or branch to interrupt handler */ + mfmsr r6 + li r0, MSR_RI + andc r6, r6, r0 + mtmsrd r6, 1 /* Clear RI in MSR */ + mtsrr0 r8 + mtsrr1 r7 + beqa 0x500 /* external interrupt (PPC970) */ + beq cr1, 13f /* machine check */ + RFI + + /* On POWER7, we have external interrupts set to use HSRR0/1 */ +11: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0x500 + +13: b machine_check_fwnmi + /* * We come in here when wakened from nap mode on a secondary hw thread. @@ -137,7 +208,7 @@ kvm_start_guest: cmpdi r4,0 /* if we have no vcpu to run, go back to sleep */ beq kvm_no_guest - b kvmppc_hv_entry + b 30f 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ b kvm_no_guest @@ -147,6 +218,57 @@ kvm_start_guest: stw r8,HSTATE_SAVED_XIRR(r13) b kvm_no_guest +30: bl kvmppc_hv_entry + + /* Back from the guest, go back to nap */ + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + lwsync + /* Clear any pending IPI - we're an offline thread */ + ld r5, HSTATE_XICS_PHYS(r13) + li r7, XICS_XIRR + lwzcix r3, r5, r7 /* ack any pending interrupt */ + rlwinm. r0, r3, 0, 0xffffff /* any pending? */ + beq 37f + sync + li r0, 0xff + li r6, XICS_MFRR + stbcix r0, r5, r6 /* clear the IPI */ + stwcix r3, r5, r7 /* EOI it */ +37: sync + + /* increment the nap count and then go to nap mode */ + ld r4, HSTATE_KVM_VCORE(r13) + addi r4, r4, VCORE_NAP_COUNT + lwsync /* make previous updates visible */ +51: lwarx r3, 0, r4 + addi r3, r3, 1 + stwcx. r3, 0, r4 + bne 51b + +kvm_no_guest: + li r0, KVM_HWTHREAD_IN_NAP + stb r0, HSTATE_HWTHREAD_STATE(r13) + li r3, LPCR_PECE0 + mfspr r4, SPRN_LPCR + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR, r4 + isync + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ + .global kvmppc_hv_entry kvmppc_hv_entry: @@ -159,7 +281,8 @@ kvmppc_hv_entry: * all other volatile GPRS = free */ mflr r0 - std r0, HSTATE_VMHANDLER(r13) + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) /* Set partition DABR */ /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ @@ -200,8 +323,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) ld r3, VCPU_MMCR(r4) ld r5, VCPU_MMCR + 8(r4) ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) mtspr SPRN_MMCR1, r5 mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 mtspr SPRN_MMCR0, r3 isync @@ -254,22 +381,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) - /* Increment yield count if they have a VPA */ - ld r3, VCPU_VPA(r4) - cmpdi r3, 0 - beq 25f - lwz r5, LPPACA_YIELDCOUNT(r3) - addi r5, r5, 1 - stw r5, LPPACA_YIELDCOUNT(r3) - li r6, 1 - stb r6, VCPU_VPA_DIRTY(r4) -25: /* Load up DAR and DSISR */ ld r5, VCPU_DAR(r4) lwz r6, VCPU_DSISR(r4) mtspr SPRN_DAR, r5 mtspr SPRN_DSISR, r6 + li r6, KVM_GUEST_MODE_HOST_HV + stb r6, HSTATE_IN_GUEST(r13) + BEGIN_FTR_SECTION /* Restore AMR and UAMOR, set AMOR to all 1s */ ld r5,VCPU_AMR(r4) @@ -343,7 +463,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) bdnz 28b ptesync -22: li r0,1 + /* Add timebase offset onto timebase */ +22: ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 37f + mftb r6 /* current host timebase */ + add r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 37f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Load guest PCR value to select appropriate compat mode */ +37: ld r7, VCORE_PCR(r5) + cmpdi r7, 0 + beq 38f + mtspr SPRN_PCR, r7 +38: + li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ b 10f @@ -353,12 +494,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) beq 20b /* Set LPCR and RMOR. */ -10: ld r8,KVM_LPCR(r9) +10: ld r8,VCORE_LPCR(r5) mtspr SPRN_LPCR,r8 ld r8,KVM_RMOR(r9) mtspr SPRN_RMOR,r8 isync + /* Increment yield count if they have a VPA */ + ld r3, VCPU_VPA(r4) + cmpdi r3, 0 + beq 25f + lwz r5, LPPACA_YIELDCOUNT(r3) + addi r5, r5, 1 + stw r5, LPPACA_YIELDCOUNT(r3) + li r6, 1 + stb r6, VCPU_VPA_DIRTY(r4) +25: /* Check if HDEC expires soon */ mfspr r3,SPRN_HDEC cmpwi r3,10 @@ -405,7 +556,8 @@ toc_tlbie_lock: bne 24b isync - ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r7,VCORE_LPCR(r5) /* use vcore->lpcr to store HID4 */ li r0,0x18f rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ or r0,r7,r0 @@ -541,7 +693,7 @@ fast_guest_return: mtspr SPRN_HSRR1,r11 /* Activate guest mode, so faults get handled by KVM */ - li r9, KVM_GUEST_MODE_GUEST + li r9, KVM_GUEST_MODE_GUEST_HV stb r9, HSTATE_IN_GUEST(r13) /* Enter guest */ @@ -550,13 +702,15 @@ BEGIN_FTR_SECTION ld r5, VCPU_CFAR(r4) mtspr SPRN_CFAR, r5 END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r0, VCPU_PPR(r4) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r5, VCPU_LR(r4) lwz r6, VCPU_CR(r4) mtlr r5 mtcr r6 - ld r0, VCPU_GPR(R0)(r4) ld r1, VCPU_GPR(R1)(r4) ld r2, VCPU_GPR(R2)(r4) ld r3, VCPU_GPR(R3)(r4) @@ -570,6 +724,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r12, VCPU_GPR(R12)(r4) ld r13, VCPU_GPR(R13)(r4) +BEGIN_FTR_SECTION + mtspr SPRN_PPR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) hrfid @@ -584,8 +742,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) /* * We come here from the first-level interrupt handlers. */ - .globl kvmppc_interrupt -kvmppc_interrupt: + .globl kvmppc_interrupt_hv +kvmppc_interrupt_hv: /* * Register contents: * R12 = interrupt vector @@ -595,6 +753,19 @@ kvmppc_interrupt: */ /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ std r9, HSTATE_HOST_R2(r13) + + lbz r9, HSTATE_IN_GUEST(r13) + cmpwi r9, KVM_GUEST_MODE_HOST_HV + beq kvmppc_bad_host_intr +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + cmpwi r9, KVM_GUEST_MODE_GUEST + ld r9, HSTATE_HOST_R2(r13) + beq kvmppc_interrupt_pr +#endif + /* We're now back in the host but in guest MMU context */ + li r9, KVM_GUEST_MODE_HOST_HV + stb r9, HSTATE_IN_GUEST(r13) + ld r9, HSTATE_KVM_VCPU(r13) /* Save registers */ @@ -620,6 +791,10 @@ BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) std r3, VCPU_CFAR(r9) END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r4, HSTATE_PPR(r13) + std r4, VCPU_PPR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Restore R1/R2 so we can handle faults */ ld r1, HSTATE_HOST_R1(r13) @@ -642,10 +817,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) std r3, VCPU_GPR(R13)(r9) std r4, VCPU_LR(r9) - /* Unset guest mode */ - li r0, KVM_GUEST_MODE_NONE - stb r0, HSTATE_IN_GUEST(r13) - stw r12,VCPU_TRAP(r9) /* Save HEIR (HV emulation assist reg) in last_inst @@ -696,46 +867,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) * set, we know the host wants us out so let's do it now */ do_ext_interrupt: - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne ext_interrupt_to_host - - /* Now read the interrupt from the ICP */ - ld r5, HSTATE_XICS_PHYS(r13) - li r7, XICS_XIRR - cmpdi r5, 0 - beq- ext_interrupt_to_host - lwzcix r3, r5, r7 - rlwinm. r0, r3, 0, 0xffffff - sync - beq 3f /* if nothing pending in the ICP */ - - /* We found something in the ICP... - * - * If it's not an IPI, stash it in the PACA and return to - * the host, we don't (yet) handle directing real external - * interrupts directly to the guest - */ - cmpwi r0, XICS_IPI - bne ext_stash_for_host - - /* It's an IPI, clear the MFRR and EOI it */ - li r0, 0xff - li r6, XICS_MFRR - stbcix r0, r5, r6 /* clear the IPI */ - stwcix r3, r5, r7 /* EOI it */ - sync - - /* We need to re-check host IPI now in case it got set in the - * meantime. If it's clear, we bounce the interrupt to the - * guest - */ - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne- 1f + bl kvmppc_read_intr + cmpdi r3, 0 + bgt ext_interrupt_to_host /* Allright, looks like an IPI for the guest, we need to set MER */ -3: /* Check if any CPU is heading out to the host, if so head out too */ ld r5, HSTATE_KVM_VCORE(r13) lwz r0, VCORE_ENTRY_EXIT(r5) @@ -764,27 +900,9 @@ do_ext_interrupt: mtspr SPRN_LPCR, r8 b fast_guest_return - /* We raced with the host, we need to resend that IPI, bummer */ -1: li r0, IPI_PRIORITY - stbcix r0, r5, r6 /* set the IPI */ - sync - b ext_interrupt_to_host - -ext_stash_for_host: - /* It's not an IPI and it's for the host, stash it in the PACA - * before exit, it will be picked up by the host ICP driver - */ - stw r3, HSTATE_SAVED_XIRR(r13) ext_interrupt_to_host: guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ - /* Save DEC */ - mfspr r5,SPRN_DEC - mftb r6 - extsw r5,r5 - add r5,r5,r6 - std r5,VCPU_DEC_EXPIRES(r9) - /* Save more register state */ mfdar r6 mfdsisr r7 @@ -954,7 +1072,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync - li r0,0 + + /* Subtract timebase offset from timebase */ + ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 17f + mftb r6 /* current host timebase */ + subf r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 17f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Reset PCR */ +17: ld r0, VCORE_PCR(r5) + cmpdi r0, 0 + beq 18f + li r0, 0 + mtspr SPRN_PCR, r0 +18: + /* Signal secondary CPUs to continue */ stb r0,VCORE_IN_GUEST(r5) lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 @@ -1052,6 +1193,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 1: addi r8,r8,16 .endr + /* Save DEC */ + mfspr r5,SPRN_DEC + mftb r6 + extsw r5,r5 + add r5,r5,r6 + std r5,VCPU_DEC_EXPIRES(r9) + /* Save and reset AMR and UAMOR before turning on the MMU */ BEGIN_FTR_SECTION mfspr r5,SPRN_AMR @@ -1062,6 +1210,10 @@ BEGIN_FTR_SECTION mtspr SPRN_AMR,r6 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* Unset guest mode */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + /* Switch DSCR back to host value */ BEGIN_FTR_SECTION mfspr r8, SPRN_DSCR @@ -1134,9 +1286,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ b 22f 21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR std r4, VCPU_MMCR(r9) std r5, VCPU_MMCR + 8(r9) std r6, VCPU_MMCR + 16(r9) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) mfspr r3, SPRN_PMC1 mfspr r4, SPRN_PMC2 mfspr r5, SPRN_PMC3 @@ -1158,103 +1314,30 @@ BEGIN_FTR_SECTION stw r11, VCPU_PMC + 28(r9) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 22: + ld r0, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + mtlr r0 + blr +secondary_too_late: + ld r5,HSTATE_KVM_VCORE(r13) + HMT_LOW +13: lbz r3,VCORE_IN_GUEST(r5) + cmpwi r3,0 + bne 13b + HMT_MEDIUM + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + ld r11,PACA_SLBSHADOWPTR(r13) - /* Secondary threads go off to take a nap on POWER7 */ -BEGIN_FTR_SECTION - lwz r0,VCPU_PTID(r9) - cmpwi r0,0 - bne secondary_nap -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* Restore host DABR and DABRX */ - ld r5,HSTATE_DABR(r13) - li r6,7 - mtspr SPRN_DABR,r5 - mtspr SPRN_DABRX,r6 - - /* Restore SPRG3 */ - ld r3,PACA_SPRG3(r13) - mtspr SPRN_SPRG3,r3 - - /* - * Reload DEC. HDEC interrupts were disabled when - * we reloaded the host's LPCR value. - */ - ld r3, HSTATE_DECEXP(r13) - mftb r4 - subf r4, r4, r3 - mtspr SPRN_DEC, r4 - - /* Reload the host's PMU registers */ - ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ - lbz r4, LPPACA_PMCINUSE(r3) - cmpwi r4, 0 - beq 23f /* skip if not */ - lwz r3, HSTATE_PMC(r13) - lwz r4, HSTATE_PMC + 4(r13) - lwz r5, HSTATE_PMC + 8(r13) - lwz r6, HSTATE_PMC + 12(r13) - lwz r8, HSTATE_PMC + 16(r13) - lwz r9, HSTATE_PMC + 20(r13) -BEGIN_FTR_SECTION - lwz r10, HSTATE_PMC + 24(r13) - lwz r11, HSTATE_PMC + 28(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r4 - mtspr SPRN_PMC3, r5 - mtspr SPRN_PMC4, r6 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 -BEGIN_FTR_SECTION - mtspr SPRN_PMC7, r10 - mtspr SPRN_PMC8, r11 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - ld r3, HSTATE_MMCR(r13) - ld r4, HSTATE_MMCR + 8(r13) - ld r5, HSTATE_MMCR + 16(r13) - mtspr SPRN_MMCR1, r4 - mtspr SPRN_MMCRA, r5 - mtspr SPRN_MMCR0, r3 - isync -23: - /* - * For external and machine check interrupts, we need - * to call the Linux handler to process the interrupt. - * We do that by jumping to absolute address 0x500 for - * external interrupts, or the machine_check_fwnmi label - * for machine checks (since firmware might have patched - * the vector area at 0x200). The [h]rfid at the end of the - * handler will return to the book3s_hv_interrupts.S code. - * For other interrupts we do the rfid to get back - * to the book3s_hv_interrupts.S code here. - */ - ld r8, HSTATE_VMHANDLER(r13) - ld r7, HSTATE_HOST_MSR(r13) - - cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL -BEGIN_FTR_SECTION - beq 11f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* RFI into the highmem handler, or branch to interrupt handler */ - mfmsr r6 - li r0, MSR_RI - andc r6, r6, r0 - mtmsrd r6, 1 /* Clear RI in MSR */ - mtsrr0 r8 - mtsrr1 r7 - beqa 0x500 /* external interrupt (PPC970) */ - beq cr1, 13f /* machine check */ - RFI - - /* On POWER7, we have external interrupts set to use HSRR0/1 */ -11: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0x500 - -13: b machine_check_fwnmi + .rept SLB_NUM_BOLTED + ld r5,SLBSHADOW_SAVEAREA(r11) + ld r6,SLBSHADOW_SAVEAREA+8(r11) + andis. r7,r5,SLB_ESID_V@h + beq 1f + slbmte r6,r5 +1: addi r11,r11,16 + .endr + b 22b /* * Check whether an HDSI is an HPTE not found fault or something else. @@ -1333,7 +1416,7 @@ fast_interrupt_c_return: stw r8, VCPU_LAST_INST(r9) /* Unset guest mode. */ - li r0, KVM_GUEST_MODE_NONE + li r0, KVM_GUEST_MODE_HOST_HV stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont @@ -1701,67 +1784,70 @@ machine_check_realmode: rotldi r11, r11, 63 b fast_interrupt_c_return -secondary_too_late: - ld r5,HSTATE_KVM_VCORE(r13) - HMT_LOW -13: lbz r3,VCORE_IN_GUEST(r5) - cmpwi r3,0 - bne 13b - HMT_MEDIUM - ld r11,PACA_SLBSHADOWPTR(r13) - - .rept SLB_NUM_BOLTED - ld r5,SLBSHADOW_SAVEAREA(r11) - ld r6,SLBSHADOW_SAVEAREA+8(r11) - andis. r7,r5,SLB_ESID_V@h - beq 1f - slbmte r6,r5 -1: addi r11,r11,16 - .endr +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + */ +kvmppc_read_intr: + /* see if a host IPI is pending */ + li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne 1f -secondary_nap: - /* Clear our vcpu pointer so we don't come back in early */ - li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) - lwsync - /* Clear any pending IPI - assume we're a secondary thread */ - ld r5, HSTATE_XICS_PHYS(r13) + /* Now read the interrupt from the ICP */ + ld r6, HSTATE_XICS_PHYS(r13) li r7, XICS_XIRR - lwzcix r3, r5, r7 /* ack any pending interrupt */ - rlwinm. r0, r3, 0, 0xffffff /* any pending? */ - beq 37f + cmpdi r6, 0 + beq- 1f + lwzcix r0, r6, r7 + rlwinm. r3, r0, 0, 0xffffff sync - li r0, 0xff - li r6, XICS_MFRR - stbcix r0, r5, r6 /* clear the IPI */ - stwcix r3, r5, r7 /* EOI it */ -37: sync + beq 1f /* if nothing pending in the ICP */ - /* increment the nap count and then go to nap mode */ - ld r4, HSTATE_KVM_VCORE(r13) - addi r4, r4, VCORE_NAP_COUNT - lwsync /* make previous updates visible */ -51: lwarx r3, 0, r4 - addi r3, r3, 1 - stwcx. r3, 0, r4 - bne 51b + /* We found something in the ICP... + * + * If it's not an IPI, stash it in the PACA and return to + * the host, we don't (yet) handle directing real external + * interrupts directly to the guest + */ + cmpwi r3, XICS_IPI /* if there is, is it an IPI? */ + li r3, 1 + bne 42f -kvm_no_guest: - li r0, KVM_HWTHREAD_IN_NAP - stb r0, HSTATE_HWTHREAD_STATE(r13) + /* It's an IPI, clear the MFRR and EOI it */ + li r3, 0xff + li r8, XICS_MFRR + stbcix r3, r6, r8 /* clear the IPI */ + stwcix r0, r6, r7 /* EOI it */ + sync - li r3, LPCR_PECE0 - mfspr r4, SPRN_LPCR - rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 - mtspr SPRN_LPCR, r4 - isync - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b - nap - b . + /* We need to re-check host IPI now in case it got set in the + * meantime. If it's clear, we bounce the interrupt to the + * guest + */ + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne- 43f + + /* OK, it's an IPI for us */ + li r3, -1 +1: blr + +42: /* It's not an IPI and it's for the host, stash it in the PACA + * before exit, it will be picked up by the host ICP driver + */ + stw r0, HSTATE_SAVED_XIRR(r13) + b 1b + +43: /* We raced with the host, we need to resend that IPI, bummer */ + li r0, IPI_PRIORITY + stbcix r0, r6, r8 /* set the IPI */ + sync + b 1b /* * Save away FP, VMX and VSX registers. @@ -1879,3 +1965,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) lwz r7,VCPU_VRSAVE(r4) mtspr SPRN_VRSAVE,r7 blr + +/* + * We come here if we get any exception or interrupt while we are + * executing host real mode code while in guest MMU context. + * For now just spin, but we should do something better. + */ +kvmppc_bad_host_intr: + b . diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 17cfae5497a..f4dd041c14e 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -26,8 +26,12 @@ #if defined(CONFIG_PPC_BOOK3S_64) #define FUNC(name) GLUE(.,name) +#define GET_SHADOW_VCPU(reg) addi reg, r13, PACA_SVCPU + #elif defined(CONFIG_PPC_BOOK3S_32) #define FUNC(name) name +#define GET_SHADOW_VCPU(reg) lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) + #endif /* CONFIG_PPC_BOOK3S_XX */ #define VCPU_LOAD_NVGPRS(vcpu) \ @@ -87,8 +91,14 @@ kvm_start_entry: VCPU_LOAD_NVGPRS(r4) kvm_start_lightweight: + /* Copy registers into shadow vcpu so we can access them in real mode */ + GET_SHADOW_VCPU(r3) + bl FUNC(kvmppc_copy_to_svcpu) + nop + REST_GPR(4, r1) #ifdef CONFIG_PPC_BOOK3S_64 + /* Get the dcbz32 flag */ PPC_LL r3, VCPU_HFLAGS(r4) rldicl r3, r3, 0, 63 /* r3 &= 1 */ stb r3, HSTATE_RESTORE_HID5(r13) @@ -111,9 +121,6 @@ kvm_start_lightweight: * */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * @@ -125,18 +132,31 @@ kvmppc_handler_highmem: * */ - /* R7 = vcpu */ - PPC_LL r7, GPR4(r1) + /* Transfer reg values from shadow vcpu back to vcpu struct */ + /* On 64-bit, interrupts are still off at this point */ + PPC_LL r3, GPR4(r1) /* vcpu pointer */ + GET_SHADOW_VCPU(r4) + bl FUNC(kvmppc_copy_from_svcpu) + nop #ifdef CONFIG_PPC_BOOK3S_64 + /* Re-enable interrupts */ + ld r3, HSTATE_HOST_MSR(r13) + ori r3, r3, MSR_EE + MTMSR_EERI(r3) + /* * Reload kernel SPRG3 value. * No need to save guest value as usermode can't modify SPRG3. */ ld r3, PACA_SPRG3(r13) mtspr SPRN_SPRG3, r3 + #endif /* CONFIG_PPC_BOOK3S_64 */ + /* R7 = vcpu */ + PPC_LL r7, GPR4(r1) + PPC_STL r14, VCPU_GPR(R14)(r7) PPC_STL r15, VCPU_GPR(R15)(r7) PPC_STL r16, VCPU_GPR(R16)(r7) @@ -161,7 +181,7 @@ kvmppc_handler_highmem: /* Restore r3 (kvm_run) and r4 (vcpu) */ REST_2GPRS(3, r1) - bl FUNC(kvmppc_handle_exit) + bl FUNC(kvmppc_handle_exit_pr) /* If RESUME_GUEST, get back in the loop */ cmpwi r3, RESUME_GUEST diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index da8b13c4b77..5a1ab1250a0 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -28,7 +28,7 @@ #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 @@ -56,6 +56,14 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) HPTEG_HASH_BITS_VPTE_LONG); } +#ifdef CONFIG_PPC_BOOK3S_64 +static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage) +{ + return hash_64((vpage & 0xffffffff0ULL) >> 4, + HPTEG_HASH_BITS_VPTE_64K); +} +#endif + void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { u64 index; @@ -83,6 +91,15 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_add_head_rcu(&pte->list_vpte_long, &vcpu3s->hpte_hash_vpte_long[index]); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Add to vPTE_64k list */ + index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage); + hlist_add_head_rcu(&pte->list_vpte_64k, + &vcpu3s->hpte_hash_vpte_64k[index]); +#endif + + vcpu3s->hpte_cache_count++; + spin_unlock(&vcpu3s->mmu_lock); } @@ -113,10 +130,13 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_del_init_rcu(&pte->list_pte_long); hlist_del_init_rcu(&pte->list_vpte); hlist_del_init_rcu(&pte->list_vpte_long); +#ifdef CONFIG_PPC_BOOK3S_64 + hlist_del_init_rcu(&pte->list_vpte_64k); +#endif + vcpu3s->hpte_cache_count--; spin_unlock(&vcpu3s->mmu_lock); - vcpu3s->hpte_cache_count--; call_rcu(&pte->rcu_head, free_pte_rcu); } @@ -219,6 +239,29 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) rcu_read_unlock(); } +#ifdef CONFIG_PPC_BOOK3S_64 +/* Flush with mask 0xffffffff0 */ +static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + u64 vp_mask = 0xffffffff0ULL; + + list = &vcpu3s->hpte_hash_vpte_64k[ + kvmppc_mmu_hash_vpte_64k(guest_vp)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_vpte_64k) + if ((pte->pte.vpage & vp_mask) == guest_vp) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} +#endif + /* Flush with mask 0xffffff000 */ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) { @@ -249,6 +292,11 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) case 0xfffffffffULL: kvmppc_mmu_pte_vflush_short(vcpu, guest_vp); break; +#ifdef CONFIG_PPC_BOOK3S_64 + case 0xffffffff0ULL: + kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp); + break; +#endif case 0xffffff000ULL: kvmppc_mmu_pte_vflush_long(vcpu, guest_vp); break; @@ -285,15 +333,19 @@ struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; - pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); - vcpu3s->hpte_cache_count++; - if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM) kvmppc_mmu_pte_flush_all(vcpu); + pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); + return pte; } +void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte) +{ + kmem_cache_free(hpte_cache, pte); +} + void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu) { kvmppc_mmu_pte_flush(vcpu, 0, 0); @@ -320,6 +372,10 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); +#ifdef CONFIG_PPC_BOOK3S_64 + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k)); +#endif spin_lock_init(&vcpu3s->mmu_lock); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c0b48f96a91..fe14ca3dd17 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -40,8 +40,12 @@ #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/module.h> -#include "trace.h" +#include "book3s.h" + +#define CREATE_TRACE_POINTS +#include "trace_pr.h" /* #define EXIT_DEBUG */ /* #define DEBUG_EXT */ @@ -56,29 +60,25 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define HW_PAGE_SIZE PAGE_SIZE #endif -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_PPC_BOOK3S_64 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); - memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; svcpu_put(svcpu); #endif vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 - current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; + current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; #endif } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PPC_BOOK3S_64 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); - memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; svcpu_put(svcpu); #endif @@ -87,7 +87,61 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +/* Copy data needed by real-mode code from vcpu to shadow vcpu */ +void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, + struct kvm_vcpu *vcpu) +{ + svcpu->gpr[0] = vcpu->arch.gpr[0]; + svcpu->gpr[1] = vcpu->arch.gpr[1]; + svcpu->gpr[2] = vcpu->arch.gpr[2]; + svcpu->gpr[3] = vcpu->arch.gpr[3]; + svcpu->gpr[4] = vcpu->arch.gpr[4]; + svcpu->gpr[5] = vcpu->arch.gpr[5]; + svcpu->gpr[6] = vcpu->arch.gpr[6]; + svcpu->gpr[7] = vcpu->arch.gpr[7]; + svcpu->gpr[8] = vcpu->arch.gpr[8]; + svcpu->gpr[9] = vcpu->arch.gpr[9]; + svcpu->gpr[10] = vcpu->arch.gpr[10]; + svcpu->gpr[11] = vcpu->arch.gpr[11]; + svcpu->gpr[12] = vcpu->arch.gpr[12]; + svcpu->gpr[13] = vcpu->arch.gpr[13]; + svcpu->cr = vcpu->arch.cr; + svcpu->xer = vcpu->arch.xer; + svcpu->ctr = vcpu->arch.ctr; + svcpu->lr = vcpu->arch.lr; + svcpu->pc = vcpu->arch.pc; +} + +/* Copy data touched by real-mode code from shadow vcpu back to vcpu */ +void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, + struct kvmppc_book3s_shadow_vcpu *svcpu) +{ + vcpu->arch.gpr[0] = svcpu->gpr[0]; + vcpu->arch.gpr[1] = svcpu->gpr[1]; + vcpu->arch.gpr[2] = svcpu->gpr[2]; + vcpu->arch.gpr[3] = svcpu->gpr[3]; + vcpu->arch.gpr[4] = svcpu->gpr[4]; + vcpu->arch.gpr[5] = svcpu->gpr[5]; + vcpu->arch.gpr[6] = svcpu->gpr[6]; + vcpu->arch.gpr[7] = svcpu->gpr[7]; + vcpu->arch.gpr[8] = svcpu->gpr[8]; + vcpu->arch.gpr[9] = svcpu->gpr[9]; + vcpu->arch.gpr[10] = svcpu->gpr[10]; + vcpu->arch.gpr[11] = svcpu->gpr[11]; + vcpu->arch.gpr[12] = svcpu->gpr[12]; + vcpu->arch.gpr[13] = svcpu->gpr[13]; + vcpu->arch.cr = svcpu->cr; + vcpu->arch.xer = svcpu->xer; + vcpu->arch.ctr = svcpu->ctr; + vcpu->arch.lr = svcpu->lr; + vcpu->arch.pc = svcpu->pc; + vcpu->arch.shadow_srr1 = svcpu->shadow_srr1; + vcpu->arch.fault_dar = svcpu->fault_dar; + vcpu->arch.fault_dsisr = svcpu->fault_dsisr; + vcpu->arch.last_inst = svcpu->last_inst; +} + +static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) { int r = 1; /* Indicate we want to get back into the guest */ @@ -100,44 +154,69 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) } /************* MMU Notifiers *************/ +static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + long i; + struct kvm_vcpu *vcpu; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, + gfn_end << PAGE_SHIFT); + } +} -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) { trace_kvm_unmap_hva(hva); - /* - * Flush all shadow tlb entries everywhere. This is slow, but - * we are 100% sure that we catch the to be unmapped page - */ - kvm_flush_remote_tlbs(kvm); + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, + unsigned long end) { - /* kvm_unmap_hva flushes everything anyways */ - kvm_unmap_hva(kvm, start); + do_kvm_unmap_hva(kvm, start, end); return 0; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva) { /* XXX could be more clever ;) */ return 0; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) { /* XXX could be more clever ;) */ return 0; } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) { /* The page will get remapped properly on its next fault */ - kvm_unmap_hva(kvm, hva); + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); } /*****************************************/ @@ -159,7 +238,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) vcpu->arch.shadow_msr = smsr; } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) { ulong old_msr = vcpu->arch.shared->msr; @@ -219,7 +298,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) { u32 host_pvr; @@ -256,6 +335,23 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); + /* + * If they're asking for POWER6 or later, set the flag + * indicating that we can do multiple large page sizes + * and 1TB segments. + * Also set the flag that indicates that tlbie has the large + * page bit in the RB operand instead of the instruction. + */ + switch (PVR_VER(pvr)) { + case PVR_POWER6: + case PVR_POWER7: + case PVR_POWER7p: + case PVR_POWER8: + vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | + BOOK3S_HFLAG_NEW_TLBIE; + break; + } + #ifdef CONFIG_PPC_BOOK3S_32 /* 32 bit Book3S always has 32 byte dcbz */ vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; @@ -334,6 +430,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, ulong eaddr, int vec) { bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); + bool iswrite = false; int r = RESUME_GUEST; int relocated; int page_found = 0; @@ -344,10 +441,12 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 vsid; relocated = data ? dr : ir; + if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE)) + iswrite = true; /* Resolve real address if translation turned on */ if (relocated) { - page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); + page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite); } else { pte.may_execute = true; pte.may_read = true; @@ -355,6 +454,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.raddr = eaddr & KVM_PAM; pte.eaddr = eaddr; pte.vpage = eaddr >> 12; + pte.page_size = MMU_PAGE_64K; } switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { @@ -388,22 +488,18 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = svcpu->fault_dsisr; + vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr; vcpu->arch.shared->msr |= - (svcpu->shadow_srr1 & 0x00000000f8000000ULL); - svcpu_put(svcpu); + vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL; kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE; + vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE; vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->msr |= - svcpu->shadow_srr1 & 0x00000000f8000000ULL; - svcpu_put(svcpu); + vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL; kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -411,12 +507,20 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); } else if (!is_mmio && kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { + if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { + /* + * There is already a host HPTE there, presumably + * a read-only one for a page the guest thinks + * is writable, so get rid of it first. + */ + kvmppc_mmu_unmap_page(vcpu, &pte); + } /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte); + kvmppc_mmu_map_page(vcpu, &pte, iswrite); if (data) vcpu->stat.sp_storage++; else if (vcpu->arch.mmu.is_dcbz32(vcpu) && - (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) kvmppc_patch_dcbz(vcpu, &pte); } else { /* MMIO */ @@ -619,13 +723,15 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu) if (lost_ext & MSR_FP) kvmppc_load_up_fpu(); +#ifdef CONFIG_ALTIVEC if (lost_ext & MSR_VEC) kvmppc_load_up_altivec(); +#endif current->thread.regs->msr |= lost_ext; } -int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int exit_nr) +int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) { int r = RESUME_HOST; int s; @@ -643,25 +749,32 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong shadow_srr1 = svcpu->shadow_srr1; + ulong shadow_srr1 = vcpu->arch.shadow_srr1; vcpu->stat.pf_instruc++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); - r = RESUME_GUEST; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]; svcpu_put(svcpu); - break; + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + r = RESUME_GUEST; + break; + } } #endif - svcpu_put(svcpu); /* only care about PTEG not found errors, but leave NX alone */ if (shadow_srr1 & 0x40000000) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { @@ -682,25 +795,36 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_DATA_STORAGE: { ulong dar = kvmppc_get_fault_dar(vcpu); - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 fault_dsisr = svcpu->fault_dsisr; + u32 fault_dsisr = vcpu->arch.fault_dsisr; vcpu->stat.pf_storage++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, dar); - r = RESUME_GUEST; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[dar >> SID_SHIFT]; svcpu_put(svcpu); - break; + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, dar); + r = RESUME_GUEST; + break; + } } #endif - svcpu_put(svcpu); - /* The only case we need to handle is missing shadow PTEs */ - if (fault_dsisr & DSISR_NOHPTE) { + /* + * We need to handle missing shadow PTEs, and + * protection faults due to us mapping a page read-only + * when the guest thinks it is writable. + */ + if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } else { vcpu->arch.shared->dar = dar; vcpu->arch.shared->dsisr = fault_dsisr; @@ -743,13 +867,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_H_EMUL_ASSIST: { enum emulation_result er; - struct kvmppc_book3s_shadow_vcpu *svcpu; ulong flags; program_interrupt: - svcpu = svcpu_get(vcpu); - flags = svcpu->shadow_srr1 & 0x1f0000ull; - svcpu_put(svcpu); + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; if (vcpu->arch.shared->msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -798,7 +919,7 @@ program_interrupt: ulong cmd = kvmppc_get_gpr(vcpu, 3); int i; -#ifdef CONFIG_KVM_BOOK3S_64_PR +#ifdef CONFIG_PPC_BOOK3S_64 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { r = RESUME_GUEST; break; @@ -881,9 +1002,7 @@ program_interrupt: break; default: { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong shadow_srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); + ulong shadow_srr1 = vcpu->arch.shadow_srr1; /* Ugh - bork here! What did we get? */ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); @@ -920,8 +1039,8 @@ program_interrupt: return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; @@ -947,13 +1066,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_pr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { @@ -983,7 +1102,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; @@ -1012,7 +1132,8 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; @@ -1042,28 +1163,30 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_core_check_processor_compat(void) -{ - return 0; -} - -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; struct kvm_vcpu *vcpu; int err = -ENOMEM; unsigned long p; - vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); - if (!vcpu_book3s) + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) goto out; - vcpu_book3s->shadow_vcpu = - kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); - if (!vcpu_book3s->shadow_vcpu) + vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); + if (!vcpu_book3s) goto free_vcpu; + vcpu->arch.book3s = vcpu_book3s; + +#ifdef CONFIG_KVM_BOOK3S_32 + vcpu->arch.shadow_vcpu = + kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL); + if (!vcpu->arch.shadow_vcpu) + goto free_vcpu3s; +#endif - vcpu = &vcpu_book3s->vcpu; err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_shadow_vcpu; @@ -1076,13 +1199,19 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096); #ifdef CONFIG_PPC_BOOK3S_64 - /* default to book3s_64 (970fx) */ + /* + * Default to the same as the host if we're on sufficiently + * recent machine that we have 1TB segments; + * otherwise default to PPC970FX. + */ vcpu->arch.pvr = 0x3C0301; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + vcpu->arch.pvr = mfspr(SPRN_PVR); #else /* default to book3s_32 (750) */ vcpu->arch.pvr = 0x84202; #endif - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; vcpu->arch.shadow_msr = MSR_USER64; @@ -1096,24 +1225,31 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) uninit_vcpu: kvm_vcpu_uninit(vcpu); free_shadow_vcpu: - kfree(vcpu_book3s->shadow_vcpu); -free_vcpu: +#ifdef CONFIG_KVM_BOOK3S_32 + kfree(vcpu->arch.shadow_vcpu); +free_vcpu3s: +#endif vfree(vcpu_book3s); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); kvm_vcpu_uninit(vcpu); - kfree(vcpu_book3s->shadow_vcpu); +#ifdef CONFIG_KVM_BOOK3S_32 + kfree(vcpu->arch.shadow_vcpu); +#endif vfree(vcpu_book3s); + kmem_cache_free(kvm_vcpu_cache, vcpu); } -int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; struct thread_fp_state fp; @@ -1216,8 +1352,8 @@ out: /* * Get (and clear) the dirty memory log for a memory slot. */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, + struct kvm_dirty_log *log) { struct kvm_memory_slot *memslot; struct kvm_vcpu *vcpu; @@ -1252,67 +1388,100 @@ out: return r; } -#ifdef CONFIG_PPC64 -int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - info->flags = KVM_PPC_1T_SEGMENTS; - - /* SLB is always 64 entries */ - info->slb_size = 64; - - /* Standard 4k base page size segment */ - info->sps[0].page_shift = 12; - info->sps[0].slb_enc = 0; - info->sps[0].enc[0].page_shift = 12; - info->sps[0].enc[0].pte_enc = 0; - - /* Standard 16M large page size segment */ - info->sps[1].page_shift = 24; - info->sps[1].slb_enc = SLB_VSID_L; - info->sps[1].enc[0].page_shift = 24; - info->sps[1].enc[0].pte_enc = 0; + return; +} +static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ return 0; } -#endif /* CONFIG_PPC64 */ -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) { + return; } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) +static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) { - return 0; + return; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) +static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + +#ifdef CONFIG_PPC64 +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { -} + long int i; + struct kvm_vcpu *vcpu; + + info->flags = 0; + + /* SLB is always 64 entries */ + info->slb_size = 64; + + /* Standard 4k base page size segment */ + info->sps[0].page_shift = 12; + info->sps[0].slb_enc = 0; + info->sps[0].enc[0].page_shift = 12; + info->sps[0].enc[0].pte_enc = 0; + + /* + * 64k large page size. + * We only want to put this in if the CPUs we're emulating + * support it, but unfortunately we don't have a vcpu easily + * to hand here to test. Just pick the first vcpu, and if + * that doesn't exist yet, report the minimum capability, + * i.e., no 64k pages. + * 1T segment support goes along with 64k pages. + */ + i = 1; + vcpu = kvm_get_vcpu(kvm, 0); + if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + info->flags = KVM_PPC_1T_SEGMENTS; + info->sps[i].page_shift = 16; + info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01; + info->sps[i].enc[0].page_shift = 16; + info->sps[i].enc[0].pte_enc = 1; + ++i; + } + + /* Standard 16M large page size segment */ + info->sps[i].page_shift = 24; + info->sps[i].slb_enc = SLB_VSID_L; + info->sps[i].enc[0].page_shift = 24; + info->sps[i].enc[0].pte_enc = 0; -void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) + return 0; +} +#else +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { + /* We should not get called */ + BUG(); } +#endif /* CONFIG_PPC64 */ static unsigned int kvm_global_user_count = 0; static DEFINE_SPINLOCK(kvm_global_user_count_lock); -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_pr(struct kvm *kvm) { -#ifdef CONFIG_PPC64 - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - INIT_LIST_HEAD(&kvm->arch.rtas_tokens); -#endif + mutex_init(&kvm->arch.hpt_mutex); if (firmware_has_feature(FW_FEATURE_SET_MODE)) { spin_lock(&kvm_global_user_count_lock); @@ -1323,7 +1492,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) { #ifdef CONFIG_PPC64 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); @@ -1338,26 +1507,81 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) } } -static int kvmppc_book3s_init(void) +static int kvmppc_core_check_processor_compat_pr(void) { - int r; + /* we are always compatible */ + return 0; +} - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, - THIS_MODULE); +static long kvm_arch_vm_ioctl_pr(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -ENOTTY; +} - if (r) +static struct kvmppc_ops kvm_ops_pr = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr, + .get_one_reg = kvmppc_get_one_reg_pr, + .set_one_reg = kvmppc_set_one_reg_pr, + .vcpu_load = kvmppc_core_vcpu_load_pr, + .vcpu_put = kvmppc_core_vcpu_put_pr, + .set_msr = kvmppc_set_msr_pr, + .vcpu_run = kvmppc_vcpu_run_pr, + .vcpu_create = kvmppc_core_vcpu_create_pr, + .vcpu_free = kvmppc_core_vcpu_free_pr, + .check_requests = kvmppc_core_check_requests_pr, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr, + .flush_memslot = kvmppc_core_flush_memslot_pr, + .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, + .commit_memory_region = kvmppc_core_commit_memory_region_pr, + .unmap_hva = kvm_unmap_hva_pr, + .unmap_hva_range = kvm_unmap_hva_range_pr, + .age_hva = kvm_age_hva_pr, + .test_age_hva = kvm_test_age_hva_pr, + .set_spte_hva = kvm_set_spte_hva_pr, + .mmu_destroy = kvmppc_mmu_destroy_pr, + .free_memslot = kvmppc_core_free_memslot_pr, + .create_memslot = kvmppc_core_create_memslot_pr, + .init_vm = kvmppc_core_init_vm_pr, + .destroy_vm = kvmppc_core_destroy_vm_pr, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr, + .emulate_op = kvmppc_core_emulate_op_pr, + .emulate_mtspr = kvmppc_core_emulate_mtspr_pr, + .emulate_mfspr = kvmppc_core_emulate_mfspr_pr, + .fast_vcpu_kick = kvm_vcpu_kick, + .arch_vm_ioctl = kvm_arch_vm_ioctl_pr, +}; + + +int kvmppc_book3s_init_pr(void) +{ + int r; + + r = kvmppc_core_check_processor_compat_pr(); + if (r < 0) return r; - r = kvmppc_mmu_hpte_sysinit(); + kvm_ops_pr.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_pr; + r = kvmppc_mmu_hpte_sysinit(); return r; } -static void kvmppc_book3s_exit(void) +void kvmppc_book3s_exit_pr(void) { + kvmppc_pr_ops = NULL; kvmppc_mmu_hpte_sysexit(); - kvm_exit(); } -module_init(kvmppc_book3s_init); -module_exit(kvmppc_book3s_exit); +/* + * We only support separate modules for book3s 64 + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +module_init(kvmppc_book3s_init_pr); +module_exit(kvmppc_book3s_exit_pr); + +MODULE_LICENSE("GPL"); +#endif diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index da0e0bc268b..5efa97b993d 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -21,6 +21,8 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#define HPTE_SIZE 16 /* bytes per HPT entry */ + static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); @@ -40,32 +42,41 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) long pte_index = kvmppc_get_gpr(vcpu, 5); unsigned long pteg[2 * 8]; unsigned long pteg_addr, i, *hpte; + long int ret; + i = pte_index & 7; pte_index &= ~7UL; pteg_addr = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); hpte = pteg; + ret = H_PTEG_FULL; if (likely((flags & H_EXACT) == 0)) { - pte_index &= ~7UL; for (i = 0; ; ++i) { if (i == 8) - return H_PTEG_FULL; + goto done; if ((*hpte & HPTE_V_VALID) == 0) break; hpte += 2; } } else { - i = kvmppc_get_gpr(vcpu, 5) & 7UL; hpte += i * 2; + if (*hpte & HPTE_V_VALID) + goto done; } hpte[0] = kvmppc_get_gpr(vcpu, 6); hpte[1] = kvmppc_get_gpr(vcpu, 7); - copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg)); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + pteg_addr += i * HPTE_SIZE; + copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); kvmppc_set_gpr(vcpu, 4, pte_index | i); + ret = H_SUCCESS; + + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } @@ -77,26 +88,31 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long v = 0, pteg, rb; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || - ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) + goto done; copy_to_user((void __user *)pteg, &v, sizeof(v)); rb = compute_tlbie_rb(pte[0], pte[1], pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + ret = H_SUCCESS; kvmppc_set_gpr(vcpu, 4, pte[0]); kvmppc_set_gpr(vcpu, 5, pte[1]); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + return EMULATE_DONE; } @@ -124,6 +140,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) int paramnr = 4; int ret = H_SUCCESS; + mutex_lock(&vcpu->kvm->arch.hpt_mutex); for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i)); unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1); @@ -172,6 +189,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) } kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; @@ -184,15 +202,16 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long rb, pteg, r, v; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || - ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) + goto done; v = pte[0]; r = pte[1]; @@ -207,8 +226,11 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) rb = compute_tlbie_rb(v, r, pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); copy_to_user((void __user *)pteg, pte, sizeof(pte)); + ret = H_SUCCESS; - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 8f7633e3afb..a38c4c9edab 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -38,32 +38,6 @@ #define FUNC(name) GLUE(.,name) - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_SRR0 - addi r13, r13, 4 - mtspr SPRN_SRR0, r13 - GET_SCRATCH0(r13) - rfid - b . - - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_HSRR0 - addi r13, r13, 4 - mtspr SPRN_HSRR0, r13 - GET_SCRATCH0(r13) - hrfid - b . - #elif defined(CONFIG_PPC_BOOK3S_32) #define FUNC(name) name @@ -179,11 +153,15 @@ _GLOBAL(kvmppc_entry_trampoline) li r6, MSR_IR | MSR_DR andc r6, r5, r6 /* Clear DR and IR in MSR value */ +#ifdef CONFIG_PPC_BOOK3S_32 /* * Set EE in HOST_MSR so that it's enabled when we get into our - * C exit handler function + * C exit handler function. On 64-bit we delay enabling + * interrupts until we have finished transferring stuff + * to or from the PACA. */ ori r5, r5, MSR_EE +#endif mtsrr0 r7 mtsrr1 r6 RFI diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 3219ba89524..cf95cdef73c 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -260,6 +260,7 @@ fail: */ return rc; } +EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall); void kvmppc_rtas_tokens_free(struct kvm *kvm) { diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 1abe4788191..bc50c97751d 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -161,8 +161,8 @@ kvmppc_handler_trampoline_enter_end: .global kvmppc_handler_trampoline_exit kvmppc_handler_trampoline_exit: -.global kvmppc_interrupt -kvmppc_interrupt: +.global kvmppc_interrupt_pr +kvmppc_interrupt_pr: /* Register usage at this point: * diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index a3a5cb8ee7e..02a17dcf161 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -818,7 +818,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) } /* Check for real mode returning too hard */ - if (xics->real_mode) + if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm)) return kvmppc_xics_rm_complete(vcpu, req); switch (req) { @@ -840,6 +840,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) return rc; } +EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); /* -- Initialisation code etc. -- */ @@ -1250,13 +1251,13 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) xics_debugfs_init(xics); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (cpu_has_feature(CPU_FTR_ARCH_206)) { /* Enable real mode support */ xics->real_mode = ENABLE_REALMODE; xics->real_mode_dbg = DEBUG_REALMODE; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 5133199f6cb..53e65a210b9 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -40,7 +40,9 @@ #include "timing.h" #include "booke.h" -#include "trace.h" + +#define CREATE_TRACE_POINTS +#include "trace_booke.h" unsigned long kvmppc_booke_handlers; @@ -133,6 +135,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + /* Synchronize guest's desire to get debug interrupts into shadow MSR */ +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr &= ~MSR_DE; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; +#endif + + /* Force enable debug interrupts when user space wants to debug */ + if (vcpu->guest_debug) { +#ifdef CONFIG_KVM_BOOKE_HV + /* + * Since there is no shadow MSR, sync MSR_DE into the guest + * visible MSR. + */ + vcpu->arch.shared->msr |= MSR_DE; +#else + vcpu->arch.shadow_msr |= MSR_DE; + vcpu->arch.shared->msr &= ~MSR_DE; +#endif + } +} + /* * Helper function for "full" MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -150,6 +175,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) kvmppc_mmu_msr_notify(vcpu, old_msr); kvmppc_vcpu_sync_spe(vcpu); kvmppc_vcpu_sync_fpu(vcpu); + kvmppc_vcpu_sync_debug(vcpu); } static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, @@ -655,6 +681,7 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret, s; + struct thread_struct thread; #ifdef CONFIG_PPC_FPU struct thread_fp_state fp; int fpexc_mode; @@ -695,6 +722,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_load_guest_fp(vcpu); #endif + /* Switch to guest debug context */ + thread.debug = vcpu->arch.shadow_dbg_reg; + switch_booke_debug_regs(&thread); + thread.debug = current->thread.debug; + current->thread.debug = vcpu->arch.shadow_dbg_reg; + kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); @@ -702,6 +735,10 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* No need for kvm_guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ + /* Switch back to user space debug context */ + switch_booke_debug_regs(&thread); + current->thread.debug = thread.debug; + #ifdef CONFIG_PPC_FPU kvmppc_save_guest_fp(vcpu); @@ -757,6 +794,30 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) } } +static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + struct debug_reg *dbg_reg = &(vcpu->arch.shadow_dbg_reg); + u32 dbsr = vcpu->arch.dbsr; + + run->debug.arch.status = 0; + run->debug.arch.address = vcpu->arch.pc; + + if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) { + run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT; + } else { + if (dbsr & (DBSR_DAC1W | DBSR_DAC2W)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE; + else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ; + if (dbsr & (DBSR_DAC1R | DBSR_DAC1W)) + run->debug.arch.address = dbg_reg->dac1; + else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W)) + run->debug.arch.address = dbg_reg->dac2; + } + + return RESUME_HOST; +} + static void kvmppc_fill_pt_regs(struct pt_regs *regs) { ulong r1, ip, msr, lr; @@ -817,6 +878,11 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, case BOOKE_INTERRUPT_CRITICAL: unknown_exception(®s); break; + case BOOKE_INTERRUPT_DEBUG: + /* Save DBSR before preemption is enabled */ + vcpu->arch.dbsr = mfspr(SPRN_DBSR); + kvmppc_clear_dbsr(); + break; } } @@ -1134,18 +1200,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } case BOOKE_INTERRUPT_DEBUG: { - u32 dbsr; - - vcpu->arch.pc = mfspr(SPRN_CSRR0); - - /* clear IAC events in DBSR register */ - dbsr = mfspr(SPRN_DBSR); - dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4; - mtspr(SPRN_DBSR, dbsr); - - run->exit_reason = KVM_EXIT_DEBUG; + r = kvmppc_handle_debug(run, vcpu); + if (r == RESUME_HOST) + run->exit_reason = KVM_EXIT_DEBUG; kvmppc_account_exit(vcpu, DEBUG_EXITS); - r = RESUME_HOST; break; } @@ -1196,7 +1254,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, 0); #ifndef CONFIG_KVM_BOOKE_HV - vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; vcpu->arch.shadow_pid = 1; vcpu->arch.shared->msr = 0; #endif @@ -1358,7 +1416,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { sregs->u.e.features |= KVM_SREGS_E_IVOR; @@ -1378,6 +1436,7 @@ void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + return 0; } int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -1412,8 +1471,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, get_sregs_base(vcpu, sregs); get_sregs_arch206(vcpu, sregs); - kvmppc_core_get_sregs(vcpu, sregs); - return 0; + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, @@ -1432,7 +1490,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (ret < 0) return ret; - return kvmppc_core_set_sregs(vcpu, sregs); + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); } int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) @@ -1440,7 +1498,6 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int r = 0; union kvmppc_one_reg val; int size; - long int i; size = one_reg_size(reg->id); if (size > sizeof(val)) @@ -1448,16 +1505,24 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) switch (reg->id) { case KVM_REG_PPC_IAC1: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac1); + break; case KVM_REG_PPC_IAC2: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac2); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac3); + break; case KVM_REG_PPC_IAC4: - i = reg->id - KVM_REG_PPC_IAC1; - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]); + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac4); break; +#endif case KVM_REG_PPC_DAC1: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac1); + break; case KVM_REG_PPC_DAC2: - i = reg->id - KVM_REG_PPC_DAC1; - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]); + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac2); break; case KVM_REG_PPC_EPR: { u32 epr = get_guest_epr(vcpu); @@ -1476,10 +1541,13 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) val = get_reg_val(reg->id, vcpu->arch.tsr); break; case KVM_REG_PPC_DEBUG_INST: - val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV); + val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV_DEBUG); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); break; default: - r = kvmppc_get_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); break; } @@ -1497,7 +1565,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int r = 0; union kvmppc_one_reg val; int size; - long int i; size = one_reg_size(reg->id); if (size > sizeof(val)) @@ -1508,16 +1575,24 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) switch (reg->id) { case KVM_REG_PPC_IAC1: + vcpu->arch.dbg_reg.iac1 = set_reg_val(reg->id, val); + break; case KVM_REG_PPC_IAC2: + vcpu->arch.dbg_reg.iac2 = set_reg_val(reg->id, val); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: + vcpu->arch.dbg_reg.iac3 = set_reg_val(reg->id, val); + break; case KVM_REG_PPC_IAC4: - i = reg->id - KVM_REG_PPC_IAC1; - vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.iac4 = set_reg_val(reg->id, val); break; +#endif case KVM_REG_PPC_DAC1: + vcpu->arch.dbg_reg.dac1 = set_reg_val(reg->id, val); + break; case KVM_REG_PPC_DAC2: - i = reg->id - KVM_REG_PPC_DAC1; - vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.dac2 = set_reg_val(reg->id, val); break; case KVM_REG_PPC_EPR: { u32 new_epr = set_reg_val(reg->id, val); @@ -1551,20 +1626,17 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) kvmppc_set_tcr(vcpu, tcr); break; } + case KVM_REG_PPC_VRSAVE: + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; default: - r = kvmppc_set_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); break; } return r; } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; @@ -1589,12 +1661,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return -ENOTSUPP; } -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) { return 0; @@ -1670,6 +1742,157 @@ void kvmppc_decrementer_func(unsigned long data) kvmppc_set_tsr_bits(vcpu, TSR_DIS); } +static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg, + uint64_t addr, int index) +{ + switch (index) { + case 0: + dbg_reg->dbcr0 |= DBCR0_IAC1; + dbg_reg->iac1 = addr; + break; + case 1: + dbg_reg->dbcr0 |= DBCR0_IAC2; + dbg_reg->iac2 = addr; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case 2: + dbg_reg->dbcr0 |= DBCR0_IAC3; + dbg_reg->iac3 = addr; + break; + case 3: + dbg_reg->dbcr0 |= DBCR0_IAC4; + dbg_reg->iac4 = addr; + break; +#endif + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} + +static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr, + int type, int index) +{ + switch (index) { + case 0: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC1R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC1W; + dbg_reg->dac1 = addr; + break; + case 1: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC2R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC2W; + dbg_reg->dac2 = addr; + break; + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} +void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set) +{ + /* XXX: Add similar MSR protection for BookE-PR */ +#ifdef CONFIG_KVM_BOOKE_HV + BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP)); + if (set) { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp |= MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp |= MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp |= MSRP_PMMP; + } else { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp &= ~MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp &= ~MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp &= ~MSRP_PMMP; + } +#endif +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + struct debug_reg *dbg_reg; + int n, b = 0, w = 0; + + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + vcpu->guest_debug = 0; + kvm_guest_protect_msr(vcpu, MSR_DE, false); + return 0; + } + + kvm_guest_protect_msr(vcpu, MSR_DE, true); + vcpu->guest_debug = dbg->control; + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + /* Set DBCR0_EDM in guest visible DBCR0 register. */ + vcpu->arch.dbg_reg.dbcr0 = DBCR0_EDM; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.shadow_dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; + + /* Code below handles only HW breakpoints */ + dbg_reg = &(vcpu->arch.shadow_dbg_reg); + +#ifdef CONFIG_KVM_BOOKE_HV + /* + * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1 + * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0 + */ + dbg_reg->dbcr1 = 0; + dbg_reg->dbcr2 = 0; +#else + /* + * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1 + * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR + * is set. + */ + dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US | + DBCR1_IAC4US; + dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; +#endif + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + return 0; + + for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { + uint64_t addr = dbg->arch.bp[n].addr; + uint32_t type = dbg->arch.bp[n].type; + + if (type == KVMPPC_DEBUG_NONE) + continue; + + if (type & !(KVMPPC_DEBUG_WATCH_READ | + KVMPPC_DEBUG_WATCH_WRITE | + KVMPPC_DEBUG_BREAKPOINT)) + return -EINVAL; + + if (type & KVMPPC_DEBUG_BREAKPOINT) { + /* Setting H/W breakpoint */ + if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) + return -EINVAL; + } else { + /* Setting H/W watchpoint */ + if (kvmppc_booke_add_watchpoint(dbg_reg, addr, + type, w++)) + return -EINVAL; + } + } + + return 0; +} + void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { vcpu->cpu = smp_processor_id(); @@ -1680,6 +1903,44 @@ void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) { current->thread.kvm_vcpu = NULL; vcpu->cpu = -1; + + /* Clear pending debug event in DBSR */ + kvmppc_clear_dbsr(); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return kvm->arch.kvm_ops->init_vm(kvm); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); } int __init kvmppc_booke_init(void) diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 5fd1ba69357..09bfd9bc7cf 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -99,6 +99,30 @@ enum int_class { void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); +extern void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); + /* * Load up guest vcpu FP state if it's needed. * It also set the MSR_FP in thread so that host know @@ -129,4 +153,9 @@ static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) giveup_fpu(current); #endif } + +static inline void kvmppc_clear_dbsr(void) +{ + mtspr(SPRN_DBSR, mfspr(SPRN_DBSR)); +} #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index ce6b73c2961..497b142f651 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -305,7 +305,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu) { kvmppc_booke_vcpu_load(vcpu, cpu); @@ -313,7 +313,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SPE if (vcpu->arch.shadow_msr & MSR_SPE) @@ -367,7 +367,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -388,9 +389,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvmppc_get_sregs_ivor(vcpu, sregs); kvmppc_get_sregs_e500_tlb(vcpu, sregs); + return 0; } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int ret; @@ -425,21 +428,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); return r; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; struct kvm_vcpu *vcpu; @@ -481,7 +485,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -492,15 +496,32 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_e500(struct kvm *kvm) { return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_e500(struct kvm *kvm) { } +static struct kvmppc_ops kvm_ops_e500 = { + .get_sregs = kvmppc_core_get_sregs_e500, + .set_sregs = kvmppc_core_set_sregs_e500, + .get_one_reg = kvmppc_get_one_reg_e500, + .set_one_reg = kvmppc_set_one_reg_e500, + .vcpu_load = kvmppc_core_vcpu_load_e500, + .vcpu_put = kvmppc_core_vcpu_put_e500, + .vcpu_create = kvmppc_core_vcpu_create_e500, + .vcpu_free = kvmppc_core_vcpu_free_e500, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500, + .destroy_vm = kvmppc_core_destroy_vm_e500, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + static int __init kvmppc_e500_init(void) { int r, i; @@ -512,11 +533,11 @@ static int __init kvmppc_e500_init(void) r = kvmppc_core_check_processor_compat(); if (r) - return r; + goto err_out; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; /* copy extra E500 exception handlers */ ivor[0] = mfspr(SPRN_IVOR32); @@ -534,11 +555,19 @@ static int __init kvmppc_e500_init(void) flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + ivor[max_ivor] + handler_len); - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500; + +err_out: + return r; } static void __exit kvmppc_e500_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index c2e5e98453a..4fd9650eb01 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -117,7 +117,7 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) #define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) #define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) #define MAS2_ATTRIB_MASK \ - (MAS2_X0 | MAS2_X1) + (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G) #define MAS3_ATTRIB_MASK \ (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index b10a01243ab..89b7f821f6c 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -26,6 +26,7 @@ #define XOP_TLBRE 946 #define XOP_TLBWE 978 #define XOP_TLBILX 18 +#define XOP_EHPRIV 270 #ifdef CONFIG_KVM_E500MC static int dbell2prio(ulong param) @@ -82,8 +83,28 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) } #endif -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = vcpu->arch.pc; + run->debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated = EMULATE_EXIT_USER; + *advance = 0; + break; + default: + emulated = EMULATE_FAIL; + } + return emulated; +} + +int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int ra = get_ra(inst); @@ -130,6 +151,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_EHPRIV: + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, + advance); + break; + default: emulated = EMULATE_FAIL; } @@ -146,7 +172,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; @@ -237,7 +263,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 6d6f153b6c1..ebca6b88ea5 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -32,7 +32,7 @@ #include <asm/kvm_ppc.h> #include "e500.h" -#include "trace.h" +#include "trace_booke.h" #include "timing.h" #include "e500_mmu_host.h" @@ -536,7 +536,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c65593abae8..ecf2247b13b 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -32,10 +32,11 @@ #include <asm/kvm_ppc.h> #include "e500.h" -#include "trace.h" #include "timing.h" #include "e500_mmu_host.h" +#include "trace_booke.h" + #define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; @@ -253,6 +254,9 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, ref->pfn = pfn; ref->flags |= E500_TLB_VALID; + /* Mark the page accessed */ + kvm_set_pfn_accessed(pfn); + if (tlbe_is_writable(gtlbe)) kvm_set_pfn_dirty(pfn); } diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 19c8379575f..4132cd2fc17 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -110,7 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -147,7 +147,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvmppc_load_guest_fp(vcpu); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) { vcpu->arch.eplc = mfspr(SPRN_EPLC); vcpu->arch.epsc = mfspr(SPRN_EPSC); @@ -204,7 +204,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -224,10 +225,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; - kvmppc_get_sregs_ivor(vcpu, sregs); + return kvmppc_get_sregs_ivor(vcpu, sregs); } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int ret; @@ -260,21 +262,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); return r; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; struct kvm_vcpu *vcpu; @@ -315,7 +318,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -325,7 +328,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) { int lpid; @@ -337,27 +340,52 @@ int kvmppc_core_init_vm(struct kvm *kvm) return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm) { kvmppc_free_lpid(kvm->arch.lpid); } +static struct kvmppc_ops kvm_ops_e500mc = { + .get_sregs = kvmppc_core_get_sregs_e500mc, + .set_sregs = kvmppc_core_set_sregs_e500mc, + .get_one_reg = kvmppc_get_one_reg_e500mc, + .set_one_reg = kvmppc_set_one_reg_e500mc, + .vcpu_load = kvmppc_core_vcpu_load_e500mc, + .vcpu_put = kvmppc_core_vcpu_put_e500mc, + .vcpu_create = kvmppc_core_vcpu_create_e500mc, + .vcpu_free = kvmppc_core_vcpu_free_e500mc, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500mc, + .destroy_vm = kvmppc_core_destroy_vm_e500mc, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + static int __init kvmppc_e500mc_init(void) { int r; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; kvmppc_init_lpid(64); kvmppc_claim_lpid(0); /* host */ - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500mc.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500mc; + +err_out: + return r; } static void __exit kvmppc_e500mc_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 751cd45f65a..2f9a0873b44 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -130,8 +130,8 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_PIR: break; default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, - spr_val); + emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn, + spr_val); if (emulated == EMULATE_FAIL) printk(KERN_INFO "mtspr: unknown spr " "0x%x\n", sprn); @@ -191,8 +191,8 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) spr_val = kvmppc_get_dec(vcpu, get_tb()); break; default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, - &spr_val); + emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn, + &spr_val); if (unlikely(emulated == EMULATE_FAIL)) { printk(KERN_INFO "mfspr: unknown spr " "0x%x\n", sprn); @@ -464,7 +464,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } if (emulated == EMULATE_FAIL) { - emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance); + emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst, + &advance); if (emulated == EMULATE_AGAIN) { advance = 0; } else if (emulated == EMULATE_FAIL) { @@ -483,3 +484,4 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) return emulated; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 07c0106fab7..9ae97686e9f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/module.h> #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> @@ -39,6 +40,12 @@ #define CREATE_TRACE_POINTS #include "trace.h" +struct kvmppc_ops *kvmppc_hv_ops; +EXPORT_SYMBOL_GPL(kvmppc_hv_ops); +struct kvmppc_ops *kvmppc_pr_ops; +EXPORT_SYMBOL_GPL(kvmppc_pr_ops); + + int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !!(v->arch.pending_exceptions) || @@ -50,7 +57,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } -#ifndef CONFIG_KVM_BOOK3S_64_HV /* * Common checks before entering the guest world. Call with interrupts * disabled. @@ -125,7 +131,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) return r; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter); int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { @@ -179,6 +185,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) return r; } +EXPORT_SYMBOL_GPL(kvmppc_kvm_pv); int kvmppc_sanity_check(struct kvm_vcpu *vcpu) { @@ -192,11 +199,9 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled) goto out; -#ifdef CONFIG_KVM_BOOK3S_64_HV /* HV KVM can only do PAPR mode for now */ - if (!vcpu->arch.papr_enabled) + if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm)) goto out; -#endif #ifdef CONFIG_KVM_BOOKE_HV if (!cpu_has_feature(CPU_FTR_EMB_HV)) @@ -209,6 +214,7 @@ out: vcpu->arch.sane = r; return r ? 0 : -EINVAL; } +EXPORT_SYMBOL_GPL(kvmppc_sanity_check); int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -243,6 +249,7 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) return r; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio); int kvm_arch_hardware_enable(void *garbage) { @@ -269,10 +276,35 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - if (type) - return -EINVAL; - + struct kvmppc_ops *kvm_ops = NULL; + /* + * if we have both HV and PR enabled, default is HV + */ + if (type == 0) { + if (kvmppc_hv_ops) + kvm_ops = kvmppc_hv_ops; + else + kvm_ops = kvmppc_pr_ops; + if (!kvm_ops) + goto err_out; + } else if (type == KVM_VM_PPC_HV) { + if (!kvmppc_hv_ops) + goto err_out; + kvm_ops = kvmppc_hv_ops; + } else if (type == KVM_VM_PPC_PR) { + if (!kvmppc_pr_ops) + goto err_out; + kvm_ops = kvmppc_pr_ops; + } else + goto err_out; + + if (kvm_ops->owner && !try_module_get(kvm_ops->owner)) + return -ENOENT; + + kvm->arch.kvm_ops = kvm_ops; return kvmppc_core_init_vm(kvm); +err_out: + return -EINVAL; } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -292,6 +324,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvmppc_core_destroy_vm(kvm); mutex_unlock(&kvm->lock); + + /* drop the module reference */ + module_put(kvm->arch.kvm_ops->owner); } void kvm_arch_sync_events(struct kvm *kvm) @@ -301,6 +336,10 @@ void kvm_arch_sync_events(struct kvm *kvm) int kvm_dev_ioctl_check_extension(long ext) { int r; + /* FIXME!! + * Should some of this be vm ioctl ? is it possible now ? + */ + int hv_enabled = kvmppc_hv_ops ? 1 : 0; switch (ext) { #ifdef CONFIG_BOOKE @@ -320,22 +359,26 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_DEVICE_CTRL: r = 1; break; -#ifndef CONFIG_KVM_BOOK3S_64_HV case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: #endif -#ifdef CONFIG_KVM_MPIC - case KVM_CAP_IRQ_MPIC: -#endif - r = 1; + /* We support this only for PR */ + r = !hv_enabled; break; +#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; #endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: + r = 1; + break; +#endif + #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_PPC_ALLOC_HTAB: @@ -346,32 +389,37 @@ int kvm_dev_ioctl_check_extension(long ext) r = 1; break; #endif /* CONFIG_PPC_BOOK3S_64 */ -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: - r = threads_per_core; + if (hv_enabled) + r = threads_per_core; + else + r = 0; break; case KVM_CAP_PPC_RMA: - r = 1; + r = hv_enabled; /* PPC970 requires an RMA */ - if (cpu_has_feature(CPU_FTR_ARCH_201)) + if (r && cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; #endif case KVM_CAP_SYNC_MMU: -#ifdef CONFIG_KVM_BOOK3S_64_HV - r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (hv_enabled) + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + else + r = 0; #elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) r = 1; #else r = 0; - break; #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV + break; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_HTAB_FD: - r = 1; + r = hv_enabled; break; #endif - break; case KVM_CAP_NR_VCPUS: /* * Recommending a number of CPUs is somewhat arbitrary; we @@ -379,11 +427,10 @@ int kvm_dev_ioctl_check_extension(long ext) * will have secondary threads "offline"), and for other KVM * implementations just count online CPUs. */ -#ifdef CONFIG_KVM_BOOK3S_64_HV - r = num_present_cpus(); -#else - r = num_online_cpus(); -#endif + if (hv_enabled) + r = num_present_cpus(); + else + r = num_online_cpus(); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; @@ -407,15 +454,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - kvmppc_core_free_memslot(free, dont); + kvmppc_core_free_memslot(kvm, free, dont); } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { - return kvmppc_core_create_memslot(slot, npages); + return kvmppc_core_create_memslot(kvm, slot, npages); } void kvm_arch_memslots_updated(struct kvm *kvm) @@ -659,6 +707,7 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_load); /* Same as above, but sign extends */ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, @@ -720,6 +769,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_store); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -1024,52 +1074,12 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); goto out; } -#endif /* CONFIG_PPC_BOOK3S_64 */ - -#ifdef CONFIG_KVM_BOOK3S_64_HV - case KVM_ALLOCATE_RMA: { - struct kvm_allocate_rma rma; - struct kvm *kvm = filp->private_data; - - r = kvm_vm_ioctl_allocate_rma(kvm, &rma); - if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) - r = -EFAULT; - break; - } - - case KVM_PPC_ALLOCATE_HTAB: { - u32 htab_order; - - r = -EFAULT; - if (get_user(htab_order, (u32 __user *)argp)) - break; - r = kvmppc_alloc_reset_hpt(kvm, &htab_order); - if (r) - break; - r = -EFAULT; - if (put_user(htab_order, (u32 __user *)argp)) - break; - r = 0; - break; - } - - case KVM_PPC_GET_HTAB_FD: { - struct kvm_get_htab_fd ghf; - - r = -EFAULT; - if (copy_from_user(&ghf, argp, sizeof(ghf))) - break; - r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); - break; - } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ - -#ifdef CONFIG_PPC_BOOK3S_64 case KVM_PPC_GET_SMMU_INFO: { struct kvm_ppc_smmu_info info; + struct kvm *kvm = filp->private_data; memset(&info, 0, sizeof(info)); - r = kvm_vm_ioctl_get_smmu_info(kvm, &info); + r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info); if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) r = -EFAULT; break; @@ -1080,11 +1090,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } -#endif /* CONFIG_PPC_BOOK3S_64 */ + default: { + struct kvm *kvm = filp->private_data; + r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); + } +#else /* CONFIG_PPC_BOOK3S_64 */ default: r = -ENOTTY; +#endif } - out: return r; } @@ -1106,22 +1120,26 @@ long kvmppc_alloc_lpid(void) return lpid; } +EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); void kvmppc_claim_lpid(long lpid) { set_bit(lpid, lpid_inuse); } +EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); void kvmppc_free_lpid(long lpid) { clear_bit(lpid, lpid_inuse); } +EXPORT_SYMBOL_GPL(kvmppc_free_lpid); void kvmppc_init_lpid(unsigned long nr_lpids_param) { nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); memset(lpid_inuse, 0, sizeof(lpid_inuse)); } +EXPORT_SYMBOL_GPL(kvmppc_init_lpid); int kvm_arch_init(void *opaque) { @@ -1130,4 +1148,5 @@ int kvm_arch_init(void *opaque) void kvm_arch_exit(void) { + } diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index e326489a542..2e0e67ef354 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -31,126 +31,6 @@ TRACE_EVENT(kvm_ppc_instr, __entry->inst, __entry->pc, __entry->emulate) ); -#ifdef CONFIG_PPC_BOOK3S -#define kvm_trace_symbol_exit \ - {0x100, "SYSTEM_RESET"}, \ - {0x200, "MACHINE_CHECK"}, \ - {0x300, "DATA_STORAGE"}, \ - {0x380, "DATA_SEGMENT"}, \ - {0x400, "INST_STORAGE"}, \ - {0x480, "INST_SEGMENT"}, \ - {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ - {0x502, "EXTERNAL_HV"}, \ - {0x600, "ALIGNMENT"}, \ - {0x700, "PROGRAM"}, \ - {0x800, "FP_UNAVAIL"}, \ - {0x900, "DECREMENTER"}, \ - {0x980, "HV_DECREMENTER"}, \ - {0xc00, "SYSCALL"}, \ - {0xd00, "TRACE"}, \ - {0xe00, "H_DATA_STORAGE"}, \ - {0xe20, "H_INST_STORAGE"}, \ - {0xe40, "H_EMUL_ASSIST"}, \ - {0xf00, "PERFMON"}, \ - {0xf20, "ALTIVEC"}, \ - {0xf40, "VSX"} -#else -#define kvm_trace_symbol_exit \ - {0, "CRITICAL"}, \ - {1, "MACHINE_CHECK"}, \ - {2, "DATA_STORAGE"}, \ - {3, "INST_STORAGE"}, \ - {4, "EXTERNAL"}, \ - {5, "ALIGNMENT"}, \ - {6, "PROGRAM"}, \ - {7, "FP_UNAVAIL"}, \ - {8, "SYSCALL"}, \ - {9, "AP_UNAVAIL"}, \ - {10, "DECREMENTER"}, \ - {11, "FIT"}, \ - {12, "WATCHDOG"}, \ - {13, "DTLB_MISS"}, \ - {14, "ITLB_MISS"}, \ - {15, "DEBUG"}, \ - {32, "SPE_UNAVAIL"}, \ - {33, "SPE_FP_DATA"}, \ - {34, "SPE_FP_ROUND"}, \ - {35, "PERFORMANCE_MONITOR"}, \ - {36, "DOORBELL"}, \ - {37, "DOORBELL_CRITICAL"}, \ - {38, "GUEST_DBELL"}, \ - {39, "GUEST_DBELL_CRIT"}, \ - {40, "HV_SYSCALL"}, \ - {41, "HV_PRIV"} -#endif - -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), - TP_ARGS(exit_nr, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, exit_nr ) - __field( unsigned long, pc ) - __field( unsigned long, msr ) - __field( unsigned long, dar ) -#ifdef CONFIG_KVM_BOOK3S_PR - __field( unsigned long, srr1 ) -#endif - __field( unsigned long, last_inst ) - ), - - TP_fast_assign( -#ifdef CONFIG_KVM_BOOK3S_PR - struct kvmppc_book3s_shadow_vcpu *svcpu; -#endif - __entry->exit_nr = exit_nr; - __entry->pc = kvmppc_get_pc(vcpu); - __entry->dar = kvmppc_get_fault_dar(vcpu); - __entry->msr = vcpu->arch.shared->msr; -#ifdef CONFIG_KVM_BOOK3S_PR - svcpu = svcpu_get(vcpu); - __entry->srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); -#endif - __entry->last_inst = vcpu->arch.last_inst; - ), - - TP_printk("exit=%s" - " | pc=0x%lx" - " | msr=0x%lx" - " | dar=0x%lx" -#ifdef CONFIG_KVM_BOOK3S_PR - " | srr1=0x%lx" -#endif - " | last_inst=0x%lx" - , - __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), - __entry->pc, - __entry->msr, - __entry->dar, -#ifdef CONFIG_KVM_BOOK3S_PR - __entry->srr1, -#endif - __entry->last_inst - ) -); - -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - TRACE_EVENT(kvm_stlb_inval, TP_PROTO(unsigned int stlb_index), TP_ARGS(stlb_index), @@ -236,315 +116,6 @@ TRACE_EVENT(kvm_check_requests, __entry->cpu_nr, __entry->requests) ); - -/************************************************************************* - * Book3S trace points * - *************************************************************************/ - -#ifdef CONFIG_KVM_BOOK3S_PR - -TRACE_EVENT(kvm_book3s_reenter, - TP_PROTO(int r, struct kvm_vcpu *vcpu), - TP_ARGS(r, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, r ) - __field( unsigned long, pc ) - ), - - TP_fast_assign( - __entry->r = r; - __entry->pc = kvmppc_get_pc(vcpu); - ), - - TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) -); - -#ifdef CONFIG_PPC_BOOK3S_64 - -TRACE_EVENT(kvm_book3s_64_mmu_map, - TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, - struct kvmppc_pte *orig_pte), - TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), - - TP_STRUCT__entry( - __field( unsigned char, flag_w ) - __field( unsigned char, flag_x ) - __field( unsigned long, eaddr ) - __field( unsigned long, hpteg ) - __field( unsigned long, va ) - __field( unsigned long long, vpage ) - __field( unsigned long, hpaddr ) - ), - - TP_fast_assign( - __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; - __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; - __entry->eaddr = orig_pte->eaddr; - __entry->hpteg = hpteg; - __entry->va = va; - __entry->vpage = orig_pte->vpage; - __entry->hpaddr = hpaddr; - ), - - TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", - __entry->flag_w, __entry->flag_x, __entry->eaddr, - __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) -); - -#endif /* CONFIG_PPC_BOOK3S_64 */ - -TRACE_EVENT(kvm_book3s_mmu_map, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_vpn ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_vpn = pte->host_vpn; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 0x1 : 0); - ), - - TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_vpn, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_invalidate, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_vpn ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_vpn = pte->host_vpn; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 0x1 : 0); - ), - - TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_vpn, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_flush, - TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, - unsigned long long p2), - TP_ARGS(type, vcpu, p1, p2), - - TP_STRUCT__entry( - __field( int, count ) - __field( unsigned long long, p1 ) - __field( unsigned long long, p2 ) - __field( const char *, type ) - ), - - TP_fast_assign( - __entry->count = to_book3s(vcpu)->hpte_cache_count; - __entry->p1 = p1; - __entry->p2 = p2; - __entry->type = type; - ), - - TP_printk("Flush %d %sPTEs: %llx - %llx", - __entry->count, __entry->type, __entry->p1, __entry->p2) -); - -TRACE_EVENT(kvm_book3s_slb_found, - TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), - TP_ARGS(gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned long long, gvsid ) - __field( unsigned long long, hvsid ) - ), - - TP_fast_assign( - __entry->gvsid = gvsid; - __entry->hvsid = hvsid; - ), - - TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) -); - -TRACE_EVENT(kvm_book3s_slb_fail, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), - TP_ARGS(sid_map_mask, gvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, gvsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->gvsid = gvsid; - ), - - TP_printk("%x/%x: %llx", __entry->sid_map_mask, - SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) -); - -TRACE_EVENT(kvm_book3s_slb_map, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, - unsigned long long hvsid), - TP_ARGS(sid_map_mask, gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, guest_vsid ) - __field( unsigned long long, host_vsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->guest_vsid = gvsid; - __entry->host_vsid = hvsid; - ), - - TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, - __entry->guest_vsid, __entry->host_vsid) -); - -TRACE_EVENT(kvm_book3s_slbmte, - TP_PROTO(u64 slb_vsid, u64 slb_esid), - TP_ARGS(slb_vsid, slb_esid), - - TP_STRUCT__entry( - __field( u64, slb_vsid ) - __field( u64, slb_esid ) - ), - - TP_fast_assign( - __entry->slb_vsid = slb_vsid; - __entry->slb_esid = slb_esid; - ), - - TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) -); - -#endif /* CONFIG_PPC_BOOK3S */ - - -/************************************************************************* - * Book3E trace points * - *************************************************************************/ - -#ifdef CONFIG_BOOKE - -TRACE_EVENT(kvm_booke206_stlb_write, - TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), - TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), - - TP_STRUCT__entry( - __field( __u32, mas0 ) - __field( __u32, mas8 ) - __field( __u32, mas1 ) - __field( __u64, mas2 ) - __field( __u64, mas7_3 ) - ), - - TP_fast_assign( - __entry->mas0 = mas0; - __entry->mas8 = mas8; - __entry->mas1 = mas1; - __entry->mas2 = mas2; - __entry->mas7_3 = mas7_3; - ), - - TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", - __entry->mas0, __entry->mas8, __entry->mas1, - __entry->mas2, __entry->mas7_3) -); - -TRACE_EVENT(kvm_booke206_gtlb_write, - TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), - TP_ARGS(mas0, mas1, mas2, mas7_3), - - TP_STRUCT__entry( - __field( __u32, mas0 ) - __field( __u32, mas1 ) - __field( __u64, mas2 ) - __field( __u64, mas7_3 ) - ), - - TP_fast_assign( - __entry->mas0 = mas0; - __entry->mas1 = mas1; - __entry->mas2 = mas2; - __entry->mas7_3 = mas7_3; - ), - - TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", - __entry->mas0, __entry->mas1, - __entry->mas2, __entry->mas7_3) -); - -TRACE_EVENT(kvm_booke206_ref_release, - TP_PROTO(__u64 pfn, __u32 flags), - TP_ARGS(pfn, flags), - - TP_STRUCT__entry( - __field( __u64, pfn ) - __field( __u32, flags ) - ), - - TP_fast_assign( - __entry->pfn = pfn; - __entry->flags = flags; - ), - - TP_printk("pfn=%llx flags=%x", - __entry->pfn, __entry->flags) -); - -TRACE_EVENT(kvm_booke_queue_irqprio, - TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), - TP_ARGS(vcpu, priority), - - TP_STRUCT__entry( - __field( __u32, cpu_nr ) - __field( __u32, priority ) - __field( unsigned long, pending ) - ), - - TP_fast_assign( - __entry->cpu_nr = vcpu->vcpu_id; - __entry->priority = priority; - __entry->pending = vcpu->arch.pending_exceptions; - ), - - TP_printk("vcpu=%x prio=%x pending=%lx", - __entry->cpu_nr, __entry->priority, __entry->pending) -); - -#endif - #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h new file mode 100644 index 00000000000..f7537cf26ce --- /dev/null +++ b/arch/powerpc/kvm/trace_booke.h @@ -0,0 +1,177 @@ +#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_BOOKE_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_booke +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_booke + +#define kvm_trace_symbol_exit \ + {0, "CRITICAL"}, \ + {1, "MACHINE_CHECK"}, \ + {2, "DATA_STORAGE"}, \ + {3, "INST_STORAGE"}, \ + {4, "EXTERNAL"}, \ + {5, "ALIGNMENT"}, \ + {6, "PROGRAM"}, \ + {7, "FP_UNAVAIL"}, \ + {8, "SYSCALL"}, \ + {9, "AP_UNAVAIL"}, \ + {10, "DECREMENTER"}, \ + {11, "FIT"}, \ + {12, "WATCHDOG"}, \ + {13, "DTLB_MISS"}, \ + {14, "ITLB_MISS"}, \ + {15, "DEBUG"}, \ + {32, "SPE_UNAVAIL"}, \ + {33, "SPE_FP_DATA"}, \ + {34, "SPE_FP_ROUND"}, \ + {35, "PERFORMANCE_MONITOR"}, \ + {36, "DOORBELL"}, \ + {37, "DOORBELL_CRITICAL"}, \ + {38, "GUEST_DBELL"}, \ + {39, "GUEST_DBELL_CRIT"}, \ + {40, "HV_SYSCALL"}, \ + {41, "HV_PRIV"} + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_ref_release, + TP_PROTO(__u64 pfn, __u32 flags), + TP_ARGS(pfn, flags), + + TP_STRUCT__entry( + __field( __u64, pfn ) + __field( __u32, flags ) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->flags = flags; + ), + + TP_printk("pfn=%llx flags=%x", + __entry->pfn, __entry->flags) +); + +TRACE_EVENT(kvm_booke_queue_irqprio, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), + TP_ARGS(vcpu, priority), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, priority ) + __field( unsigned long, pending ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->priority = priority; + __entry->pending = vcpu->arch.pending_exceptions; + ), + + TP_printk("vcpu=%x prio=%x pending=%lx", + __entry->cpu_nr, __entry->priority, __entry->pending) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h new file mode 100644 index 00000000000..8b22e474834 --- /dev/null +++ b/arch/powerpc/kvm/trace_pr.h @@ -0,0 +1,297 @@ + +#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_PR_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_pr +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_pr + +#define kvm_trace_symbol_exit \ + {0x100, "SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} + +TRACE_EVENT(kvm_book3s_reenter, + TP_PROTO(int r, struct kvm_vcpu *vcpu), + TP_ARGS(r, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, r ) + __field( unsigned long, pc ) + ), + + TP_fast_assign( + __entry->r = r; + __entry->pc = kvmppc_get_pc(vcpu); + ), + + TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) +); + +#ifdef CONFIG_PPC_BOOK3S_64 + +TRACE_EVENT(kvm_book3s_64_mmu_map, + TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, + struct kvmppc_pte *orig_pte), + TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), + + TP_STRUCT__entry( + __field( unsigned char, flag_w ) + __field( unsigned char, flag_x ) + __field( unsigned long, eaddr ) + __field( unsigned long, hpteg ) + __field( unsigned long, va ) + __field( unsigned long long, vpage ) + __field( unsigned long, hpaddr ) + ), + + TP_fast_assign( + __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; + __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; + __entry->eaddr = orig_pte->eaddr; + __entry->hpteg = hpteg; + __entry->va = va; + __entry->vpage = orig_pte->vpage; + __entry->hpaddr = hpaddr; + ), + + TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", + __entry->flag_w, __entry->flag_x, __entry->eaddr, + __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) +); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +TRACE_EVENT(kvm_book3s_mmu_map, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_invalidate, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_flush, + TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, + unsigned long long p2), + TP_ARGS(type, vcpu, p1, p2), + + TP_STRUCT__entry( + __field( int, count ) + __field( unsigned long long, p1 ) + __field( unsigned long long, p2 ) + __field( const char *, type ) + ), + + TP_fast_assign( + __entry->count = to_book3s(vcpu)->hpte_cache_count; + __entry->p1 = p1; + __entry->p2 = p2; + __entry->type = type; + ), + + TP_printk("Flush %d %sPTEs: %llx - %llx", + __entry->count, __entry->type, __entry->p1, __entry->p2) +); + +TRACE_EVENT(kvm_book3s_slb_found, + TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), + TP_ARGS(gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned long long, gvsid ) + __field( unsigned long long, hvsid ) + ), + + TP_fast_assign( + __entry->gvsid = gvsid; + __entry->hvsid = hvsid; + ), + + TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) +); + +TRACE_EVENT(kvm_book3s_slb_fail, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), + TP_ARGS(sid_map_mask, gvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, gvsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->gvsid = gvsid; + ), + + TP_printk("%x/%x: %llx", __entry->sid_map_mask, + SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) +); + +TRACE_EVENT(kvm_book3s_slb_map, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, + unsigned long long hvsid), + TP_ARGS(sid_map_mask, gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, guest_vsid ) + __field( unsigned long long, host_vsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->guest_vsid = gvsid; + __entry->host_vsid = hvsid; + ), + + TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, + __entry->guest_vsid, __entry->host_vsid) +); + +TRACE_EVENT(kvm_book3s_slbmte, + TP_PROTO(u64 slb_vsid, u64 slb_esid), + TP_ARGS(slb_vsid, slb_esid), + + TP_STRUCT__entry( + __field( u64, slb_vsid ) + __field( u64, slb_esid ) + ), + + TP_fast_assign( + __entry->slb_vsid = slb_vsid; + __entry->slb_esid = slb_esid; + ), + + TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, srr1 ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->srr1 = vcpu->arch.shadow_srr1; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | srr1=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->srr1, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6c856fb8c15..5b960171528 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -121,7 +121,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) ptepage = alloc_pages(flags, 0); if (!ptepage) return NULL; - pgtable_page_ctor(ptepage); + if (!pgtable_page_ctor(ptepage)) { + __free_page(ptepage); + return NULL; + } return ptepage; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 536eec72c0f..9d95786aa80 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -378,6 +378,10 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) __GFP_REPEAT | __GFP_ZERO); if (!page) return NULL; + if (!kernel && !pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } ret = page_address(page); spin_lock(&mm->page_table_lock); @@ -392,9 +396,6 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) } spin_unlock(&mm->page_table_lock); - if (!kernel) - pgtable_page_ctor(page); - return (pte_t *)ret; } diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index fc536f2971c..7553b6a77c6 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -452,7 +452,7 @@ static int kw_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize, */ if (use_irq) { /* Clear completion */ - INIT_COMPLETION(host->complete); + reinit_completion(&host->complete); /* Ack stale interrupts */ kw_write_reg(reg_isr, kw_read_reg(reg_isr)); /* Arm timeout */ @@ -717,7 +717,7 @@ static int pmu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize, return -EINVAL; } - INIT_COMPLETION(comp); + reinit_completion(&comp); req->data[0] = PMU_I2C_CMD; req->reply[0] = 0xff; req->nbytes = sizeof(struct pmu_i2c_hdr) + 1; @@ -748,7 +748,7 @@ static int pmu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize, hdr->bus = PMU_I2C_BUS_STATUS; - INIT_COMPLETION(comp); + reinit_completion(&comp); req->data[0] = PMU_I2C_CMD; req->reply[0] = 0xff; req->nbytes = 2; diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index 5f997e79d57..16a255255d3 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -106,7 +106,7 @@ static int pseries_prepare_late(void) atomic_set(&suspend_data.done, 0); atomic_set(&suspend_data.error, 0); suspend_data.complete = &suspend_work; - INIT_COMPLETION(suspend_work); + reinit_completion(&suspend_work); return 0; } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f75d7e51792..314fced4fc1 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -141,7 +141,6 @@ config S390 select OLD_SIGACTION select OLD_SIGSUSPEND3 select SYSCTL_EXCEPTION_TRACE - select USE_GENERIC_SMP_HELPERS if SMP select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e87ecaa2c56..d5bc3750616 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -38,13 +38,6 @@ struct sca_block { struct sca_entry cpu[64]; } __attribute__((packed)); -#define KVM_NR_PAGE_SIZES 2 -#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8) -#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) -#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) -#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) -#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) - #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -220,7 +213,6 @@ struct kvm_s390_interrupt_info { /* for local_interrupt.action_flags */ #define ACTION_STORE_ON_STOP (1<<0) #define ACTION_STOP_ON_STOP (1<<1) -#define ACTION_RELOADVCPU_ON_STOP (1<<2) struct kvm_s390_local_interrupt { spinlock_t lock; diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3a74d8af0d6..78d967f180f 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -107,14 +107,13 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) { - int ret, idx; + int ret; /* No virtio-ccw notification? Get out quickly. */ if (!vcpu->kvm->arch.css_support || (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) return -EOPNOTSUPP; - idx = srcu_read_lock(&vcpu->kvm->srcu); /* * The layout is as follows: * - gpr 2 contains the subchannel id (passed as addr) @@ -125,7 +124,6 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) vcpu->run->s.regs.gprs[2], 8, &vcpu->run->s.regs.gprs[3], vcpu->run->s.regs.gprs[4]); - srcu_read_unlock(&vcpu->kvm->srcu, idx); /* * Return cookie in gpr 2, but don't overwrite the register if the diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 99d789e8a01..374a439ccc6 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,20 +18,27 @@ #include <asm/uaccess.h> #include "kvm-s390.h" +/* Convert real to absolute address by applying the prefix of the CPU */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gaddr) +{ + unsigned long prefix = vcpu->arch.sie_block->prefix; + if (gaddr < 2 * PAGE_SIZE) + gaddr += prefix; + else if (gaddr >= prefix && gaddr < prefix + 2 * PAGE_SIZE) + gaddr -= prefix; + return gaddr; +} + static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu, void __user *gptr, int prefixing) { - unsigned long prefix = vcpu->arch.sie_block->prefix; unsigned long gaddr = (unsigned long) gptr; unsigned long uaddr; - if (prefixing) { - if (gaddr < 2 * PAGE_SIZE) - gaddr += prefix; - else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE)) - gaddr -= prefix; - } + if (prefixing) + gaddr = kvm_s390_real_to_abs(vcpu, gaddr); uaddr = gmap_fault(gaddr, vcpu->arch.gmap); if (IS_ERR_VALUE(uaddr)) uaddr = -EFAULT; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 5ee56e5acc2..5ddbbde6f65 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -62,12 +62,6 @@ static int handle_stop(struct kvm_vcpu *vcpu) trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits); - if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { - vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; - rc = SIE_INTERCEPT_RERUNVCPU; - vcpu->run->exit_reason = KVM_EXIT_INTR; - } - if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 7f1f7ac5cf7..5f79d2d79ca 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -436,6 +436,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); no_timer: + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); spin_lock(&vcpu->arch.local_int.float_int->lock); spin_lock_bh(&vcpu->arch.local_int.lock); add_wait_queue(&vcpu->wq, &wait); @@ -455,6 +456,8 @@ no_timer: remove_wait_queue(&vcpu->wq, &wait); spin_unlock_bh(&vcpu->arch.local_int.lock); spin_unlock(&vcpu->arch.local_int.float_int->lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ed8064cb5c4..569494e01ec 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -695,9 +695,9 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; } -static int __vcpu_run(struct kvm_vcpu *vcpu) +static int vcpu_pre_run(struct kvm_vcpu *vcpu) { - int rc; + int rc, cpuflags; memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16); @@ -715,28 +715,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return rc; vcpu->arch.sie_block->icptcode = 0; - VCPU_EVENT(vcpu, 6, "entering sie flags %x", - atomic_read(&vcpu->arch.sie_block->cpuflags)); - trace_kvm_s390_sie_enter(vcpu, - atomic_read(&vcpu->arch.sie_block->cpuflags)); + cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); + VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); + trace_kvm_s390_sie_enter(vcpu, cpuflags); - /* - * As PF_VCPU will be used in fault handler, between guest_enter - * and guest_exit should be no uaccess. - */ - preempt_disable(); - kvm_guest_enter(); - preempt_enable(); - rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); - kvm_guest_exit(); + return 0; +} + +static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) +{ + int rc; VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); - if (rc > 0) + if (exit_reason >= 0) { rc = 0; - if (rc < 0) { + } else { if (kvm_is_ucontrol(vcpu->kvm)) { rc = SIE_INTERCEPT_UCONTROL; } else { @@ -747,6 +743,49 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16); + + if (rc == 0) { + if (kvm_is_ucontrol(vcpu->kvm)) + rc = -EOPNOTSUPP; + else + rc = kvm_handle_sie_intercept(vcpu); + } + + return rc; +} + +static int __vcpu_run(struct kvm_vcpu *vcpu) +{ + int rc, exit_reason; + + /* + * We try to hold kvm->srcu during most of vcpu_run (except when run- + * ning the guest), so that memslots (and other stuff) are protected + */ + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + do { + rc = vcpu_pre_run(vcpu); + if (rc) + break; + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* + * As PF_VCPU will be used in fault handler, between + * guest_enter and guest_exit should be no uaccess. + */ + preempt_disable(); + kvm_guest_enter(); + preempt_enable(); + exit_reason = sie64a(vcpu->arch.sie_block, + vcpu->run->s.regs.gprs); + kvm_guest_exit(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + rc = vcpu_post_run(vcpu, exit_reason); + } while (!signal_pending(current) && !rc); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); return rc; } @@ -755,7 +794,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int rc; sigset_t sigsaved; -rerun_vcpu: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -788,19 +826,7 @@ rerun_vcpu: } might_fault(); - - do { - rc = __vcpu_run(vcpu); - if (rc) - break; - if (kvm_is_ucontrol(vcpu->kvm)) - rc = -EOPNOTSUPP; - else - rc = kvm_handle_sie_intercept(vcpu); - } while (!signal_pending(current) && !rc); - - if (rc == SIE_INTERCEPT_RERUNVCPU) - goto rerun_vcpu; + rc = __vcpu_run(vcpu); if (signal_pending(current) && !rc) { kvm_run->exit_reason = KVM_EXIT_INTR; @@ -958,6 +984,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; + int idx; long r; switch (ioctl) { @@ -971,7 +998,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_S390_STORE_STATUS: + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_s390_vcpu_store_status(vcpu, arg); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case KVM_S390_SET_INITIAL_PSW: { psw_t psw; @@ -1067,12 +1096,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index dc99f1ca426..b44912a3294 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -28,8 +28,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); extern unsigned long *vfacilities; /* negativ values are error codes, positive values for internal conditions */ -#define SIE_INTERCEPT_RERUNVCPU (1<<0) -#define SIE_INTERCEPT_UCONTROL (1<<1) +#define SIE_INTERCEPT_UCONTROL (1<<0) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ @@ -91,8 +90,10 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2) { - *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20; - *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; + if (r1) + *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20; + if (r2) + *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; } static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 59200ee275e..2440602e6df 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -30,6 +30,38 @@ #include "kvm-s390.h" #include "trace.h" +/* Handle SCK (SET CLOCK) interception */ +static int handle_set_clock(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *cpup; + s64 hostclk, val; + u64 op2; + int i; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + op2 = kvm_s390_get_base_disp_s(vcpu); + if (op2 & 7) /* Operand must be on a doubleword boundary */ + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + if (get_guest(vcpu, val, (u64 __user *) op2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (store_tod_clock(&hostclk)) { + kvm_s390_set_psw_cc(vcpu, 3); + return 0; + } + val = (val - hostclk) & ~0x3fUL; + + mutex_lock(&vcpu->kvm->lock); + kvm_for_each_vcpu(i, cpup, vcpu->kvm) + cpup->arch.sie_block->epoch = val; + mutex_unlock(&vcpu->kvm->lock); + + kvm_s390_set_psw_cc(vcpu, 0); + return 0; +} + static int handle_set_prefix(struct kvm_vcpu *vcpu) { u64 operand2; @@ -128,6 +160,33 @@ static int handle_skey(struct kvm_vcpu *vcpu) return 0; } +static int handle_test_block(struct kvm_vcpu *vcpu) +{ + unsigned long hva; + gpa_t addr; + int reg2; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + kvm_s390_get_regs_rre(vcpu, NULL, ®2); + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_real_to_abs(vcpu, addr); + + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(hva)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + /* + * We don't expect errors on modern systems, and do not care + * about storage keys (yet), so let's just clear the page. + */ + if (clear_user((void __user *)hva, PAGE_SIZE) != 0) + return -EFAULT; + kvm_s390_set_psw_cc(vcpu, 0); + vcpu->run->s.regs.gprs[0] = 0; + return 0; +} + static int handle_tpi(struct kvm_vcpu *vcpu) { struct kvm_s390_interrupt_info *inti; @@ -438,12 +497,14 @@ out_exception: static const intercept_handler_t b2_handlers[256] = { [0x02] = handle_stidp, + [0x04] = handle_set_clock, [0x10] = handle_set_prefix, [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, [0x29] = handle_skey, [0x2a] = handle_skey, [0x2b] = handle_skey, + [0x2c] = handle_test_block, [0x30] = handle_io_inst, [0x31] = handle_io_inst, [0x32] = handle_io_inst, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0a2e5e08674..e794c88f699 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -772,7 +772,11 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, __free_page(page); return NULL; } - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + kfree(mp); + __free_page(page); + return NULL; + } mp->vmaddr = vmaddr & PMD_MASK; INIT_LIST_HEAD(&mp->mapper); page->index = (unsigned long) mp; @@ -902,7 +906,10 @@ unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr) page = alloc_page(GFP_KERNEL|__GFP_REPEAT); if (!page) return NULL; - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } atomic_set(&page->_mapcount, 1); table = (unsigned long *) page_to_phys(page); clear_table(table, _PAGE_INVALID, PAGE_SIZE); @@ -1244,11 +1251,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -1260,12 +1267,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = (pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } ptep = (pte_t *) pgtable; diff --git a/arch/score/include/asm/pgalloc.h b/arch/score/include/asm/pgalloc.h index 716b3fd1d86..2e067657db9 100644 --- a/arch/score/include/asm/pgalloc.h +++ b/arch/score/include/asm/pgalloc.h @@ -54,9 +54,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, struct page *pte; pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER); - if (pte) { - clear_highpage(pte); - pgtable_page_ctor(pte); + if (!pte) + return NULL; + clear_highpage(pte); + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; } return pte; } diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 224f4bc9925..e78561bc30e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -711,7 +711,6 @@ config CC_STACKPROTECTOR config SMP bool "Symmetric multi-processing support" depends on SYS_SUPPORTS_SMP - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 8c00785c60d..a33673b3687 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -47,7 +47,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, if (!pg) return NULL; page = virt_to_page(pg); - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + quicklist_free(QUICK_PT, NULL, pg); + return NULL; + } return page; } diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 78c4fdb91bc..05fcfc634fc 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -28,7 +28,6 @@ config SPARC select HAVE_ARCH_JUMP_LABEL select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION - select USE_GENERIC_SMP_HELPERS if SMP select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 select HAVE_BPF_JIT @@ -64,6 +63,7 @@ config SPARC64 select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS + select HAVE_CONTEXT_TRACKING select HAVE_DEBUG_KMEMLEAK select RTC_DRV_CMOS select RTC_DRV_BQ4802 diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 76092c4dd27..f668797ae23 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -93,7 +93,6 @@ typedef struct { spinlock_t lock; unsigned long sparc64_ctx_val; unsigned long huge_pte_count; - struct page *pgtable_page; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; } mm_context_t; diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index e15538899f3..aac53fcea80 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -15,7 +15,10 @@ #define DCACHE_ALIASING_POSSIBLE #endif -#define HPAGE_SHIFT 22 +#define HPAGE_SHIFT 23 +#define REAL_HPAGE_SHIFT 22 + +#define REAL_HPAGE_SIZE (_AC(1,UL) << REAL_HPAGE_SHIFT) #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) @@ -53,8 +56,8 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct pag /* These are used to make use of C type-checking.. */ typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long iopte; } iopte_t; -typedef struct { unsigned int pmd; } pmd_t; -typedef struct { unsigned int pgd; } pgd_t; +typedef struct { unsigned long pmd; } pmd_t; +typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define pte_val(x) ((x).pte) @@ -73,8 +76,8 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* .. while these make it easier on the compiler */ typedef unsigned long pte_t; typedef unsigned long iopte_t; -typedef unsigned int pmd_t; -typedef unsigned int pgd_t; +typedef unsigned long pmd_t; +typedef unsigned long pgd_t; typedef unsigned long pgprot_t; #define pte_val(x) (x) @@ -93,18 +96,44 @@ typedef unsigned long pgprot_t; typedef pte_t *pgtable_t; +/* These two values define the virtual address space range in which we + * must forbid 64-bit user processes from making mappings. It used to + * represent precisely the virtual address space hole present in most + * early sparc64 chips including UltraSPARC-I. But now it also is + * further constrained by the limits of our page tables, which is + * 43-bits of virtual address. + */ +#define SPARC64_VA_HOLE_TOP _AC(0xfffffc0000000000,UL) +#define SPARC64_VA_HOLE_BOTTOM _AC(0x0000040000000000,UL) + +/* The next two defines specify the actual exclusion region we + * enforce, wherein we use a 4GB red zone on each side of the VA hole. + */ +#define VA_EXCLUDE_START (SPARC64_VA_HOLE_BOTTOM - (1UL << 32UL)) +#define VA_EXCLUDE_END (SPARC64_VA_HOLE_TOP + (1UL << 32UL)) + #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \ - (_AC(0x0000000070000000,UL)) : \ - (_AC(0xfffff80000000000,UL) + (1UL << 32UL))) + _AC(0x0000000070000000,UL) : \ + VA_EXCLUDE_END) #include <asm-generic/memory_model.h> +#define PAGE_OFFSET_BY_BITS(X) (-(_AC(1,UL) << (X))) +extern unsigned long PAGE_OFFSET; + #endif /* !(__ASSEMBLY__) */ -/* We used to stick this into a hard-coded global register (%g4) - * but that does not make sense anymore. +/* The maximum number of physical memory address bits we support, this + * is used to size various tables used to manage kernel TLB misses and + * also the sparsemem code. + */ +#define MAX_PHYS_ADDRESS_BITS 47 + +/* These two shift counts are used when indexing sparc64_valid_addr_bitmap + * and kpte_linear_bitmap. */ -#define PAGE_OFFSET _AC(0xFFFFF80000000000,UL) +#define ILOG2_4MB 22 +#define ILOG2_256MB 28 #ifndef __ASSEMBLY__ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 36760317814..8358dc14495 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -48,18 +48,18 @@ /* PMD_SHIFT determines the size of the area a second-level page * table can map */ -#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4)) +#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) #define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PMD_BITS (PAGE_SHIFT - 2) +#define PMD_BITS (PAGE_SHIFT - 3) /* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4) + PMD_BITS) +#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS) #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define PGDIR_BITS (PAGE_SHIFT - 2) +#define PGDIR_BITS (PAGE_SHIFT - 3) -#if (PGDIR_SHIFT + PGDIR_BITS) != 44 +#if (PGDIR_SHIFT + PGDIR_BITS) != 43 #error Page table parameters do not cover virtual address space properly. #endif @@ -67,35 +67,12 @@ #error PMD_SHIFT must equal HPAGE_SHIFT for transparent huge pages. #endif -/* PMDs point to PTE tables which are 4K aligned. */ -#define PMD_PADDR _AC(0xfffffffe,UL) -#define PMD_PADDR_SHIFT _AC(11,UL) - -#define PMD_ISHUGE _AC(0x00000001,UL) - -/* This is the PMD layout when PMD_ISHUGE is set. With 4MB huge - * pages, this frees up a bunch of bits in the layout that we can - * use for the protection settings and software metadata. - */ -#define PMD_HUGE_PADDR _AC(0xfffff800,UL) -#define PMD_HUGE_PROTBITS _AC(0x000007ff,UL) -#define PMD_HUGE_PRESENT _AC(0x00000400,UL) -#define PMD_HUGE_WRITE _AC(0x00000200,UL) -#define PMD_HUGE_DIRTY _AC(0x00000100,UL) -#define PMD_HUGE_ACCESSED _AC(0x00000080,UL) -#define PMD_HUGE_EXEC _AC(0x00000040,UL) -#define PMD_HUGE_SPLITTING _AC(0x00000020,UL) - -/* PGDs point to PMD tables which are 8K aligned. */ -#define PGD_PADDR _AC(0xfffffffc,UL) -#define PGD_PADDR_SHIFT _AC(11,UL) - #ifndef __ASSEMBLY__ #include <linux/sched.h> /* Entries per page directory level. */ -#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-4)) +#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3)) #define PTRS_PER_PMD (1UL << PMD_BITS) #define PTRS_PER_PGD (1UL << PGDIR_BITS) @@ -112,6 +89,7 @@ #define _PAGE_VALID _AC(0x8000000000000000,UL) /* Valid TTE */ #define _PAGE_R _AC(0x8000000000000000,UL) /* Keep ref bit uptodate*/ #define _PAGE_SPECIAL _AC(0x0200000000000000,UL) /* Special page */ +#define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */ /* Advertise support for _PAGE_SPECIAL */ #define __HAVE_ARCH_PTE_SPECIAL @@ -125,6 +103,7 @@ #define _PAGE_IE_4U _AC(0x0800000000000000,UL) /* Invert Endianness */ #define _PAGE_SOFT2_4U _AC(0x07FC000000000000,UL) /* Software bits, set 2 */ #define _PAGE_SPECIAL_4U _AC(0x0200000000000000,UL) /* Special page */ +#define _PAGE_PMD_HUGE_4U _AC(0x0100000000000000,UL) /* Huge page */ #define _PAGE_RES1_4U _AC(0x0002000000000000,UL) /* Reserved */ #define _PAGE_SZ32MB_4U _AC(0x0001000000000000,UL) /* (Panther) 32MB page */ #define _PAGE_SZ256MB_4U _AC(0x2001000000000000,UL) /* (Panther) 256MB page */ @@ -155,6 +134,7 @@ #define _PAGE_READ_4V _AC(0x0800000000000000,UL) /* Readable SW Bit */ #define _PAGE_WRITE_4V _AC(0x0400000000000000,UL) /* Writable SW Bit */ #define _PAGE_SPECIAL_4V _AC(0x0200000000000000,UL) /* Special page */ +#define _PAGE_PMD_HUGE_4V _AC(0x0100000000000000,UL) /* Huge page */ #define _PAGE_PADDR_4V _AC(0x00FFFFFFFFFFE000,UL) /* paddr[55:13] */ #define _PAGE_IE_4V _AC(0x0000000000001000,UL) /* Invert Endianness */ #define _PAGE_E_4V _AC(0x0000000000000800,UL) /* side-Effect */ @@ -180,6 +160,10 @@ #define _PAGE_SZBITS_4U _PAGE_SZ8K_4U #define _PAGE_SZBITS_4V _PAGE_SZ8K_4V +#if REAL_HPAGE_SHIFT != 22 +#error REAL_HPAGE_SHIFT and _PAGE_SZHUGE_foo must match up +#endif + #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V @@ -239,16 +223,13 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot); -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) - -extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); - -static inline pmd_t pmd_mkhuge(pmd_t pmd) +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - /* Do nothing, mk_pmd() does this part. */ - return pmd; + pte_t pte = pfn_pte(page_nr, pgprot); + + return __pmd(pte_val(pte)); } +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) #endif /* This one can be done with two shifts. */ @@ -309,14 +290,25 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot) : "=r" (mask), "=r" (tmp) : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U | _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U | - _PAGE_SPECIAL), + _PAGE_SPECIAL | _PAGE_PMD_HUGE | _PAGE_SZALL_4U), "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V | _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V | - _PAGE_SPECIAL)); + _PAGE_SPECIAL | _PAGE_PMD_HUGE | _PAGE_SZALL_4V)); return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask)); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_modify(pte, newprot); + + return __pmd(pte_val(pte)); +} +#endif + static inline pte_t pgoff_to_pte(unsigned long off) { off <<= PAGE_SHIFT; @@ -357,7 +349,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot) */ #define pgprot_noncached pgprot_noncached -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline pte_t pte_mkhuge(pte_t pte) { unsigned long mask; @@ -375,6 +367,17 @@ static inline pte_t pte_mkhuge(pte_t pte) return __pte(pte_val(pte) | mask); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkhuge(pte); + pte_val(pte) |= _PAGE_PMD_HUGE; + + return __pmd(pte_val(pte)); +} +#endif #endif static inline pte_t pte_mkdirty(pte_t pte) @@ -626,91 +629,130 @@ static inline unsigned long pte_special(pte_t pte) return pte_val(pte) & _PAGE_SPECIAL; } -static inline int pmd_large(pmd_t pmd) +static inline unsigned long pmd_large(pmd_t pmd) { - return (pmd_val(pmd) & (PMD_ISHUGE | PMD_HUGE_PRESENT)) == - (PMD_ISHUGE | PMD_HUGE_PRESENT); + pte_t pte = __pte(pmd_val(pmd)); + + return (pte_val(pte) & _PAGE_PMD_HUGE) && pte_present(pte); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_young(pmd_t pmd) +static inline unsigned long pmd_young(pmd_t pmd) { - return pmd_val(pmd) & PMD_HUGE_ACCESSED; + pte_t pte = __pte(pmd_val(pmd)); + + return pte_young(pte); } -static inline int pmd_write(pmd_t pmd) +static inline unsigned long pmd_write(pmd_t pmd) { - return pmd_val(pmd) & PMD_HUGE_WRITE; + pte_t pte = __pte(pmd_val(pmd)); + + return pte_write(pte); } static inline unsigned long pmd_pfn(pmd_t pmd) { - unsigned long val = pmd_val(pmd) & PMD_HUGE_PADDR; + pte_t pte = __pte(pmd_val(pmd)); - return val >> (PAGE_SHIFT - PMD_PADDR_SHIFT); + return pte_pfn(pte); } -static inline int pmd_trans_splitting(pmd_t pmd) +static inline unsigned long pmd_trans_huge(pmd_t pmd) { - return (pmd_val(pmd) & (PMD_ISHUGE|PMD_HUGE_SPLITTING)) == - (PMD_ISHUGE|PMD_HUGE_SPLITTING); + pte_t pte = __pte(pmd_val(pmd)); + + return pte_val(pte) & _PAGE_PMD_HUGE; } -static inline int pmd_trans_huge(pmd_t pmd) +static inline unsigned long pmd_trans_splitting(pmd_t pmd) { - return pmd_val(pmd) & PMD_ISHUGE; + pte_t pte = __pte(pmd_val(pmd)); + + return pmd_trans_huge(pmd) && pte_special(pte); } #define has_transparent_hugepage() 1 static inline pmd_t pmd_mkold(pmd_t pmd) { - pmd_val(pmd) &= ~PMD_HUGE_ACCESSED; - return pmd; + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkold(pte); + + return __pmd(pte_val(pte)); } static inline pmd_t pmd_wrprotect(pmd_t pmd) { - pmd_val(pmd) &= ~PMD_HUGE_WRITE; - return pmd; + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_wrprotect(pte); + + return __pmd(pte_val(pte)); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { - pmd_val(pmd) |= PMD_HUGE_DIRTY; - return pmd; + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkdirty(pte); + + return __pmd(pte_val(pte)); } static inline pmd_t pmd_mkyoung(pmd_t pmd) { - pmd_val(pmd) |= PMD_HUGE_ACCESSED; - return pmd; + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkyoung(pte); + + return __pmd(pte_val(pte)); } static inline pmd_t pmd_mkwrite(pmd_t pmd) { - pmd_val(pmd) |= PMD_HUGE_WRITE; - return pmd; + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkwrite(pte); + + return __pmd(pte_val(pte)); } static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - pmd_val(pmd) &= ~PMD_HUGE_PRESENT; + unsigned long mask; + + if (tlb_type == hypervisor) + mask = _PAGE_PRESENT_4V; + else + mask = _PAGE_PRESENT_4U; + + pmd_val(pmd) &= ~mask; + return pmd; } static inline pmd_t pmd_mksplitting(pmd_t pmd) { - pmd_val(pmd) |= PMD_HUGE_SPLITTING; - return pmd; + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkspecial(pte); + + return __pmd(pte_val(pte)); } -extern pgprot_t pmd_pgprot(pmd_t entry); +static inline pgprot_t pmd_pgprot(pmd_t entry) +{ + unsigned long val = pmd_val(entry); + + return __pgprot(val); +} #endif static inline int pmd_present(pmd_t pmd) { - return pmd_val(pmd) != 0U; + return pmd_val(pmd) != 0UL; } #define pmd_none(pmd) (!pmd_val(pmd)) @@ -728,33 +770,32 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void pmd_set(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { - unsigned long val = __pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT; + unsigned long val = __pa((unsigned long) (ptep)); pmd_val(*pmdp) = val; } #define pud_set(pudp, pmdp) \ - (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> PGD_PADDR_SHIFT)) + (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)))) static inline unsigned long __pmd_page(pmd_t pmd) { - unsigned long paddr = (unsigned long) pmd_val(pmd); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_val(pmd) & PMD_ISHUGE) - paddr &= PMD_HUGE_PADDR; -#endif - paddr <<= PMD_PADDR_SHIFT; - return ((unsigned long) __va(paddr)); + pte_t pte = __pte(pmd_val(pmd)); + unsigned long pfn; + + pfn = pte_pfn(pte); + + return ((unsigned long) __va(pfn << PAGE_SHIFT)); } #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) #define pud_page_vaddr(pud) \ - ((unsigned long) __va((((unsigned long)pud_val(pud))<<PGD_PADDR_SHIFT))) + ((unsigned long) __va(pud_val(pud))) #define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud)) #define pmd_bad(pmd) (0) -#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0U) +#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0UL) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (0) #define pud_present(pud) (pud_val(pud) != 0U) -#define pud_clear(pudp) (pud_val(*(pudp)) = 0U) +#define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) /* Same in both SUN4V and SUN4U. */ #define pte_none(pte) (!pte_val(pte)) @@ -789,7 +830,7 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, pmd_t *pmdp) { pmd_t pmd = *pmdp; - set_pmd_at(mm, addr, pmdp, __pmd(0U)); + set_pmd_at(mm, addr, pmdp, __pmd(0UL)); return pmd; } @@ -837,8 +878,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, }) #endif -extern pgd_t swapper_pg_dir[2048]; -extern pmd_t swapper_low_pmd_dir[2048]; +extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; +extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD]; extern void paging_init(void); extern unsigned long find_ecache_flush_span(unsigned long size); diff --git a/arch/sparc/include/asm/sparsemem.h b/arch/sparc/include/asm/sparsemem.h index b99d4e4b6d2..e5e1752d5d7 100644 --- a/arch/sparc/include/asm/sparsemem.h +++ b/arch/sparc/include/asm/sparsemem.h @@ -3,9 +3,11 @@ #ifdef __KERNEL__ +#include <asm/page.h> + #define SECTION_SIZE_BITS 30 -#define MAX_PHYSADDR_BITS 42 -#define MAX_PHYSMEM_BITS 42 +#define MAX_PHYSADDR_BITS MAX_PHYS_ADDRESS_BITS +#define MAX_PHYSMEM_BITS MAX_PHYS_ADDRESS_BITS #endif /* !(__KERNEL__) */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index d5e50425107..5d9292ab107 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -192,7 +192,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */ /* flag bit 6 is available */ #define TIF_32BIT 7 /* 32-bit binary */ -/* flag bit 8 is available */ +#define TIF_NOHZ 8 /* in adaptive nohz mode */ #define TIF_SECCOMP 9 /* secure computing */ #define TIF_SYSCALL_AUDIT 10 /* syscall auditing active */ #define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */ @@ -210,6 +210,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_UNALIGNED (1<<TIF_UNALIGNED) #define _TIF_32BIT (1<<TIF_32BIT) +#define _TIF_NOHZ (1<<TIF_NOHZ) #define _TIF_SECCOMP (1<<TIF_SECCOMP) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h index e696432b950..2230f80d9fe 100644 --- a/arch/sparc/include/asm/tsb.h +++ b/arch/sparc/include/asm/tsb.h @@ -142,98 +142,39 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; or REG1, %lo(swapper_pg_dir), REG1; \ sllx VADDR, 64 - (PGDIR_SHIFT + PGDIR_BITS), REG2; \ srlx REG2, 64 - PAGE_SHIFT, REG2; \ - andn REG2, 0x3, REG2; \ - lduw [REG1 + REG2], REG1; \ + andn REG2, 0x7, REG2; \ + ldx [REG1 + REG2], REG1; \ brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ srlx REG2, 64 - PAGE_SHIFT, REG2; \ - sllx REG1, PGD_PADDR_SHIFT, REG1; \ - andn REG2, 0x3, REG2; \ - lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ + andn REG2, 0x7, REG2; \ + ldxa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - PMD_SHIFT, REG2; \ - srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ - sllx REG1, PMD_PADDR_SHIFT, REG1; \ + srlx REG2, 64 - PAGE_SHIFT, REG2; \ andn REG2, 0x7, REG2; \ add REG1, REG2, REG1; - /* These macros exists only to make the PMD translator below - * easier to read. It hides the ELF section switch for the - * sun4v code patching. - */ -#define OR_PTE_BIT_1INSN(REG, NAME) \ -661: or REG, _PAGE_##NAME##_4U, REG; \ - .section .sun4v_1insn_patch, "ax"; \ - .word 661b; \ - or REG, _PAGE_##NAME##_4V, REG; \ - .previous; - -#define OR_PTE_BIT_2INSN(REG, TMP, NAME) \ -661: sethi %hi(_PAGE_##NAME##_4U), TMP; \ - or REG, TMP, REG; \ - .section .sun4v_2insn_patch, "ax"; \ - .word 661b; \ - mov -1, TMP; \ - or REG, _PAGE_##NAME##_4V, REG; \ - .previous; - - /* Load into REG the PTE value for VALID, CACHE, and SZHUGE. */ -#define BUILD_PTE_VALID_SZHUGE_CACHE(REG) \ -661: sethi %uhi(_PAGE_VALID|_PAGE_SZHUGE_4U), REG; \ - .section .sun4v_1insn_patch, "ax"; \ - .word 661b; \ - sethi %uhi(_PAGE_VALID), REG; \ - .previous; \ - sllx REG, 32, REG; \ -661: or REG, _PAGE_CP_4U|_PAGE_CV_4U, REG; \ - .section .sun4v_1insn_patch, "ax"; \ - .word 661b; \ - or REG, _PAGE_CP_4V|_PAGE_CV_4V|_PAGE_SZHUGE_4V, REG; \ - .previous; - /* PMD has been loaded into REG1, interpret the value, seeing * if it is a HUGE PMD or a normal one. If it is not valid * then jump to FAIL_LABEL. If it is a HUGE PMD, and it * translates to a valid PTE, branch to PTE_LABEL. * - * We translate the PMD by hand, one bit at a time, - * constructing the huge PTE. - * - * So we construct the PTE in REG2 as follows: - * - * 1) Extract the PMD PFN from REG1 and place it into REG2. - * - * 2) Translate PMD protection bits in REG1 into REG2, one bit - * at a time using andcc tests on REG1 and OR's into REG2. - * - * Only two bits to be concerned with here, EXEC and WRITE. - * Now REG1 is freed up and we can use it as a temporary. - * - * 3) Construct the VALID, CACHE, and page size PTE bits in - * REG1, OR with REG2 to form final PTE. + * We have to propagate the 4MB bit of the virtual address + * because we are fabricating 8MB pages using 4MB hw pages. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ - brz,pn REG1, FAIL_LABEL; \ - andcc REG1, PMD_ISHUGE, %g0; \ - be,pt %xcc, 700f; \ - and REG1, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED, REG2; \ - cmp REG2, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED; \ - bne,pn %xcc, FAIL_LABEL; \ - andn REG1, PMD_HUGE_PROTBITS, REG2; \ - sllx REG2, PMD_PADDR_SHIFT, REG2; \ - /* REG2 now holds PFN << PAGE_SHIFT */ \ - andcc REG1, PMD_HUGE_WRITE, %g0; \ - bne,a,pt %xcc, 1f; \ - OR_PTE_BIT_1INSN(REG2, W); \ -1: andcc REG1, PMD_HUGE_EXEC, %g0; \ - be,pt %xcc, 1f; \ - nop; \ - OR_PTE_BIT_2INSN(REG2, REG1, EXEC); \ - /* REG1 can now be clobbered, build final PTE */ \ -1: BUILD_PTE_VALID_SZHUGE_CACHE(REG1); \ - ba,pt %xcc, PTE_LABEL; \ - or REG1, REG2, REG1; \ + brz,pn REG1, FAIL_LABEL; \ + sethi %uhi(_PAGE_PMD_HUGE), REG2; \ + sllx REG2, 32, REG2; \ + andcc REG1, REG2, %g0; \ + be,pt %xcc, 700f; \ + sethi %hi(4 * 1024 * 1024), REG2; \ + andn REG1, REG2, REG1; \ + and VADDR, REG2, REG2; \ + brlz,pt REG1, PTE_LABEL; \ + or REG1, REG2, REG1; \ 700: #else #define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ @@ -253,18 +194,16 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; #define USER_PGTABLE_WALK_TL1(VADDR, PHYS_PGD, REG1, REG2, FAIL_LABEL) \ sllx VADDR, 64 - (PGDIR_SHIFT + PGDIR_BITS), REG2; \ srlx REG2, 64 - PAGE_SHIFT, REG2; \ - andn REG2, 0x3, REG2; \ - lduwa [PHYS_PGD + REG2] ASI_PHYS_USE_EC, REG1; \ + andn REG2, 0x7, REG2; \ + ldxa [PHYS_PGD + REG2] ASI_PHYS_USE_EC, REG1; \ brz,pn REG1, FAIL_LABEL; \ sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ srlx REG2, 64 - PAGE_SHIFT, REG2; \ - sllx REG1, PGD_PADDR_SHIFT, REG1; \ - andn REG2, 0x3, REG2; \ - lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ + andn REG2, 0x7, REG2; \ + ldxa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \ sllx VADDR, 64 - PMD_SHIFT, REG2; \ - srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ - sllx REG1, PMD_PADDR_SHIFT, REG1; \ + srlx REG2, 64 - PAGE_SHIFT, REG2; \ andn REG2, 0x7, REG2; \ add REG1, REG2, REG1; \ ldxa [REG1] ASI_PHYS_USE_EC, REG1; \ diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h index 9c179fbfb21..140966fbd30 100644 --- a/arch/sparc/kernel/entry.h +++ b/arch/sparc/kernel/entry.h @@ -88,7 +88,6 @@ extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); extern void bad_trap_tl1(struct pt_regs *regs, long lvl); -extern void do_fpe_common(struct pt_regs *regs); extern void do_fpieee(struct pt_regs *regs); extern void do_fpother(struct pt_regs *regs); extern void do_tof(struct pt_regs *regs); diff --git a/arch/sparc/kernel/kgdb_64.c b/arch/sparc/kernel/kgdb_64.c index 53c0a82e603..60b19f50c80 100644 --- a/arch/sparc/kernel/kgdb_64.c +++ b/arch/sparc/kernel/kgdb_64.c @@ -159,11 +159,12 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, asmlinkage void kgdb_trap(unsigned long trap_level, struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); unsigned long flags; if (user_mode(regs)) { bad_trap(regs, trap_level); - return; + goto out; } flushw_all(); @@ -171,6 +172,8 @@ asmlinkage void kgdb_trap(unsigned long trap_level, struct pt_regs *regs) local_irq_save(flags); kgdb_handle_exception(0x172, SIGTRAP, 0, regs); local_irq_restore(flags); +out: + exception_exit(prev_state); } int kgdb_arch_init(void) diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index e72212148d2..5a09fd315e5 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kdebug.h> #include <linux/slab.h> +#include <linux/context_tracking.h> #include <asm/signal.h> #include <asm/cacheflush.h> #include <asm/uaccess.h> @@ -418,12 +419,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, asmlinkage void __kprobes kprobe_trap(unsigned long trap_level, struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + BUG_ON(trap_level != 0x170 && trap_level != 0x171); if (user_mode(regs)) { local_irq_enable(); bad_trap(regs, trap_level); - return; + goto out; } /* trap_level == 0x170 --> ta 0x70 @@ -433,6 +436,8 @@ asmlinkage void __kprobes kprobe_trap(unsigned long trap_level, (trap_level == 0x170) ? "debug" : "debug_2", regs, 0, trap_level, SIGTRAP) != NOTIFY_STOP) bad_trap(regs, trap_level); +out: + exception_exit(prev_state); } /* Jprobes support. */ diff --git a/arch/sparc/kernel/ktlb.S b/arch/sparc/kernel/ktlb.S index fde5a419cf2..542e96ac4d3 100644 --- a/arch/sparc/kernel/ktlb.S +++ b/arch/sparc/kernel/ktlb.S @@ -153,12 +153,19 @@ kvmap_dtlb_tsb4m_miss: /* Clear the PAGE_OFFSET top virtual bits, shift * down to get PFN, and make sure PFN is in range. */ - sllx %g4, 21, %g5 +661: sllx %g4, 0, %g5 + .section .page_offset_shift_patch, "ax" + .word 661b + .previous /* Check to see if we know about valid memory at the 4MB * chunk this physical address will reside within. */ - srlx %g5, 21 + 41, %g2 +661: srlx %g5, MAX_PHYS_ADDRESS_BITS, %g2 + .section .page_offset_shift_patch, "ax" + .word 661b + .previous + brnz,pn %g2, kvmap_dtlb_longpath nop @@ -176,7 +183,11 @@ valid_addr_bitmap_patch: or %g7, %lo(sparc64_valid_addr_bitmap), %g7 .previous - srlx %g5, 21 + 22, %g2 +661: srlx %g5, ILOG2_4MB, %g2 + .section .page_offset_shift_patch, "ax" + .word 661b + .previous + srlx %g2, 6, %g5 and %g2, 63, %g2 sllx %g5, 3, %g5 @@ -189,9 +200,18 @@ valid_addr_bitmap_patch: 2: sethi %hi(kpte_linear_bitmap), %g2 /* Get the 256MB physical address index. */ - sllx %g4, 21, %g5 +661: sllx %g4, 0, %g5 + .section .page_offset_shift_patch, "ax" + .word 661b + .previous + or %g2, %lo(kpte_linear_bitmap), %g2 - srlx %g5, 21 + 28, %g5 + +661: srlx %g5, ILOG2_256MB, %g5 + .section .page_offset_shift_patch, "ax" + .word 661b + .previous + and %g5, (32 - 1), %g7 /* Divide by 32 to get the offset into the bitmask. */ diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index bc4d3f5d2e5..cb021453de2 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -398,8 +398,8 @@ static void apb_fake_ranges(struct pci_dev *dev, apb_calc_first_last(map, &first, &last); res = bus->resource[1]; res->flags = IORESOURCE_MEM; - region.start = (first << 21); - region.end = (last << 21) + ((1 << 21) - 1); + region.start = (first << 29); + region.end = (last << 29) + ((1 << 29) - 1); pcibios_bus_to_resource(dev, res, ®ion); } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index baebab21549..32a280ec38c 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -31,6 +31,7 @@ #include <linux/elfcore.h> #include <linux/sysrq.h> #include <linux/nmi.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -557,6 +558,7 @@ void fault_in_user_windows(void) barf: set_thread_wsaved(window + 1); + user_exit(); do_exit(SIGILL); } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 773c1f2983c..c13c9f25d83 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -27,6 +27,7 @@ #include <trace/syscall.h> #include <linux/compat.h> #include <linux/elf.h> +#include <linux/context_tracking.h> #include <asm/asi.h> #include <asm/pgtable.h> @@ -1066,6 +1067,9 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) /* do the secure computing check first */ secure_computing_strict(regs->u_regs[UREG_G1]); + if (test_thread_flag(TIF_NOHZ)) + user_exit(); + if (test_thread_flag(TIF_SYSCALL_TRACE)) ret = tracehook_report_syscall_entry(regs); @@ -1086,6 +1090,9 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { + if (test_thread_flag(TIF_NOHZ)) + user_exit(); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) @@ -1093,4 +1100,7 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs) if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); + + if (test_thread_flag(TIF_NOHZ)) + user_enter(); } diff --git a/arch/sparc/kernel/rtrap_64.S b/arch/sparc/kernel/rtrap_64.S index afa2a9e3d0a..a954eb81881 100644 --- a/arch/sparc/kernel/rtrap_64.S +++ b/arch/sparc/kernel/rtrap_64.S @@ -18,10 +18,16 @@ #define RTRAP_PSTATE_IRQOFF (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV) #define RTRAP_PSTATE_AG_IRQOFF (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG) +#ifdef CONFIG_CONTEXT_TRACKING +# define SCHEDULE_USER schedule_user +#else +# define SCHEDULE_USER schedule +#endif + .text .align 32 __handle_preemption: - call schedule + call SCHEDULE_USER wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 35923e8abd8..cd91d010e6d 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -23,6 +23,7 @@ #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/bitops.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/ptrace.h> @@ -43,6 +44,7 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs) { struct ucontext __user *ucp = (struct ucontext __user *) regs->u_regs[UREG_I0]; + enum ctx_state prev_state = exception_enter(); mc_gregset_t __user *grp; unsigned long pc, npc, tstate; unsigned long fp, i7; @@ -129,16 +131,19 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs) } if (err) goto do_sigsegv; - +out: + exception_exit(prev_state); return; do_sigsegv: force_sig(SIGSEGV, current); + goto out; } asmlinkage void sparc64_get_context(struct pt_regs *regs) { struct ucontext __user *ucp = (struct ucontext __user *) regs->u_regs[UREG_I0]; + enum ctx_state prev_state = exception_enter(); mc_gregset_t __user *grp; mcontext_t __user *mcp; unsigned long fp, i7; @@ -220,10 +225,12 @@ asmlinkage void sparc64_get_context(struct pt_regs *regs) } if (err) goto do_sigsegv; - +out: + exception_exit(prev_state); return; do_sigsegv: force_sig(SIGSEGV, current); + goto out; } struct rt_signal_frame { @@ -528,11 +535,13 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags) { + user_exit(); if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, orig_i0); if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } + user_enter(); } diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e142545244f..b66a5338231 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1399,8 +1399,13 @@ void __init smp_cpus_done(unsigned int max_cpus) void smp_send_reschedule(int cpu) { - xcall_deliver((u64) &xcall_receive_signal, 0, 0, - cpumask_of(cpu)); + if (cpu == smp_processor_id()) { + WARN_ON_ONCE(preemptible()); + set_softint(1 << PIL_SMP_RECEIVE_SIGNAL); + } else { + xcall_deliver((u64) &xcall_receive_signal, + 0, 0, cpumask_of(cpu)); + } } void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) diff --git a/arch/sparc/kernel/sun4v_tlb_miss.S b/arch/sparc/kernel/sun4v_tlb_miss.S index bde867fd71e..e0c09bf8561 100644 --- a/arch/sparc/kernel/sun4v_tlb_miss.S +++ b/arch/sparc/kernel/sun4v_tlb_miss.S @@ -182,7 +182,7 @@ sun4v_tsb_miss_common: cmp %g5, -1 be,pt %xcc, 80f nop - COMPUTE_TSB_PTR(%g5, %g4, HPAGE_SHIFT, %g2, %g7) + COMPUTE_TSB_PTR(%g5, %g4, REAL_HPAGE_SHIFT, %g2, %g7) /* That clobbered %g2, reload it. */ ldxa [%g0] ASI_SCRATCHPAD, %g2 diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 51561b8b15b..beb0b5a5f21 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -24,6 +24,7 @@ #include <linux/personality.h> #include <linux/random.h> #include <linux/export.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/utrap.h> @@ -39,9 +40,6 @@ asmlinkage unsigned long sys_getpagesize(void) return PAGE_SIZE; } -#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL)) -#define VA_EXCLUDE_END (0xfffff80000000000UL + (1UL << 32UL)) - /* Does addr --> addr+len fall within 4GB of the VA-space hole or * overflow past the end of the 64-bit address space? */ @@ -499,6 +497,7 @@ asmlinkage unsigned long c_sys_nis_syscall(struct pt_regs *regs) asmlinkage void sparc_breakpoint(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); siginfo_t info; if (test_thread_flag(TIF_32BIT)) { @@ -517,6 +516,7 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs) #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc); #endif + exception_exit(prev_state); } extern void check_pending(int signum); diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S index d950197a17e..87729fff13b 100644 --- a/arch/sparc/kernel/syscalls.S +++ b/arch/sparc/kernel/syscalls.S @@ -52,7 +52,7 @@ sys32_rt_sigreturn: #endif .align 32 1: ldx [%g6 + TI_FLAGS], %l5 - andcc %l5, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0 + andcc %l5, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0 be,pt %icc, rtrap nop call syscall_trace_leave @@ -184,7 +184,7 @@ linux_sparc_syscall32: srl %i3, 0, %o3 ! IEU0 srl %i2, 0, %o2 ! IEU0 Group - andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0 + andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0 bne,pn %icc, linux_syscall_trace32 ! CTI mov %i0, %l5 ! IEU1 5: call %l7 ! CTI Group brk forced @@ -207,7 +207,7 @@ linux_sparc_syscall: mov %i3, %o3 ! IEU1 mov %i4, %o4 ! IEU0 Group - andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0 + andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0 bne,pn %icc, linux_syscall_trace ! CTI Group mov %i0, %l5 ! IEU0 2: call %l7 ! CTI Group brk forced @@ -223,7 +223,7 @@ ret_sys_call: cmp %o0, -ERESTART_RESTARTBLOCK bgeu,pn %xcc, 1f - andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0 + andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0 ldx [%sp + PTREGS_OFF + PT_V9_TNPC], %l1 ! pc = npc 2: diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index b3f833ab90e..4ced92f0535 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -20,6 +20,7 @@ #include <linux/ftrace.h> #include <linux/reboot.h> #include <linux/gfp.h> +#include <linux/context_tracking.h> #include <asm/smp.h> #include <asm/delay.h> @@ -186,11 +187,12 @@ EXPORT_SYMBOL_GPL(unregister_dimm_printer); void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { + enum ctx_state prev_state = exception_enter(); siginfo_t info; if (notify_die(DIE_TRAP, "instruction access exception", regs, 0, 0x8, SIGTRAP) == NOTIFY_STOP) - return; + goto out; if (regs->tstate & TSTATE_PRIV) { printk("spitfire_insn_access_exception: SFSR[%016lx] " @@ -207,6 +209,8 @@ void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, un info.si_addr = (void __user *)regs->tpc; info.si_trapno = 0; force_sig_info(SIGSEGV, &info, current); +out: + exception_exit(prev_state); } void spitfire_insn_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) @@ -260,11 +264,12 @@ void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, u void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { + enum ctx_state prev_state = exception_enter(); siginfo_t info; if (notify_die(DIE_TRAP, "data access exception", regs, 0, 0x30, SIGTRAP) == NOTIFY_STOP) - return; + goto out; if (regs->tstate & TSTATE_PRIV) { /* Test if this comes from uaccess places. */ @@ -280,7 +285,7 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un #endif regs->tpc = entry->fixup; regs->tnpc = regs->tpc + 4; - return; + goto out; } /* Shit... */ printk("spitfire_data_access_exception: SFSR[%016lx] " @@ -294,6 +299,8 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un info.si_addr = (void __user *)sfar; info.si_trapno = 0; force_sig_info(SIGSEGV, &info, current); +out: + exception_exit(prev_state); } void spitfire_data_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) @@ -1994,6 +2001,7 @@ static void sun4v_log_error(struct pt_regs *regs, struct sun4v_error_entry *ent, */ void sun4v_resum_error(struct pt_regs *regs, unsigned long offset) { + enum ctx_state prev_state = exception_enter(); struct sun4v_error_entry *ent, local_copy; struct trap_per_cpu *tb; unsigned long paddr; @@ -2022,12 +2030,14 @@ void sun4v_resum_error(struct pt_regs *regs, unsigned long offset) pr_info("Shutdown request, %u seconds...\n", local_copy.err_secs); orderly_poweroff(true); - return; + goto out; } sun4v_log_error(regs, &local_copy, cpu, KERN_ERR "RESUMABLE ERROR", &sun4v_resum_oflow_cnt); +out: + exception_exit(prev_state); } /* If we try to printk() we'll probably make matters worse, by trying @@ -2152,7 +2162,7 @@ void hypervisor_tlbop_error_xcall(unsigned long err, unsigned long op) err, op); } -void do_fpe_common(struct pt_regs *regs) +static void do_fpe_common(struct pt_regs *regs) { if (regs->tstate & TSTATE_PRIV) { regs->tpc = regs->tnpc; @@ -2188,23 +2198,28 @@ void do_fpe_common(struct pt_regs *regs) void do_fpieee(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + if (notify_die(DIE_TRAP, "fpu exception ieee", regs, 0, 0x24, SIGFPE) == NOTIFY_STOP) - return; + goto out; do_fpe_common(regs); +out: + exception_exit(prev_state); } extern int do_mathemu(struct pt_regs *, struct fpustate *, bool); void do_fpother(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); struct fpustate *f = FPUSTATE; int ret = 0; if (notify_die(DIE_TRAP, "fpu exception other", regs, 0, 0x25, SIGFPE) == NOTIFY_STOP) - return; + goto out; switch ((current_thread_info()->xfsr[0] & 0x1c000)) { case (2 << 14): /* unfinished_FPop */ @@ -2213,17 +2228,20 @@ void do_fpother(struct pt_regs *regs) break; } if (ret) - return; + goto out; do_fpe_common(regs); +out: + exception_exit(prev_state); } void do_tof(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); siginfo_t info; if (notify_die(DIE_TRAP, "tagged arithmetic overflow", regs, 0, 0x26, SIGEMT) == NOTIFY_STOP) - return; + goto out; if (regs->tstate & TSTATE_PRIV) die_if_kernel("Penguin overflow trap from kernel mode", regs); @@ -2237,15 +2255,18 @@ void do_tof(struct pt_regs *regs) info.si_addr = (void __user *)regs->tpc; info.si_trapno = 0; force_sig_info(SIGEMT, &info, current); +out: + exception_exit(prev_state); } void do_div0(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); siginfo_t info; if (notify_die(DIE_TRAP, "integer division by zero", regs, 0, 0x28, SIGFPE) == NOTIFY_STOP) - return; + goto out; if (regs->tstate & TSTATE_PRIV) die_if_kernel("TL0: Kernel divide by zero.", regs); @@ -2259,6 +2280,8 @@ void do_div0(struct pt_regs *regs) info.si_addr = (void __user *)regs->tpc; info.si_trapno = 0; force_sig_info(SIGFPE, &info, current); +out: + exception_exit(prev_state); } static void instruction_dump(unsigned int *pc) @@ -2415,6 +2438,7 @@ extern int handle_ldf_stq(u32 insn, struct pt_regs *regs); void do_illegal_instruction(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); unsigned long pc = regs->tpc; unsigned long tstate = regs->tstate; u32 insn; @@ -2422,7 +2446,7 @@ void do_illegal_instruction(struct pt_regs *regs) if (notify_die(DIE_TRAP, "illegal instruction", regs, 0, 0x10, SIGILL) == NOTIFY_STOP) - return; + goto out; if (tstate & TSTATE_PRIV) die_if_kernel("Kernel illegal instruction", regs); @@ -2431,14 +2455,14 @@ void do_illegal_instruction(struct pt_regs *regs) if (get_user(insn, (u32 __user *) pc) != -EFAULT) { if ((insn & 0xc1ffc000) == 0x81700000) /* POPC */ { if (handle_popc(insn, regs)) - return; + goto out; } else if ((insn & 0xc1580000) == 0xc1100000) /* LDQ/STQ */ { if (handle_ldf_stq(insn, regs)) - return; + goto out; } else if (tlb_type == hypervisor) { if ((insn & VIS_OPCODE_MASK) == VIS_OPCODE_VAL) { if (!vis_emul(regs, insn)) - return; + goto out; } else { struct fpustate *f = FPUSTATE; @@ -2448,7 +2472,7 @@ void do_illegal_instruction(struct pt_regs *regs) * Trap in the %fsr to unimplemented_FPop. */ if (do_mathemu(regs, f, true)) - return; + goto out; } } } @@ -2458,21 +2482,24 @@ void do_illegal_instruction(struct pt_regs *regs) info.si_addr = (void __user *)pc; info.si_trapno = 0; force_sig_info(SIGILL, &info, current); +out: + exception_exit(prev_state); } extern void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn); void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr) { + enum ctx_state prev_state = exception_enter(); siginfo_t info; if (notify_die(DIE_TRAP, "memory address unaligned", regs, 0, 0x34, SIGSEGV) == NOTIFY_STOP) - return; + goto out; if (regs->tstate & TSTATE_PRIV) { kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc)); - return; + goto out; } info.si_signo = SIGBUS; info.si_errno = 0; @@ -2480,6 +2507,8 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo info.si_addr = (void __user *)sfar; info.si_trapno = 0; force_sig_info(SIGBUS, &info, current); +out: + exception_exit(prev_state); } void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx) @@ -2504,11 +2533,12 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c void do_privop(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); siginfo_t info; if (notify_die(DIE_TRAP, "privileged operation", regs, 0, 0x11, SIGILL) == NOTIFY_STOP) - return; + goto out; if (test_thread_flag(TIF_32BIT)) { regs->tpc &= 0xffffffff; @@ -2520,6 +2550,8 @@ void do_privop(struct pt_regs *regs) info.si_addr = (void __user *)regs->tpc; info.si_trapno = 0; force_sig_info(SIGILL, &info, current); +out: + exception_exit(prev_state); } void do_privact(struct pt_regs *regs) @@ -2530,99 +2562,116 @@ void do_privact(struct pt_regs *regs) /* Trap level 1 stuff or other traps we should never see... */ void do_cee(struct pt_regs *regs) { + exception_enter(); die_if_kernel("TL0: Cache Error Exception", regs); } void do_cee_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: Cache Error Exception", regs); } void do_dae_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: Data Access Exception", regs); } void do_iae_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: Instruction Access Exception", regs); } void do_div0_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: DIV0 Exception", regs); } void do_fpdis_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: FPU Disabled", regs); } void do_fpieee_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: FPU IEEE Exception", regs); } void do_fpother_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: FPU Other Exception", regs); } void do_ill_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: Illegal Instruction Exception", regs); } void do_irq_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: IRQ Exception", regs); } void do_lddfmna_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: LDDF Exception", regs); } void do_stdfmna_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: STDF Exception", regs); } void do_paw(struct pt_regs *regs) { + exception_enter(); die_if_kernel("TL0: Phys Watchpoint Exception", regs); } void do_paw_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: Phys Watchpoint Exception", regs); } void do_vaw(struct pt_regs *regs) { + exception_enter(); die_if_kernel("TL0: Virt Watchpoint Exception", regs); } void do_vaw_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: Virt Watchpoint Exception", regs); } void do_tof_tl1(struct pt_regs *regs) { + exception_enter(); dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); die_if_kernel("TL1: Tag Overflow Exception", regs); } diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index a313e4a9399..14158d40ba7 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -75,7 +75,7 @@ tsb_miss_page_table_walk: mov 512, %g7 andn %g5, 0x7, %g5 sllx %g7, %g6, %g7 - srlx %g4, HPAGE_SHIFT, %g6 + srlx %g4, REAL_HPAGE_SHIFT, %g6 sub %g7, 1, %g7 and %g6, %g7, %g6 sllx %g6, 4, %g6 diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c index 8201c25e766..3c1a7cb3157 100644 --- a/arch/sparc/kernel/unaligned_64.c +++ b/arch/sparc/kernel/unaligned_64.c @@ -21,9 +21,12 @@ #include <linux/bitops.h> #include <linux/perf_event.h> #include <linux/ratelimit.h> +#include <linux/context_tracking.h> #include <asm/fpumacro.h> #include <asm/cacheflush.h> +#include "entry.h" + enum direction { load, /* ld, ldd, ldh, ldsh */ store, /* st, std, sth, stsh */ @@ -418,9 +421,6 @@ int handle_popc(u32 insn, struct pt_regs *regs) extern void do_fpother(struct pt_regs *regs); extern void do_privact(struct pt_regs *regs); -extern void spitfire_data_access_exception(struct pt_regs *regs, - unsigned long sfsr, - unsigned long sfar); extern void sun4v_data_access_exception(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx); @@ -578,6 +578,7 @@ void handle_ld_nf(u32 insn, struct pt_regs *regs) void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr) { + enum ctx_state prev_state = exception_enter(); unsigned long pc = regs->tpc; unsigned long tstate = regs->tstate; u32 insn; @@ -632,13 +633,16 @@ daex: sun4v_data_access_exception(regs, sfar, sfsr); else spitfire_data_access_exception(regs, sfsr, sfar); - return; + goto out; } advance(regs); +out: + exception_exit(prev_state); } void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr) { + enum ctx_state prev_state = exception_enter(); unsigned long pc = regs->tpc; unsigned long tstate = regs->tstate; u32 insn; @@ -680,7 +684,9 @@ daex: sun4v_data_access_exception(regs, sfar, sfsr); else spitfire_data_access_exception(regs, sfsr, sfar); - return; + goto out; } advance(regs); +out: + exception_exit(prev_state); } diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index 0bacceb1915..932ff90fd76 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -122,6 +122,11 @@ SECTIONS *(.swapper_4m_tsb_phys_patch) __swapper_4m_tsb_phys_patch_end = .; } + .page_offset_shift_patch : { + __page_offset_shift_patch = .; + *(.page_offset_shift_patch) + __page_offset_shift_patch_end = .; + } .popc_3insn_patch : { __popc_3insn_patch = .; *(.popc_3insn_patch) diff --git a/arch/sparc/lib/clear_page.S b/arch/sparc/lib/clear_page.S index 77e531f6c2a..46272dfc26e 100644 --- a/arch/sparc/lib/clear_page.S +++ b/arch/sparc/lib/clear_page.S @@ -37,10 +37,10 @@ _clear_page: /* %o0=dest */ .globl clear_user_page clear_user_page: /* %o0=dest, %o1=vaddr */ lduw [%g6 + TI_PRE_COUNT], %o2 - sethi %uhi(PAGE_OFFSET), %g2 + sethi %hi(PAGE_OFFSET), %g2 sethi %hi(PAGE_SIZE), %o4 - sllx %g2, 32, %g2 + ldx [%g2 + %lo(PAGE_OFFSET)], %g2 sethi %hi(PAGE_KERNEL_LOCKED), %g3 ldx [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 diff --git a/arch/sparc/lib/copy_page.S b/arch/sparc/lib/copy_page.S index 4d2df328e51..dd16c61f326 100644 --- a/arch/sparc/lib/copy_page.S +++ b/arch/sparc/lib/copy_page.S @@ -46,10 +46,10 @@ .type copy_user_page,#function copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ lduw [%g6 + TI_PRE_COUNT], %o4 - sethi %uhi(PAGE_OFFSET), %g2 + sethi %hi(PAGE_OFFSET), %g2 sethi %hi(PAGE_SIZE), %o3 - sllx %g2, 32, %g2 + ldx [%g2 + %lo(PAGE_OFFSET)], %g2 sethi %hi(PAGE_KERNEL_LOCKED), %g3 ldx [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 2ebec263d68..69bb818fdd7 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -21,6 +21,7 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/percpu.h> +#include <linux/context_tracking.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -272,6 +273,7 @@ static void noinline __kprobes bogus_32bit_fault_address(struct pt_regs *regs, asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned int insn = 0; @@ -282,7 +284,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) fault_code = get_thread_fault_code(); if (notify_page_fault(regs)) - return; + goto exit_exception; si_code = SEGV_MAPERR; address = current_thread_info()->fault_address; @@ -313,7 +315,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) /* Valid, no problems... */ } else { bad_kernel_pc(regs, address); - return; + goto exit_exception; } } else flags |= FAULT_FLAG_USER; @@ -430,7 +432,7 @@ good_area: fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) - return; + goto exit_exception; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) @@ -482,6 +484,8 @@ good_area: } #endif +exit_exception: + exception_exit(prev_state); return; /* @@ -494,7 +498,7 @@ bad_area: handle_kernel_fault: do_kernel_fault(regs, si_code, fault_code, insn, address); - return; + goto exit_exception; /* * We ran out of memory, or some other thing happened to us that made @@ -505,7 +509,7 @@ out_of_memory: up_read(&mm->mmap_sem); if (!(regs->tstate & TSTATE_PRIV)) { pagefault_out_of_memory(); - return; + goto exit_exception; } goto handle_kernel_fault; diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 01ee23dd724..c4d3da68b80 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -71,13 +71,12 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, int *nr) { struct page *head, *page, *tail; - u32 mask; int refs; - mask = PMD_HUGE_PRESENT; - if (write) - mask |= PMD_HUGE_WRITE; - if ((pmd_val(pmd) & mask) != mask) + if (!pmd_large(pmd)) + return 0; + + if (write && !pmd_write(pmd)) return 0; refs = 0; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 96399646570..30963178d7e 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -21,8 +21,6 @@ /* Slightly simplified from the non-hugepage variant because by * definition we don't have to worry about any page coloring stuff */ -#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL)) -#define VA_EXCLUDE_END (0xfffff80000000000UL + (1UL << 32UL)) static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp, unsigned long addr, diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index ed82edad1a3..6b643790e4f 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -354,7 +354,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if (mm->context.huge_pte_count && is_hugetlb_pte(pte)) - __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT, + __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT, address, pte_val(pte)); else #endif @@ -1557,6 +1557,96 @@ unsigned long __init find_ecache_flush_span(unsigned long size) return ~0UL; } +unsigned long PAGE_OFFSET; +EXPORT_SYMBOL(PAGE_OFFSET); + +static void __init page_offset_shift_patch_one(unsigned int *insn, unsigned long phys_bits) +{ + unsigned long final_shift; + unsigned int val = *insn; + unsigned int cnt; + + /* We are patching in ilog2(max_supported_phys_address), and + * we are doing so in a manner similar to a relocation addend. + * That is, we are adding the shift value to whatever value + * is in the shift instruction count field already. + */ + cnt = (val & 0x3f); + val &= ~0x3f; + + /* If we are trying to shift >= 64 bits, clear the destination + * register. This can happen when phys_bits ends up being equal + * to MAX_PHYS_ADDRESS_BITS. + */ + final_shift = (cnt + (64 - phys_bits)); + if (final_shift >= 64) { + unsigned int rd = (val >> 25) & 0x1f; + + val = 0x80100000 | (rd << 25); + } else { + val |= final_shift; + } + *insn = val; + + __asm__ __volatile__("flush %0" + : /* no outputs */ + : "r" (insn)); +} + +static void __init page_offset_shift_patch(unsigned long phys_bits) +{ + extern unsigned int __page_offset_shift_patch; + extern unsigned int __page_offset_shift_patch_end; + unsigned int *p; + + p = &__page_offset_shift_patch; + while (p < &__page_offset_shift_patch_end) { + unsigned int *insn = (unsigned int *)(unsigned long)*p; + + page_offset_shift_patch_one(insn, phys_bits); + + p++; + } +} + +static void __init setup_page_offset(void) +{ + unsigned long max_phys_bits = 40; + + if (tlb_type == cheetah || tlb_type == cheetah_plus) { + max_phys_bits = 42; + } else if (tlb_type == hypervisor) { + switch (sun4v_chip_type) { + case SUN4V_CHIP_NIAGARA1: + case SUN4V_CHIP_NIAGARA2: + max_phys_bits = 39; + break; + case SUN4V_CHIP_NIAGARA3: + max_phys_bits = 43; + break; + case SUN4V_CHIP_NIAGARA4: + case SUN4V_CHIP_NIAGARA5: + case SUN4V_CHIP_SPARC64X: + default: + max_phys_bits = 47; + break; + } + } + + if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) { + prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n", + max_phys_bits); + prom_halt(); + } + + PAGE_OFFSET = PAGE_OFFSET_BY_BITS(max_phys_bits); + + pr_info("PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n", + PAGE_OFFSET, max_phys_bits); + + page_offset_shift_patch(max_phys_bits); +} + static void __init tsb_phys_patch(void) { struct tsb_ldquad_phys_patch_entry *pquad; @@ -1722,7 +1812,7 @@ static void __init sun4v_linear_pte_xor_finalize(void) #ifndef CONFIG_DEBUG_PAGEALLOC if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) { kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ - 0xfffff80000000000UL; + PAGE_OFFSET; kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | _PAGE_W_4V); } else { @@ -1731,7 +1821,7 @@ static void __init sun4v_linear_pte_xor_finalize(void) if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) { kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^ - 0xfffff80000000000UL; + PAGE_OFFSET; kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | _PAGE_W_4V); } else { @@ -1740,7 +1830,7 @@ static void __init sun4v_linear_pte_xor_finalize(void) if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) { kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^ - 0xfffff80000000000UL; + PAGE_OFFSET; kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | _PAGE_W_4V); } else { @@ -1752,7 +1842,7 @@ static void __init sun4v_linear_pte_xor_finalize(void) /* paging_init() sets up the page tables */ static unsigned long last_valid_pfn; -pgd_t swapper_pg_dir[2048]; +pgd_t swapper_pg_dir[PTRS_PER_PGD]; static void sun4u_pgprot_init(void); static void sun4v_pgprot_init(void); @@ -1763,6 +1853,8 @@ void __init paging_init(void) unsigned long real_end, i; int node; + setup_page_offset(); + /* These build time checkes make sure that the dcache_dirty_cpu() * page->flags usage will work. * @@ -2261,10 +2353,10 @@ static void __init sun4u_pgprot_init(void) __ACCESS_BITS_4U | _PAGE_E_4U); #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ - 0xfffff80000000000UL; + PAGE_OFFSET; #endif kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U | _PAGE_W_4U); @@ -2308,10 +2400,10 @@ static void __init sun4v_pgprot_init(void) _PAGE_CACHE = _PAGE_CACHE_4V; #ifdef CONFIG_DEBUG_PAGEALLOC - kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; + kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; #else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ - 0xfffff80000000000UL; + PAGE_OFFSET; #endif kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | _PAGE_W_4V); @@ -2455,53 +2547,13 @@ void __flush_tlb_all(void) : : "r" (pstate)); } -static pte_t *get_from_cache(struct mm_struct *mm) -{ - struct page *page; - pte_t *ret; - - spin_lock(&mm->page_table_lock); - page = mm->context.pgtable_page; - ret = NULL; - if (page) { - void *p = page_address(page); - - mm->context.pgtable_page = NULL; - - ret = (pte_t *) (p + (PAGE_SIZE / 2)); - } - spin_unlock(&mm->page_table_lock); - - return ret; -} - -static struct page *__alloc_for_cache(struct mm_struct *mm) -{ - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); - - if (page) { - spin_lock(&mm->page_table_lock); - if (!mm->context.pgtable_page) { - atomic_set(&page->_count, 2); - mm->context.pgtable_page = page; - } - spin_unlock(&mm->page_table_lock); - } - return page; -} - pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - struct page *page; - pte_t *pte; - - pte = get_from_cache(mm); - if (pte) - return pte; + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | + __GFP_REPEAT | __GFP_ZERO); + pte_t *pte = NULL; - page = __alloc_for_cache(mm); if (page) pte = (pte_t *) page_address(page); @@ -2511,36 +2563,30 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page; - pte_t *pte; - - pte = get_from_cache(mm); - if (pte) - return pte; + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | + __GFP_REPEAT | __GFP_ZERO); + pte_t *pte = NULL; - page = __alloc_for_cache(mm); - if (page) { - pgtable_page_ctor(page); - pte = (pte_t *) page_address(page); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + free_hot_cold_page(page, 0); + return NULL; } - - return pte; + return (pte_t *) page_address(page); } void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - struct page *page = virt_to_page(pte); - if (put_page_testzero(page)) - free_hot_cold_page(page, 0); + free_page((unsigned long)pte); } static void __pte_free(pgtable_t pte) { struct page *page = virt_to_page(pte); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } + + pgtable_page_dtor(page); + __free_page(page); } void pte_free(struct mm_struct *mm, pgtable_t pte) @@ -2557,124 +2603,27 @@ void pgtable_free(void *table, bool is_page) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify) -{ - if (pgprot_val(pgprot) & _PAGE_VALID) - pmd_val(pmd) |= PMD_HUGE_PRESENT; - if (tlb_type == hypervisor) { - if (pgprot_val(pgprot) & _PAGE_WRITE_4V) - pmd_val(pmd) |= PMD_HUGE_WRITE; - if (pgprot_val(pgprot) & _PAGE_EXEC_4V) - pmd_val(pmd) |= PMD_HUGE_EXEC; - - if (!for_modify) { - if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V) - pmd_val(pmd) |= PMD_HUGE_ACCESSED; - if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V) - pmd_val(pmd) |= PMD_HUGE_DIRTY; - } - } else { - if (pgprot_val(pgprot) & _PAGE_WRITE_4U) - pmd_val(pmd) |= PMD_HUGE_WRITE; - if (pgprot_val(pgprot) & _PAGE_EXEC_4U) - pmd_val(pmd) |= PMD_HUGE_EXEC; - - if (!for_modify) { - if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U) - pmd_val(pmd) |= PMD_HUGE_ACCESSED; - if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U) - pmd_val(pmd) |= PMD_HUGE_DIRTY; - } - } - - return pmd; -} - -pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) -{ - pmd_t pmd; - - pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT))); - pmd_val(pmd) |= PMD_ISHUGE; - pmd = pmd_set_protbits(pmd, pgprot, false); - return pmd; -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - pmd_val(pmd) &= ~(PMD_HUGE_PRESENT | - PMD_HUGE_WRITE | - PMD_HUGE_EXEC); - pmd = pmd_set_protbits(pmd, newprot, true); - return pmd; -} - -pgprot_t pmd_pgprot(pmd_t entry) -{ - unsigned long pte = 0; - - if (pmd_val(entry) & PMD_HUGE_PRESENT) - pte |= _PAGE_VALID; - - if (tlb_type == hypervisor) { - if (pmd_val(entry) & PMD_HUGE_PRESENT) - pte |= _PAGE_PRESENT_4V; - if (pmd_val(entry) & PMD_HUGE_EXEC) - pte |= _PAGE_EXEC_4V; - if (pmd_val(entry) & PMD_HUGE_WRITE) - pte |= _PAGE_W_4V; - if (pmd_val(entry) & PMD_HUGE_ACCESSED) - pte |= _PAGE_ACCESSED_4V; - if (pmd_val(entry) & PMD_HUGE_DIRTY) - pte |= _PAGE_MODIFIED_4V; - pte |= _PAGE_CP_4V|_PAGE_CV_4V; - } else { - if (pmd_val(entry) & PMD_HUGE_PRESENT) - pte |= _PAGE_PRESENT_4U; - if (pmd_val(entry) & PMD_HUGE_EXEC) - pte |= _PAGE_EXEC_4U; - if (pmd_val(entry) & PMD_HUGE_WRITE) - pte |= _PAGE_W_4U; - if (pmd_val(entry) & PMD_HUGE_ACCESSED) - pte |= _PAGE_ACCESSED_4U; - if (pmd_val(entry) & PMD_HUGE_DIRTY) - pte |= _PAGE_MODIFIED_4U; - pte |= _PAGE_CP_4U|_PAGE_CV_4U; - } - - return __pgprot(pte); -} - void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { unsigned long pte, flags; struct mm_struct *mm; pmd_t entry = *pmd; - pgprot_t prot; if (!pmd_large(entry) || !pmd_young(entry)) return; - pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS); - pte <<= PMD_PADDR_SHIFT; - pte |= _PAGE_VALID; - - prot = pmd_pgprot(entry); - - if (tlb_type == hypervisor) - pgprot_val(prot) |= _PAGE_SZHUGE_4V; - else - pgprot_val(prot) |= _PAGE_SZHUGE_4U; + pte = pmd_val(entry); - pte |= pgprot_val(prot); + /* We are fabricating 8MB pages using 4MB real hw pages. */ + pte |= (addr & (1UL << REAL_HPAGE_SHIFT)); mm = vma->vm_mm; spin_lock_irqsave(&mm->context.lock, flags); if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) - __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT, + __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT, addr, pte); spin_unlock_irqrestore(&mm->context.lock, flags); diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h index 0661aa606de..5d3782deb40 100644 --- a/arch/sparc/mm/init_64.h +++ b/arch/sparc/mm/init_64.h @@ -1,11 +1,13 @@ #ifndef _SPARC64_MM_INIT_H #define _SPARC64_MM_INIT_H +#include <asm/page.h> + /* Most of the symbols in this file are defined in init.c and * marked non-static so that assembler code can get at them. */ -#define MAX_PHYS_ADDRESS (1UL << 41UL) +#define MAX_PHYS_ADDRESS (1UL << MAX_PHYS_ADDRESS_BITS) #define KPTE_BITMAP_CHUNK_SZ (256UL * 1024UL * 1024UL) #define KPTE_BITMAP_BYTES \ ((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 5d721df48a7..869023abe5a 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -345,7 +345,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) if ((pte = (unsigned long)pte_alloc_one_kernel(mm, address)) == 0) return NULL; page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); - pgtable_page_ctor(page); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } return page; } diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7a91f288c70..ad3bf4b4324 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -161,8 +161,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, if (mm == &init_mm) return; - if ((pmd_val(pmd) ^ pmd_val(orig)) & PMD_ISHUGE) { - if (pmd_val(pmd) & PMD_ISHUGE) + if ((pmd_val(pmd) ^ pmd_val(orig)) & _PAGE_PMD_HUGE) { + if (pmd_val(pmd) & _PAGE_PMD_HUGE) mm->context.huge_pte_count++; else mm->context.huge_pte_count--; @@ -178,13 +178,16 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } if (!pmd_none(orig)) { - bool exec = ((pmd_val(orig) & PMD_HUGE_EXEC) != 0); + pte_t orig_pte = __pte(pmd_val(orig)); + bool exec = pte_exec(orig_pte); addr &= HPAGE_MASK; - if (pmd_val(orig) & PMD_ISHUGE) + if (pmd_trans_huge(orig)) { tlb_batch_add_one(mm, addr, exec); - else + tlb_batch_add_one(mm, addr + REAL_HPAGE_SIZE, exec); + } else { tlb_batch_pmd_scan(mm, addr, orig, exec); + } } } @@ -196,11 +199,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -211,12 +214,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = (pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } pte_val(pgtable[0]) = 0; diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index 2cc3bce5ee9..3b3a360b429 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -87,7 +87,7 @@ void flush_tsb_user(struct tlb_batch *tb) nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) base = __pa(base); - __flush_tsb_one(tb, HPAGE_SHIFT, base, nentries); + __flush_tsb_one(tb, REAL_HPAGE_SHIFT, base, nentries); } #endif spin_unlock_irqrestore(&mm->context.lock, flags); @@ -111,7 +111,7 @@ void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr) nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) base = __pa(base); - __flush_tsb_one_entry(base, vaddr, HPAGE_SHIFT, nentries); + __flush_tsb_one_entry(base, vaddr, REAL_HPAGE_SHIFT, nentries); } #endif spin_unlock_irqrestore(&mm->context.lock, flags); @@ -472,8 +472,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) mm->context.huge_pte_count = 0; #endif - mm->context.pgtable_page = NULL; - /* copy_mm() copies over the parent's mm_struct before calling * us, so we need to zero out the TSB pointer or else tsb_grow() * will be confused and think there is an older TSB to free up. @@ -512,17 +510,10 @@ static void tsb_destroy_one(struct tsb_config *tp) void destroy_context(struct mm_struct *mm) { unsigned long flags, i; - struct page *page; for (i = 0; i < MM_NUM_TSBS; i++) tsb_destroy_one(&mm->context.tsb_block[i]); - page = mm->context.pgtable_page; - if (page && put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } - spin_lock_irqsave(&ctx_alloc_lock, flags); if (CTX_VALID(mm->context)) { diff --git a/arch/sparc/mm/ultra.S b/arch/sparc/mm/ultra.S index 432aa0cb1b3..b4f4733abc6 100644 --- a/arch/sparc/mm/ultra.S +++ b/arch/sparc/mm/ultra.S @@ -153,10 +153,10 @@ __spitfire_flush_tlb_mm_slow: .globl __flush_icache_page __flush_icache_page: /* %o0 = phys_page */ srlx %o0, PAGE_SHIFT, %o0 - sethi %uhi(PAGE_OFFSET), %g1 + sethi %hi(PAGE_OFFSET), %g1 sllx %o0, PAGE_SHIFT, %o0 sethi %hi(PAGE_SIZE), %g2 - sllx %g1, 32, %g1 + ldx [%g1 + %lo(PAGE_OFFSET)], %g1 add %o0, %g1, %o0 1: subcc %g2, 32, %g2 bne,pt %icc, 1b @@ -178,8 +178,8 @@ __flush_icache_page: /* %o0 = phys_page */ .align 64 .globl __flush_dcache_page __flush_dcache_page: /* %o0=kaddr, %o1=flush_icache */ - sethi %uhi(PAGE_OFFSET), %g1 - sllx %g1, 32, %g1 + sethi %hi(PAGE_OFFSET), %g1 + ldx [%g1 + %lo(PAGE_OFFSET)], %g1 sub %o0, %g1, %o0 ! physical address srlx %o0, 11, %o0 ! make D-cache TAG sethi %hi(1 << 14), %o2 ! D-cache size @@ -287,8 +287,8 @@ __cheetah_flush_tlb_pending: /* 27 insns */ #ifdef DCACHE_ALIASING_POSSIBLE __cheetah_flush_dcache_page: /* 11 insns */ - sethi %uhi(PAGE_OFFSET), %g1 - sllx %g1, 32, %g1 + sethi %hi(PAGE_OFFSET), %g1 + ldx [%g1 + %lo(PAGE_OFFSET)], %g1 sub %o0, %g1, %o0 sethi %hi(PAGE_SIZE), %o4 1: subcc %o4, (1 << 5), %o4 diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index d45a2c48f18..b3692ce78f9 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -8,7 +8,6 @@ config TILE select HAVE_KVM if !TILEGX select GENERIC_FIND_FIRST_BIT select SYSCTL_EXCEPTION_TRACE - select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE select HAVE_DEBUG_KMEMLEAK select GENERIC_IRQ_PROBE diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 4fd9ec0b58e..5e86eac4bfa 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -241,6 +241,11 @@ struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address, if (p == NULL) return NULL; + if (!pgtable_page_ctor(p)) { + __free_pages(p, L2_USER_PGTABLE_ORDER); + return NULL; + } + /* * Make every page have a page_count() of one, not just the first. * We don't use __GFP_COMP since it doesn't look like it works @@ -251,7 +256,6 @@ struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address, inc_zone_page_state(p+i, NR_PAGETABLE); } - pgtable_page_ctor(p); return p; } diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 7ddb64baf32..8636e905426 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -279,8 +279,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *pte; pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (pte) - pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 0213e373a89..2e02d1356fd 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -51,12 +51,14 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr) struct page *pte; pte = alloc_pages(PGALLOC_GFP, 0); - if (pte) { - if (!PageHighMem(pte)) { - void *page = page_address(pte); - clean_dcache_area(page, PTRS_PER_PTE * sizeof(pte_t)); - } - pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!PageHighMem(pte)) { + void *page = page_address(pte); + clean_dcache_area(page, PTRS_PER_PTE * sizeof(pte_t)); + } + if (!pgtable_page_ctor(pte)) { + __free_page(pte); } return pte; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6e3e1cb3f6a..83f521aa103 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -90,7 +90,6 @@ config X86 select GENERIC_IRQ_SHOW select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING - select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE select CLKEVT_I8253 @@ -1885,6 +1884,10 @@ config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + def_bool y + depends on X86_64 || X86_PAE + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 15f960c06ff..24ec1216596 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -274,13 +274,17 @@ struct x86_emulate_ctxt { bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ - bool only_vendor_specific_insn; + bool ud; /* inject an #UD if host doesn't support insn */ bool have_exception; struct x86_exception exception; - /* decode cache */ - u8 twobyte; + /* + * decode cache + */ + + /* current opcode length in bytes */ + u8 opcode_len; u8 b; u8 intercept; u8 lock_prefix; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c76ff74a98f..ae5d7830855 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -79,6 +79,13 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + #define SELECTOR_TI_MASK (1 << 2) #define SELECTOR_RPL_MASK 0x03 @@ -253,7 +260,6 @@ struct kvm_pio_request { * mode. */ struct kvm_mmu { - void (*new_cr3)(struct kvm_vcpu *vcpu); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); @@ -261,7 +267,6 @@ struct kvm_mmu { bool prefault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); - void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, struct x86_exception *exception); gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); @@ -389,6 +394,8 @@ struct kvm_vcpu_arch { struct fpu guest_fpu; u64 xcr0; + u64 guest_supported_xcr0; + u32 guest_xstate_size; struct kvm_pio_request pio; void *pio_data; @@ -557,7 +564,9 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; - int iommu_flags; + bool iommu_noncoherent; +#define __KVM_HAVE_ARCH_NONCOHERENT_DMA + atomic_t noncoherent_dma_count; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; @@ -780,11 +789,11 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_setup(struct kvm_vcpu *vcpu); +void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); +void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -922,13 +931,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); void kvm_enable_tdp(void); void kvm_disable_tdp(void); -int complete_pio(struct kvm_vcpu *vcpu); -bool kvm_check_iopl(struct kvm_vcpu *vcpu); - static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) { return gpa; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index b4389a468fb..c4412e972bb 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -80,12 +80,21 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #if PAGETABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + struct page *page; + page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_pages(page, 0); + return NULL; + } + return (pmd_t *)page_address(page); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + pgtable_pmd_page_dtor(virt_to_page(pmd)); free_page((unsigned long)pmd); } diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index be8269b00e2..d6b078e9fa2 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -14,6 +14,8 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct timespec *ts); void pvclock_resume(void); +void pvclock_touch_watchdogs(void); + /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h new file mode 100644 index 00000000000..7f02fe4e2c7 --- /dev/null +++ b/arch/x86/include/asm/xen/page-coherent.h @@ -0,0 +1,38 @@ +#ifndef _ASM_X86_XEN_PAGE_COHERENT_H +#define _ASM_X86_XEN_PAGE_COHERENT_H + +#include <asm/page.h> +#include <linux/dma-attrs.h> +#include <linux/dma-mapping.h> + +static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) +{ + void *vstart = (void*)__get_free_pages(flags, get_order(size)); + *dma_handle = virt_to_phys(vstart); + return vstart; +} + +static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +static inline void xen_dma_map_page(struct device *hwdev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { } + +static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { } + +static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) { } + +static inline void xen_dma_sync_single_for_device(struct device *hwdev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) { } + +#endif /* _ASM_X86_XEN_PAGE_COHERENT_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5d9a3033b3d..d3a87780c70 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 { __u32 padding[3]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) /* for KVM_SET_CPUID2 */ struct kvm_cpuid2 { diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index bb0465090ae..b93e09a0fa2 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -536,6 +536,7 @@ /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1570e074134..e6041094ff2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -139,6 +139,7 @@ bool kvm_check_and_clear_guest_paused(void) src = &hv_clock[cpu].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { src->flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); ret = true; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index a16bae3f83b..2f355d229a5 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -43,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) return pv_tsc_khz; } +void pvclock_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + reset_hung_task_detector(); +} + static atomic64_t last_value = ATOMIC64_INIT(0); void pvclock_resume(void) @@ -74,6 +82,11 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) version = __pvclock_read_cycles(src, &ret, &flags); } while ((src->version & 1) || version != src->version); + if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { + src->flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); + } + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && (flags & PVCLOCK_TSC_STABLE_BIT)) return ret; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a47a3e54b96..b89c5db2b83 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -38,6 +38,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_VFIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index bf4fb04d011..25d22b2d650 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -9,7 +9,7 @@ KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b110fe6c03d..c6976257eff 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -23,6 +23,26 @@ #include "mmu.h" #include "trace.h" +static u32 xstate_required_size(u64 xstate_bv) +{ + int feature_bit = 0; + u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + + xstate_bv &= ~XSTATE_FPSSE; + while (xstate_bv) { + if (xstate_bv & 0x1) { + u32 eax, ebx, ecx, edx; + cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); + ret = max(ret, eax + ebx); + } + + xstate_bv >>= 1; + feature_bit++; + } + + return ret; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -46,6 +66,18 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (!best) { + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + } else { + vcpu->arch.guest_supported_xcr0 = + (best->eax | ((u64)best->edx << 32)) & + host_xcr0 & KVM_SUPPORTED_XCR0; + vcpu->arch.guest_xstate_size = + xstate_required_size(vcpu->arch.guest_supported_xcr0); + } + kvm_pmu_cpuid_update(vcpu); } @@ -182,13 +214,35 @@ static bool supported_xcr0_bit(unsigned bit) { u64 mask = ((u64)1 << bit); - return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; + return mask & KVM_SUPPORTED_XCR0 & host_xcr0; } #define F(x) bit(X86_FEATURE_##x) -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, + u32 func, u32 index, int *nent, int maxnent) +{ + switch (func) { + case 0: + entry->eax = 1; /* only one leaf currently */ + ++*nent; + break; + case 1: + entry->ecx = F(MOVBE); + ++*nent; + break; + default: + break; + } + + entry->function = func; + entry->index = index; + + return 0; +} + +static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) { int r; unsigned f_nx = is_efer_nx() ? F(NX) : 0; @@ -383,6 +437,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 0xd: { int idx, i; + entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0; + entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { if (*nent >= maxnent) @@ -481,6 +537,15 @@ out: return r; } +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, + u32 idx, int *nent, int maxnent, unsigned int type) +{ + if (type == KVM_GET_EMULATED_CPUID) + return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); + + return __do_cpuid_ent(entry, func, idx, nent, maxnent); +} + #undef F struct kvm_cpuid_param { @@ -495,8 +560,36 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param) return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, + __u32 num_entries, unsigned int ioctl_type) +{ + int i; + __u32 pad[3]; + + if (ioctl_type != KVM_GET_EMULATED_CPUID) + return false; + + /* + * We want to make sure that ->padding is being passed clean from + * userspace in case we want to use it for something in the future. + * + * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we + * have to give ourselves satisfied only with the emulated side. /me + * sheds a tear. + */ + for (i = 0; i < num_entries; i++) { + if (copy_from_user(pad, entries[i].padding, sizeof(pad))) + return true; + + if (pad[0] || pad[1] || pad[2]) + return true; + } + return false; +} + +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; @@ -513,8 +606,12 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, goto out; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; + + if (sanity_check_entries(entries, cpuid->nent, type)) + return -EINVAL; + r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); if (!cpuid_entries) goto out; @@ -526,7 +623,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, continue; r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -537,7 +634,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -661,6 +758,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) *edx = best->edx; } else *eax = *ebx = *ecx = *edx = 0; + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); } EXPORT_SYMBOL_GPL(kvm_cpuid); @@ -676,6 +774,5 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, eax, ebx, ecx, edx); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b7fd0798488..f1e4895174b 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,8 +6,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ddc3f3d2afd..07ffca0a89e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -130,7 +130,7 @@ #define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ -#define VendorSpecific (1<<22) /* Vendor specific instruction */ +#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -785,9 +785,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, * @highbyte_regs specifies whether to decode AH,CH,DH,BH. */ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, - int highbyte_regs) + int byteop) { void *p; + int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; @@ -1024,7 +1025,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { unsigned reg = ctxt->modrm_reg; - int highbyte_regs = ctxt->rex_prefix == 0; if (!(ctxt->d & ModRM)) reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); @@ -1045,13 +1045,9 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } op->type = OP_REG; - if (ctxt->d & ByteOp) { - op->addr.reg = decode_register(ctxt, reg, highbyte_regs); - op->bytes = 1; - } else { - op->addr.reg = decode_register(ctxt, reg, 0); - op->bytes = ctxt->op_bytes; - } + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); + fetch_register_operand(op); op->orig_val = op->val; } @@ -1082,12 +1078,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3) { - int highbyte_regs = ctxt->rex_prefix == 0; - op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, - highbyte_regs && (ctxt->d & ByteOp)); + ctxt->d & ByteOp); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; @@ -2961,6 +2955,46 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +#define FFL(x) bit(X86_FEATURE_##x) + +static int em_movbe(struct x86_emulate_ctxt *ctxt) +{ + u32 ebx, ecx, edx, eax = 1; + u16 tmp; + + /* + * Check MOVBE is set in the guest-visible CPUID leaf. + */ + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(ecx & FFL(MOVBE))) + return emulate_ud(ctxt); + + switch (ctxt->op_bytes) { + case 2: + /* + * From MOVBE definition: "...When the operand size is 16 bits, + * the upper word of the destination register remains unchanged + * ..." + * + * Both casting ->valptr and ->val to u16 breaks strict aliasing + * rules so we have to do the operation almost per hand. + */ + tmp = (u16)ctxt->src.val; + ctxt->dst.val &= ~0xffffUL; + ctxt->dst.val |= (unsigned long)swab16(tmp); + break; + case 4: + ctxt->dst.val = swab32((u32)ctxt->src.val); + break; + case 8: + ctxt->dst.val = swab64(ctxt->src.val); + break; + default: + return X86EMUL_PROPAGATE_FAULT; + } + return X86EMUL_CONTINUE; +} + static int em_cr_write(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) @@ -3256,6 +3290,18 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_sahf(struct x86_emulate_ctxt *ctxt) +{ + u32 flags; + + flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; + flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; + + ctxt->eflags &= ~0xffUL; + ctxt->eflags |= flags | X86_EFLAGS_FIXED; + return X86EMUL_CONTINUE; +} + static int em_lahf(struct x86_emulate_ctxt *ctxt) { *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; @@ -3502,7 +3548,7 @@ static const struct opcode group7_rm1[] = { static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), + II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | Prot | Priv, stgi, check_svme), @@ -3587,7 +3633,7 @@ static const struct group_dual group7 = { { II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | Priv | VendorSpecific, em_vmcall), + I(SrcNone | Priv | EmulateOnUD, em_vmcall), EXT(0, group7_rm1), N, EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3750,7 +3796,8 @@ static const struct opcode opcode_table[256] = { D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), + II(ImplicitOps | Stack, em_popf, popf), + I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), @@ -3810,7 +3857,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | VendorSpecific, em_syscall), + N, I(ImplicitOps | EmulateOnUD, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, @@ -3830,8 +3877,8 @@ static const struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | VendorSpecific, em_sysenter), - I(ImplicitOps | Priv | VendorSpecific, em_sysexit), + I(ImplicitOps | EmulateOnUD, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -3892,6 +3939,30 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; +static const struct gprefix three_byte_0f_38_f0 = { + I(DstReg | SrcMem | Mov, em_movbe), N, N, N +}; + +static const struct gprefix three_byte_0f_38_f1 = { + I(DstMem | SrcReg | Mov, em_movbe), N, N, N +}; + +/* + * Insns below are selected by the prefix which indexed by the third opcode + * byte. + */ +static const struct opcode opcode_map_0f_38[256] = { + /* 0x00 - 0x7f */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0x80 - 0xef */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0xf0 - 0xf1 */ + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0), + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1), + /* 0xf2 - 0xff */ + N, N, X4(N), X8(N) +}; + #undef D #undef N #undef G @@ -4040,7 +4111,8 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpMem8: ctxt->memop.bytes = 1; if (ctxt->memop.type == OP_REG) { - ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1); + ctxt->memop.addr.reg = decode_register(ctxt, + ctxt->modrm_rm, true); fetch_register_operand(&ctxt->memop); } goto mem_common; @@ -4126,6 +4198,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->_eip = ctxt->eip; ctxt->fetch.start = ctxt->_eip; ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); @@ -4208,9 +4281,16 @@ done_prefixes: opcode = opcode_table[ctxt->b]; /* Two-byte opcode? */ if (ctxt->b == 0x0f) { - ctxt->twobyte = 1; + ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; + + /* 0F_38 opcode map */ + if (ctxt->b == 0x38) { + ctxt->opcode_len = 3; + ctxt->b = insn_fetch(u8, ctxt); + opcode = opcode_map_0f_38[ctxt->b]; + } } ctxt->d = opcode.flags; @@ -4267,7 +4347,7 @@ done_prefixes: if (ctxt->d == 0 || (ctxt->d & NotImpl)) return EMULATION_FAILED; - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + if (!(ctxt->d & EmulateOnUD) && ctxt->ud) return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) @@ -4540,8 +4620,10 @@ special_insn: goto writeback; } - if (ctxt->twobyte) + if (ctxt->opcode_len == 2) goto twobyte_insn; + else if (ctxt->opcode_len == 3) + goto threebyte_insn; switch (ctxt->b) { case 0x63: /* movsxd */ @@ -4726,6 +4808,8 @@ twobyte_insn: goto cannot_emulate; } +threebyte_insn: + if (rc != X86EMUL_CONTINUE) goto done; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dce0df8150d..40772ef0f2b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2570,11 +2570,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_release_pfn_clean(pfn); } -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -3424,18 +3419,11 @@ out_unlock: return 0; } -static void nonpaging_free(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3444,7 +3432,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->root_hpa = INVALID_PAGE; context->direct_map = true; context->nx = false; - return 0; } void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) @@ -3454,9 +3441,8 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); -static void paging_new_cr3(struct kvm_vcpu *vcpu) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); mmu_free_roots(vcpu); } @@ -3471,11 +3457,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { @@ -3665,9 +3646,9 @@ static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->last_pte_bitmap = map; } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static void paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { context->nx = is_nx(vcpu); context->root_level = level; @@ -3677,27 +3658,24 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; - context->free = paging_free; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { context->nx = false; context->root_level = PT32_ROOT_LEVEL; @@ -3706,33 +3684,28 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } -static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = vcpu->arch.walk_mmu; context->base_role.word = 0; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3767,37 +3740,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - - return 0; } -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - int r; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu, context); + nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu, context); + paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu, context); + paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu, context); + paging32_init_context(vcpu, context); vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); - - return r; } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { ASSERT(vcpu); @@ -3806,37 +3774,30 @@ int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; - context->new_cr3 = paging_new_cr3; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; - context->free = paging_free; context->root_level = context->shadow_root_level; context->root_hpa = INVALID_PAGE; context->direct_map = false; update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); - - return 0; } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - + kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.walk_mmu->get_cr3 = get_cr3; vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - - return r; } -static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; @@ -3873,11 +3834,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); - - return 0; } -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); @@ -3887,18 +3846,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) return init_kvm_softmmu(vcpu); } -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - /* mmu.free() should set root_hpa = INVALID_PAGE */ - vcpu->arch.mmu.free(vcpu); -} -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); + init_kvm_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -3923,6 +3876,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); + WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4281,12 +4235,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) return alloc_mmu_pages(vcpu); } -int kvm_mmu_setup(struct kvm_vcpu *vcpu) +void kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - return init_kvm_mmu(vcpu); + init_kvm_mmu(vcpu); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) @@ -4428,7 +4382,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -4478,9 +4432,8 @@ unlock: break; } - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return freed; - } static unsigned long @@ -4574,7 +4527,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - destroy_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 77e044a0f5f..29261527435 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -70,8 +70,8 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c0bc80391e4..c7168a5cff1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1959,11 +1959,9 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, nested_svm_vmexit(svm); } -static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - int r; - - r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; @@ -1971,8 +1969,6 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; } static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2b2fce1b200..b2fe1c252f3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1498,7 +1498,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; if (i == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING"Not enough mst switch entries. " + printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; } else if (i == m->nr) { @@ -1898,16 +1898,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. - * This function assumes it is called with the exit reason in vmcs02 being - * a #PF exception (this is the only case in which KVM injects a #PF when L2 - * is running). */ -static int nested_pf_handled(struct kvm_vcpu *vcpu) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) + if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; nested_vmx_vmexit(vcpu); @@ -1921,8 +1917,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (nr == PF_VECTOR && is_guest_mode(vcpu) && - !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) + if (!reinject && is_guest_mode(vcpu) && + nested_vmx_check_exception(vcpu, nr)) return; if (has_error_code) { @@ -2204,9 +2200,15 @@ static __init void nested_vmx_setup_ctls_msrs(void) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || + !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { + nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + } nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER); + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2226,7 +2228,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); nested_vmx_procbased_ctls_low = 0; nested_vmx_procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | @@ -2252,13 +2255,15 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_WBINVD_EXITING; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_INVEPT_BIT; nested_vmx_ept_caps &= vmx_capability.ept; /* * Since invept is completely emulated we support both global @@ -3380,8 +3385,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : - vcpu->kvm->arch.ept_identity_map_addr; + if (is_paging(vcpu) || is_guest_mode(vcpu)) + guest_cr3 = kvm_read_cr3(vcpu); + else + guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -4879,6 +4886,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +{ + unsigned long always_on = VMXON_CR0_ALWAYSON; + + if (nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + always_on &= ~(X86_CR0_PE | X86_CR0_PG); + return (val & always_on) == always_on; +} + /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -4897,9 +4915,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - /* TODO: will have to take unrestricted guest mode into - * account */ - if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) + if (!nested_cr0_valid(vmcs12, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -6627,6 +6643,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; else if (is_page_fault(intr_info)) return enable_ept; + else if (is_no_device(intr_info) && + !(nested_read_cr0(vmcs12) & X86_CR0_TS)) + return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -6722,6 +6741,27 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 delta_tsc_l1; + u32 preempt_val_l1, preempt_val_l2, preempt_scale; + + if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER)) + return; + preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & + MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; + preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) + - vcpu->arch.last_guest_tsc; + preempt_val_l1 = delta_tsc_l1 >> preempt_scale; + if (preempt_val_l2 <= preempt_val_l1) + preempt_val_l2 = 0; + else + preempt_val_l2 -= preempt_val_l1; + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -6736,20 +6776,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. - */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || - exit_reason == EXIT_REASON_VMRESUME)) - vmx->nested.nested_run_pending = 1; - else - vmx->nested.nested_run_pending = 0; - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { nested_vmx_vmexit(vcpu); return 1; @@ -7061,9 +7087,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); - kvm_queue_exception_e(vcpu, vector, err); + kvm_requeue_exception_e(vcpu, vector, err); } else - kvm_queue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); @@ -7146,6 +7172,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); + if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) + nested_adjust_preemption_timer(vcpu); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -7284,6 +7312,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + /* + * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * we did not inject a still-pending event to L1 now because of + * nested_run_pending, we need to re-enable this bit. + */ + if (vmx->nested.nested_run_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vmx->nested.nested_run_pending = 0; + vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); @@ -7410,8 +7448,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) */ if (is_mmio) ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) ret = kvm_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; else @@ -7501,9 +7538,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; @@ -7511,8 +7548,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -7520,6 +7555,20 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ + if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + nested_vmx_vmexit(vcpu); + else + kvm_inject_page_fault(vcpu, fault); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7533,6 +7582,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; + u32 exit_control; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -7706,7 +7756,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + exit_control = vmcs_config.vmexit_ctrl; + if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) + exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + vmcs_write32(VM_EXIT_CONTROLS, exit_control); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -7773,6 +7826,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + /* * L1 may access the L2's PDPTR, so save them to construct vmcs12 */ @@ -7876,7 +7932,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || + if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -7938,6 +7994,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enter_guest_mode(vcpu); + vmx->nested.nested_run_pending = 1; + vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); cpu = get_cpu(); @@ -8005,7 +8063,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, u32 idt_vectoring; unsigned int nr; - if (vcpu->arch.exception.pending) { + if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { nr = vcpu->arch.exception.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -8023,7 +8081,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } vmcs12->idt_vectoring_info_field = idt_vectoring; - } else if (vcpu->arch.nmi_pending) { + } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; } else if (vcpu->arch.interrupt.pending) { @@ -8105,6 +8163,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && + (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) + vmcs12->vmx_preemption_timer_value = + vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + /* * In some cases (usually, nested EPT), L2 is allowed to change its * own CR3 without exiting. If it has changed it, we must keep it. @@ -8130,6 +8193,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); @@ -8201,7 +8266,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * fpu_active (which may have changed). * Note that vmx_set_cr0 refers to efer set above. */ - kvm_set_cr0(vcpu, vmcs12->host_cr0); + vmx_set_cr0(vcpu, vmcs12->host_cr0); /* * If we did fpu_activate()/fpu_deactivate() during L2's run, we need * to apply the same changes to L1's vmcs. We just set cr0 correctly, @@ -8224,6 +8289,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + if (enable_vpid) { /* * Trivially support vpid by letting L2s share their parent diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5ca72a5cdb..21ef1ba184a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -577,6 +577,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0; + u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) @@ -586,8 +587,16 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) return 1; - if (xcr0 & ~host_xcr0) + + /* + * Do not allow the guest to set bits that we do not support + * saving. However, xcr0 bit 0 is always set, even if the + * emulated CPU does not support XSAVE (see fx_init). + */ + valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; + if (xcr0 & ~valid_bits) return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; return 0; @@ -684,7 +693,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - vcpu->arch.mmu.new_cr3(vcpu); + kvm_mmu_new_cr3(vcpu); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -2564,6 +2573,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: @@ -2673,15 +2683,17 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_SUPPORTED_CPUID: { + case KVM_GET_SUPPORTED_CPUID: + case KVM_GET_EMULATED_CPUID: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + + r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, + ioctl); if (r) goto out; @@ -2715,8 +2727,7 @@ static void wbinvd_ipi(void *garbage) static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); + return kvm_arch_has_noncoherent_dma(vcpu->kvm); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -2984,11 +2995,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) + if (cpu_has_xsave) { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->xsave, - xstate_size); - else { + vcpu->arch.guest_xstate_size); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= + vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; + } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->fxsave, sizeof(struct i387_fxsave_struct)); @@ -3003,10 +3016,19 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) + if (cpu_has_xsave) { + /* + * Here we allow setting states that are not present in + * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility + * with old userspace. + */ + if (xstate_bv & ~KVM_SUPPORTED_XCR0) + return -EINVAL; + if (xstate_bv & ~host_xcr0) + return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, xstate_size); - else { + guest_xsave->region, vcpu->arch.guest_xstate_size); + } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->fxsave, @@ -3042,9 +3064,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, for (i = 0; i < guest_xcrs->nr_xcrs; i++) /* Only support XCR0 currently */ - if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, - guest_xcrs->xcrs[0].value); + guest_xcrs->xcrs[i].value); break; } if (r) @@ -4775,8 +4797,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static void init_decode_cache(struct x86_emulate_ctxt *ctxt) { - memset(&ctxt->twobyte, 0, - (void *)&ctxt->_regs - (void *)&ctxt->twobyte); + memset(&ctxt->opcode_len, 0, + (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); ctxt->fetch.start = 0; ctxt->fetch.end = 0; @@ -5094,8 +5116,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ctxt->have_exception = false; ctxt->perm_ok = false; - ctxt->only_vendor_specific_insn - = emulation_type & EMULTYPE_TRAP_UD; + ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; r = x86_decode_insn(ctxt, insn, insn_len); @@ -5263,7 +5284,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -5273,7 +5294,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -5426,12 +5447,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); atomic_set(&kvm_guest_has_master_clock, 0); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -5945,10 +5966,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = IN_GUEST_MODE; + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* We should set ->mode before check ->requests, * see the comment in make_all_cpus_request. */ - smp_mb(); + smp_mb__after_srcu_read_unlock(); local_irq_disable(); @@ -5958,12 +5981,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = 1; goto cancel_injection; } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); @@ -6688,7 +6710,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (r) return r; kvm_vcpu_reset(vcpu); - r = kvm_mmu_setup(vcpu); + kvm_mmu_setup(vcpu); vcpu_put(vcpu); return r; @@ -6940,6 +6962,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.ia32_tsc_adjust_msr = 0x0; vcpu->arch.pv_time_enabled = false; + + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); @@ -6981,6 +7007,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); @@ -7065,7 +7092,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { int i; @@ -7086,7 +7113,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, } } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { int i; @@ -7283,7 +7311,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) int r; if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - is_error_page(work->page)) + work->wakeup_all) return; r = kvm_mmu_reload(vcpu); @@ -7393,7 +7421,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct x86_exception fault; trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (is_error_page(work->page)) + if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); @@ -7420,6 +7448,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) kvm_x86_ops->interrupt_allowed(vcpu); } +void kvm_arch_register_noncoherent_dma(struct kvm *kvm) +{ + atomic_inc(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); + +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) +{ + atomic_dec(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); + +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e224f7a671b..587fb9ede43 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,6 +122,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) extern u64 host_xcr0; extern struct static_key kvm_no_apic_vcpu; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfa537a03be..a7cccb6d7fe 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *pte; pte = alloc_pages(__userpte_alloc_gfp, 0); - if (pte) - pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } @@ -189,8 +193,10 @@ static void free_pmds(pmd_t *pmds[]) int i; for(i = 0; i < PREALLOCATED_PMDS; i++) - if (pmds[i]) + if (pmds[i]) { + pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + } } static int preallocate_pmds(pmd_t *pmds[]) @@ -200,8 +206,13 @@ static int preallocate_pmds(pmd_t *pmds[]) for(i = 0; i < PREALLOCATED_PMDS; i++) { pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); - if (pmd == NULL) + if (!pmd) failed = true; + if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { + free_page((unsigned long)pmds[i]); + pmd = NULL; + failed = true; + } pmds[i] = pmd; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fdc3ba28ca3..ce563be09cc 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -468,8 +468,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); * 3 PCD PWT UC UC UC * 4 PAT WB WC WB * 5 PAT PWT WC WP WT - * 6 PAT PCD UC- UC UC- - * 7 PAT PCD PWT UC UC UC + * 6 PAT PCD UC- rsv UC- + * 7 PAT PCD PWT UC rsv UC */ void xen_set_pat(u64 pat) @@ -796,8 +796,8 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if USE_SPLIT_PTLOCKS - ptl = __pte_lockptr(page); +#if USE_SPLIT_PTE_PTLOCKS + ptl = ptlock_ptr(page); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -1637,7 +1637,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -1671,7 +1671,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) if (!PageHighMem(page)) { xen_mc_batch(); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); __set_pfn_prot(pfn, PAGE_KERNEL); @@ -2328,12 +2328,14 @@ static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, return success; } -int xen_create_contiguous_region(unsigned long vstart, unsigned int order, - unsigned int address_bits) +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle) { unsigned long *in_frames = discontig_frames, out_frame; unsigned long flags; int success; + unsigned long vstart = (unsigned long)phys_to_virt(pstart); /* * Currently an auto-translated guest will not perform I/O, nor will @@ -2368,15 +2370,17 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, spin_unlock_irqrestore(&xen_reservation_lock, flags); + *dma_handle = virt_to_machine(vstart).maddr; return success ? 0 : -ENOMEM; } EXPORT_SYMBOL_GPL(xen_create_contiguous_region); -void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { unsigned long *out_frames = discontig_frames, in_frame; unsigned long flags; int success; + unsigned long vstart; if (xen_feature(XENFEAT_auto_translated_physmap)) return; @@ -2384,6 +2388,7 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) if (unlikely(order > MAX_CONTIG_ORDER)) return; + vstart = (unsigned long)phys_to_virt(pstart); memset((void *) vstart, 0, PAGE_SIZE << order); spin_lock_irqsave(&xen_reservation_lock, flags); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index a61c7d5811b..2ae8699e876 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -799,10 +799,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, mididx, idx; - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + /* don't track P2M changes in autotranslate guests */ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) return true; - } + if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 969570491c3..0e98e5d241d 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -75,8 +75,10 @@ void __init pci_xen_swiotlb_init(void) xen_swiotlb_init(1, true /* early */); dma_ops = &xen_swiotlb_dma_ops; +#ifdef CONFIG_PCI /* Make sure ACS will be enabled */ pci_request_acs(); +#endif } } @@ -92,8 +94,10 @@ int pci_xen_swiotlb_init_late(void) return rc; dma_ops = &xen_swiotlb_dma_ops; +#ifdef CONFIG_PCI /* Make sure ACS will be enabled */ pci_request_acs(); +#endif return 0; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 09f3059cb00..68c054f59de 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -556,7 +556,7 @@ void xen_enable_syscall(void) } #endif /* CONFIG_X86_64 */ } -void __cpuinit xen_enable_nmi(void) +void xen_enable_nmi(void) { #ifdef CONFIG_X86_64 if (register_callback(CALLBACKTYPE_nmi, nmi)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 31d04758b76..c36b325abd8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -149,7 +149,7 @@ static int xen_smp_intr_init(unsigned int cpu) rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, xen_reschedule_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING, resched_name, NULL); if (rc < 0) @@ -161,7 +161,7 @@ static int xen_smp_intr_init(unsigned int cpu) rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, xen_call_function_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING, callfunc_name, NULL); if (rc < 0) @@ -171,7 +171,7 @@ static int xen_smp_intr_init(unsigned int cpu) debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, - IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING, + IRQF_PERCPU | IRQF_NOBALANCING, debug_name, NULL); if (rc < 0) goto fail; @@ -182,7 +182,7 @@ static int xen_smp_intr_init(unsigned int cpu) rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, xen_call_function_single_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING, callfunc_name, NULL); if (rc < 0) @@ -201,7 +201,7 @@ static int xen_smp_intr_init(unsigned int cpu) rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, cpu, xen_irq_work_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING, callfunc_name, NULL); if (rc < 0) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index be6b8607895..0e36cde12f7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -234,7 +234,7 @@ void xen_init_lock_cpu(int cpu) irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, dummy_handler, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING, name, NULL); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index ee365895b06..12a1ca707b9 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -443,8 +443,7 @@ void xen_setup_timer(int cpu) name = "<timer kasprintf failed>"; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU| - IRQF_NOBALANCING|IRQF_TIMER| + IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| IRQF_FORCE_RESUME, name, NULL); diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index cf914c8c249..d38eb9237e6 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -38,35 +38,46 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } -/* Use a slab cache for the pte pages (see also sparc64 implementation) */ - -extern struct kmem_cache *pgtable_cache; - static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(pgtable_cache, GFP_KERNEL|__GFP_REPEAT); + pte_t *ptep; + int i; + + ptep = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (!ptep) + return NULL; + for (i = 0; i < 1024; i++) + pte_clear(NULL, 0, ptep + i); + return ptep; } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) { + pte_t *pte; struct page *page; - page = virt_to_page(pte_alloc_one_kernel(mm, addr)); - pgtable_page_ctor(page); + pte = pte_alloc_one_kernel(mm, addr); + if (!pte) + return NULL; + page = virt_to_page(pte); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } return page; } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - kmem_cache_free(pgtable_cache, pte); + free_page((unsigned long)pte); } static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { pgtable_page_dtor(pte); - kmem_cache_free(pgtable_cache, page_address(pte)); + __free_page(pte); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 0fdf5d043f8..216446295ad 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -220,12 +220,11 @@ extern unsigned long empty_zero_page[1024]; #ifdef CONFIG_MMU extern pgd_t swapper_pg_dir[PAGE_SIZE/sizeof(pgd_t)]; extern void paging_init(void); -extern void pgtable_cache_init(void); #else # define swapper_pg_dir NULL static inline void paging_init(void) { } -static inline void pgtable_cache_init(void) { } #endif +static inline void pgtable_cache_init(void) { } /* * The pmd contains the kernel virtual address of the pte page. diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index a1077570e38..c43771c974b 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -50,23 +50,3 @@ void __init init_mmu(void) */ set_ptevaddr_register(PGTABLE_START); } - -struct kmem_cache *pgtable_cache __read_mostly; - -static void pgd_ctor(void *addr) -{ - pte_t *ptep = (pte_t *)addr; - int i; - - for (i = 0; i < 1024; i++, ptep++) - pte_clear(NULL, 0, ptep); - -} - -void __init pgtable_cache_init(void) -{ - pgtable_cache = kmem_cache_create("pgd", - PAGE_SIZE, PAGE_SIZE, - SLAB_HWCACHE_ALIGN, - pgd_ctor); -} diff --git a/block/blk-mq.c b/block/blk-mq.c index 88d4e864d4c..c661896e246 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -319,7 +319,7 @@ void __blk_mq_end_io(struct request *rq, int error) blk_mq_complete_request(rq, error); } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#if defined(CONFIG_SMP) /* * Called with interrupts disabled. @@ -361,7 +361,7 @@ static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, return true; } -#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#else /* CONFIG_SMP */ static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, struct request *rq, const int error) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ce4b8bfd3d2..57790c1a97e 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -36,7 +36,7 @@ static void blk_done_softirq(struct softirq_action *h) } } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP static void trigger_softirq(void *data) { struct request *rq = data; @@ -71,7 +71,7 @@ static int raise_blk_irq(int cpu, struct request *rq) return 1; } -#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#else /* CONFIG_SMP */ static int raise_blk_irq(int cpu, struct request *rq) { return 1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4f8c4d90ec7..97779522472 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -288,7 +288,7 @@ static ssize_t queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) { ssize_t ret = -EINVAL; -#if defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP unsigned long val; ret = queue_var_store(&val, page, count); diff --git a/crypto/af_alg.c b/crypto/af_alg.c index ac33d5f3077..966f893711b 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -434,7 +434,7 @@ int af_alg_wait_for_completion(int err, struct af_alg_completion *completion) case -EINPROGRESS: case -EBUSY: wait_for_completion(&completion->completion); - INIT_COMPLETION(completion->completion); + reinit_completion(&completion->completion); err = completion->err; break; }; diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 25a5934f0e5..1ab8258fcf5 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -493,7 +493,7 @@ static inline int do_one_ahash_op(struct ahash_request *req, int ret) ret = wait_for_completion_interruptible(&tr->completion); if (!ret) ret = tr->err; - INIT_COMPLETION(tr->completion); + reinit_completion(&tr->completion); } return ret; } @@ -721,7 +721,7 @@ static inline int do_one_acipher_op(struct ablkcipher_request *req, int ret) ret = wait_for_completion_interruptible(&tr->completion); if (!ret) ret = tr->err; - INIT_COMPLETION(tr->completion); + reinit_completion(&tr->completion); } return ret; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index e091ef6e179..432afc03e7c 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -179,7 +179,7 @@ static int do_one_async_hash_op(struct ahash_request *req, ret = wait_for_completion_interruptible(&tr->completion); if (!ret) ret = tr->err; - INIT_COMPLETION(tr->completion); + reinit_completion(&tr->completion); } return ret; } @@ -336,7 +336,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, ret = wait_for_completion_interruptible( &tresult.completion); if (!ret && !(ret = tresult.err)) { - INIT_COMPLETION(tresult.completion); + reinit_completion(&tresult.completion); break; } /* fall through */ @@ -543,7 +543,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc, ret = wait_for_completion_interruptible( &result.completion); if (!ret && !(ret = result.err)) { - INIT_COMPLETION(result.completion); + reinit_completion(&result.completion); break; } case -EBADMSG: @@ -697,7 +697,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc, ret = wait_for_completion_interruptible( &result.completion); if (!ret && !(ret = result.err)) { - INIT_COMPLETION(result.completion); + reinit_completion(&result.completion); break; } case -EBADMSG: @@ -983,7 +983,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, ret = wait_for_completion_interruptible( &result.completion); if (!ret && !((ret = result.err))) { - INIT_COMPLETION(result.completion); + reinit_completion(&result.completion); break; } /* fall through */ @@ -1086,7 +1086,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, ret = wait_for_completion_interruptible( &result.completion); if (!ret && !((ret = result.err))) { - INIT_COMPLETION(result.completion); + reinit_completion(&result.completion); break; } /* fall through */ diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 77bbc826688..92d7797223b 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -3017,7 +3017,7 @@ static inline void ata_eh_pull_park_action(struct ata_port *ap) * ourselves at the beginning of each pass over the loop. * * Additionally, all write accesses to &ap->park_req_pending - * through INIT_COMPLETION() (see below) or complete_all() + * through reinit_completion() (see below) or complete_all() * (see ata_scsi_park_store()) are protected by the host lock. * As a result we have that park_req_pending.done is zero on * exit from this function, i.e. when ATA_EH_PARK actions for @@ -3031,7 +3031,7 @@ static inline void ata_eh_pull_park_action(struct ata_port *ap) */ spin_lock_irqsave(ap->lock, flags); - INIT_COMPLETION(ap->park_req_pending); + reinit_completion(&ap->park_req_pending); ata_for_each_link(link, ap, EDGE) { ata_for_each_dev(dev, link, ALL) { struct ata_eh_info *ehi = &link->eh_info; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index ee039afe907..c12e9b9556b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -757,7 +757,7 @@ void dpm_resume(pm_message_t state) async_error = 0; list_for_each_entry(dev, &dpm_suspended_list, power.entry) { - INIT_COMPLETION(dev->power.completion); + reinit_completion(&dev->power.completion); if (is_async(dev)) { get_device(dev); async_schedule(async_resume, dev); @@ -1237,7 +1237,7 @@ static void async_suspend(void *data, async_cookie_t cookie) static int device_suspend(struct device *dev) { - INIT_COMPLETION(dev->power.completion); + reinit_completion(&dev->power.completion); if (pm_async_enabled && dev->power.async_suspend) { get_device(dev); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 4ff85b8785e..748dea4f34d 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -343,7 +343,7 @@ static int fd_motor_on(int nr) unit[nr].motor = 1; fd_select(nr); - INIT_COMPLETION(motor_on_completion); + reinit_completion(&motor_on_completion); motor_on_timer.data = nr; mod_timer(&motor_on_timer, jiffies + HZ/2); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 0c004ac0581..b35fc4f5237 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2808,7 +2808,7 @@ resend_cmd2: /* erase the old error information */ memset(c->err_info, 0, sizeof(ErrorInfo_struct)); return_status = IO_OK; - INIT_COMPLETION(wait); + reinit_completion(&wait); goto resend_cmd2; } @@ -3669,7 +3669,7 @@ static int add_to_scan_list(struct ctlr_info *h) } } if (!found && !h->busy_scanning) { - INIT_COMPLETION(h->scan_wait); + reinit_completion(&h->scan_wait); list_add_tail(&h->scan_list, &scan_q); ret = 1; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5cdf88b7ad9..f3be496ac8f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -292,6 +292,8 @@ static void virtblk_done(struct virtqueue *vq) req_done = true; } } + if (unlikely(virtqueue_is_broken(vq))) + break; } while (!virtqueue_enable_cb(vq)); /* In case queue is stopped waiting for more buffers. */ if (req_done) @@ -456,18 +458,15 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) { struct virtio_blk *vblk = bd->bd_disk->private_data; - struct virtio_blk_geometry vgeo; - int err; /* see if the host passed in geometry config */ - err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY, - offsetof(struct virtio_blk_config, geometry), - &vgeo); - - if (!err) { - geo->heads = vgeo.heads; - geo->sectors = vgeo.sectors; - geo->cylinders = vgeo.cylinders; + if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) { + virtio_cread(vblk->vdev, struct virtio_blk_config, + geometry.cylinders, &geo->cylinders); + virtio_cread(vblk->vdev, struct virtio_blk_config, + geometry.heads, &geo->heads); + virtio_cread(vblk->vdev, struct virtio_blk_config, + geometry.sectors, &geo->sectors); } else { /* some standard values, similar to sd */ geo->heads = 1 << 6; @@ -529,8 +528,7 @@ static void virtblk_config_changed_work(struct work_struct *work) goto done; /* Host must always specify the capacity. */ - vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), - &capacity, sizeof(capacity)); + virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity); /* If capacity is too big, truncate with warning. */ if ((sector_t)capacity != capacity) { @@ -608,9 +606,9 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) u8 writeback; int err; - err = virtio_config_val(vdev, VIRTIO_BLK_F_CONFIG_WCE, - offsetof(struct virtio_blk_config, wce), - &writeback); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE, + struct virtio_blk_config, wce, + &writeback); if (err) writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE); @@ -642,7 +640,6 @@ virtblk_cache_type_store(struct device *dev, struct device_attribute *attr, struct virtio_blk *vblk = disk->private_data; struct virtio_device *vdev = vblk->vdev; int i; - u8 writeback; BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE)); for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; ) @@ -652,11 +649,7 @@ virtblk_cache_type_store(struct device *dev, struct device_attribute *attr, if (i < 0) return -EINVAL; - writeback = i; - vdev->config->set(vdev, - offsetof(struct virtio_blk_config, wce), - &writeback, sizeof(writeback)); - + virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i); virtblk_update_cache_mode(vdev); return count; } @@ -699,9 +692,9 @@ static int virtblk_probe(struct virtio_device *vdev) index = err; /* We need to know how many segments before we allocate. */ - err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX, - offsetof(struct virtio_blk_config, seg_max), - &sg_elems); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX, + struct virtio_blk_config, seg_max, + &sg_elems); /* We need at least one SG element, whatever they say. */ if (err || !sg_elems) @@ -772,8 +765,7 @@ static int virtblk_probe(struct virtio_device *vdev) set_disk_ro(vblk->disk, 1); /* Host must always specify the capacity. */ - vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), - &cap, sizeof(cap)); + virtio_cread(vdev, struct virtio_blk_config, capacity, &cap); /* If capacity is too big, truncate with warning. */ if ((sector_t)cap != cap) { @@ -794,46 +786,45 @@ static int virtblk_probe(struct virtio_device *vdev) /* Host can optionally specify maximum segment size and number of * segments. */ - err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX, - offsetof(struct virtio_blk_config, size_max), - &v); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX, + struct virtio_blk_config, size_max, &v); if (!err) blk_queue_max_segment_size(q, v); else blk_queue_max_segment_size(q, -1U); /* Host can optionally specify the block size of the device */ - err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE, - offsetof(struct virtio_blk_config, blk_size), - &blk_size); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, + struct virtio_blk_config, blk_size, + &blk_size); if (!err) blk_queue_logical_block_size(q, blk_size); else blk_size = queue_logical_block_size(q); /* Use topology information if available */ - err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, - offsetof(struct virtio_blk_config, physical_block_exp), - &physical_block_exp); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, physical_block_exp, + &physical_block_exp); if (!err && physical_block_exp) blk_queue_physical_block_size(q, blk_size * (1 << physical_block_exp)); - err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, - offsetof(struct virtio_blk_config, alignment_offset), - &alignment_offset); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, alignment_offset, + &alignment_offset); if (!err && alignment_offset) blk_queue_alignment_offset(q, blk_size * alignment_offset); - err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, - offsetof(struct virtio_blk_config, min_io_size), - &min_io_size); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, min_io_size, + &min_io_size); if (!err && min_io_size) blk_queue_io_min(q, blk_size * min_io_size); - err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, - offsetof(struct virtio_blk_config, opt_io_size), - &opt_io_size); + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, opt_io_size, + &opt_io_size); if (!err && opt_io_size) blk_queue_io_opt(q, blk_size * opt_io_size); @@ -899,7 +890,7 @@ static void virtblk_remove(struct virtio_device *vdev) ida_simple_remove(&vd_index_ida, index); } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int virtblk_freeze(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; @@ -959,7 +950,7 @@ static struct virtio_driver virtio_blk = { .probe = virtblk_probe, .remove = virtblk_remove, .config_changed = virtblk_config_changed, -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP .freeze = virtblk_freeze, .restore = virtblk_restore, #endif diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index d2120ba8f3f..73ce739f8e1 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -79,7 +79,7 @@ static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data) priv->expires = cur + delay; priv->present = 0; - INIT_COMPLETION(priv->completion); + reinit_completion(&priv->completion); mod_timer(&priv->timer, priv->expires); return 4; diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index ef46a9cfd83..c12398d1517 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -133,7 +133,7 @@ static void virtrng_remove(struct virtio_device *vdev) remove_common(vdev); } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int virtrng_freeze(struct virtio_device *vdev) { remove_common(vdev); @@ -157,7 +157,7 @@ static struct virtio_driver virtio_rng_driver = { .id_table = id_table, .probe = virtrng_probe, .remove = virtrng_remove, -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP .freeze = virtrng_freeze, .restore = virtrng_restore, #endif diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index b79cf3e1b79..feea87cc6b8 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -577,7 +577,8 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, spin_lock(&portdev->c_ovq_lock); if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) { virtqueue_kick(vq); - while (!virtqueue_get_buf(vq, &len)) + while (!virtqueue_get_buf(vq, &len) + && !virtqueue_is_broken(vq)) cpu_relax(); } spin_unlock(&portdev->c_ovq_lock); @@ -650,7 +651,8 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg, * we need to kmalloc a GFP_ATOMIC buffer each time the * console driver writes something out. */ - while (!virtqueue_get_buf(out_vq, &len)) + while (!virtqueue_get_buf(out_vq, &len) + && !virtqueue_is_broken(out_vq)) cpu_relax(); done: spin_unlock_irqrestore(&port->outvq_lock, flags); @@ -1837,12 +1839,8 @@ static void config_intr(struct virtio_device *vdev) struct port *port; u16 rows, cols; - vdev->config->get(vdev, - offsetof(struct virtio_console_config, cols), - &cols, sizeof(u16)); - vdev->config->get(vdev, - offsetof(struct virtio_console_config, rows), - &rows, sizeof(u16)); + virtio_cread(vdev, struct virtio_console_config, cols, &cols); + virtio_cread(vdev, struct virtio_console_config, rows, &rows); port = find_port_by_id(portdev, 0); set_console_size(port, rows, cols); @@ -2014,10 +2012,9 @@ static int virtcons_probe(struct virtio_device *vdev) /* Don't test MULTIPORT at all if we're rproc: not a valid feature! */ if (!is_rproc_serial(vdev) && - virtio_config_val(vdev, VIRTIO_CONSOLE_F_MULTIPORT, - offsetof(struct virtio_console_config, - max_nr_ports), - &portdev->config.max_nr_ports) == 0) { + virtio_cread_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT, + struct virtio_console_config, max_nr_ports, + &portdev->config.max_nr_ports) == 0) { multiport = true; } @@ -2142,7 +2139,7 @@ static struct virtio_device_id rproc_serial_id_table[] = { static unsigned int rproc_serial_features[] = { }; -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int virtcons_freeze(struct virtio_device *vdev) { struct ports_device *portdev; @@ -2220,7 +2217,7 @@ static struct virtio_driver virtio_console = { .probe = virtcons_probe, .remove = virtcons_remove, .config_changed = config_intr, -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP .freeze = virtcons_freeze, .restore = virtcons_restore, #endif diff --git a/drivers/crypto/tegra-aes.c b/drivers/crypto/tegra-aes.c index 2d58da972ae..fa05e3c329b 100644 --- a/drivers/crypto/tegra-aes.c +++ b/drivers/crypto/tegra-aes.c @@ -268,7 +268,7 @@ static int aes_start_crypt(struct tegra_aes_dev *dd, u32 in_addr, u32 out_addr, aes_writel(dd, value, TEGRA_AES_SECURE_INPUT_SELECT); aes_writel(dd, out_addr, TEGRA_AES_SECURE_DEST_ADDR); - INIT_COMPLETION(dd->op_complete); + reinit_completion(&dd->op_complete); for (i = 0; i < AES_HW_MAX_ICQ_LENGTH - 1; i++) { do { diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index e5af0e3a26e..0e799516a2a 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -477,7 +477,7 @@ void fw_send_phy_config(struct fw_card *card, phy_config_packet.header[1] = data; phy_config_packet.header[2] = ~data; phy_config_packet.generation = generation; - INIT_COMPLETION(phy_config_done); + reinit_completion(&phy_config_done); card->driver->send_request(card, &phy_config_packet); wait_for_completion_timeout(&phy_config_done, timeout); diff --git a/drivers/gpu/drm/drm_flip_work.c b/drivers/gpu/drm/drm_flip_work.c index e788882d902..f9c7fa3d001 100644 --- a/drivers/gpu/drm/drm_flip_work.c +++ b/drivers/gpu/drm/drm_flip_work.c @@ -34,7 +34,7 @@ */ void drm_flip_work_queue(struct drm_flip_work *work, void *val) { - if (kfifo_put(&work->fifo, (const void **)&val)) { + if (kfifo_put(&work->fifo, val)) { atomic_inc(&work->pending); } else { DRM_ERROR("%s fifo full!\n", work->name); diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c index 1eb86c79523..e2810706114 100644 --- a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c +++ b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c @@ -99,7 +99,7 @@ static int xfer_read(struct i2c_adapter *adap, struct i2c_msg *pmsg) i2c_dev->status = I2C_STAT_INIT; i2c_dev->msg = pmsg; i2c_dev->buf_offset = 0; - INIT_COMPLETION(i2c_dev->complete); + reinit_completion(&i2c_dev->complete); /* Enable I2C transaction */ temp = ((pmsg->len) << 20) | HI2C_EDID_READ | HI2C_ENABLE_TRANSACTION; diff --git a/drivers/hid/hid-wiimote.h b/drivers/hid/hid-wiimote.h index 75db0c40003..cfa63b0825b 100644 --- a/drivers/hid/hid-wiimote.h +++ b/drivers/hid/hid-wiimote.h @@ -327,7 +327,7 @@ static inline void wiimote_cmd_acquire_noint(struct wiimote_data *wdata) static inline void wiimote_cmd_set(struct wiimote_data *wdata, int cmd, __u32 opt) { - INIT_COMPLETION(wdata->state.ready); + reinit_completion(&wdata->state.ready); wdata->state.cmd = cmd; wdata->state.opt = opt; } diff --git a/drivers/hwmon/jz4740-hwmon.c b/drivers/hwmon/jz4740-hwmon.c index e0d66b9590a..a183e488db7 100644 --- a/drivers/hwmon/jz4740-hwmon.c +++ b/drivers/hwmon/jz4740-hwmon.c @@ -66,7 +66,7 @@ static ssize_t jz4740_hwmon_read_adcin(struct device *dev, mutex_lock(&hwmon->lock); - INIT_COMPLETION(*completion); + reinit_completion(completion); enable_irq(hwmon->irq); hwmon->cell->enable(to_platform_device(dev)); diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c index fd059308aff..8edba9de76d 100644 --- a/drivers/i2c/busses/i2c-at91.c +++ b/drivers/i2c/busses/i2c-at91.c @@ -371,7 +371,7 @@ static int at91_do_twi_transfer(struct at91_twi_dev *dev) dev_dbg(dev->dev, "transfer: %s %d bytes.\n", (dev->msg->flags & I2C_M_RD) ? "read" : "write", dev->buf_len); - INIT_COMPLETION(dev->cmd_complete); + reinit_completion(&dev->cmd_complete); dev->transfer_status = 0; if (!dev->buf_len) { diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index ea4b08fc335..d7e8600f31f 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -151,7 +151,7 @@ static int bcm2835_i2c_xfer_msg(struct bcm2835_i2c_dev *i2c_dev, i2c_dev->msg_buf = msg->buf; i2c_dev->msg_buf_remaining = msg->len; - INIT_COMPLETION(i2c_dev->completion); + reinit_completion(&i2c_dev->completion); bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, BCM2835_I2C_C_CLEAR); diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c index 132369fad4e..960dec61c64 100644 --- a/drivers/i2c/busses/i2c-davinci.c +++ b/drivers/i2c/busses/i2c-davinci.c @@ -323,7 +323,7 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop) davinci_i2c_write_reg(dev, DAVINCI_I2C_CNT_REG, dev->buf_len); - INIT_COMPLETION(dev->cmd_complete); + reinit_completion(&dev->cmd_complete); dev->cmd_err = 0; /* Take I2C out of reset and configure it as master */ diff --git a/drivers/i2c/busses/i2c-designware-core.c b/drivers/i2c/busses/i2c-designware-core.c index 5888feef1ac..e89e3e2145e 100644 --- a/drivers/i2c/busses/i2c-designware-core.c +++ b/drivers/i2c/busses/i2c-designware-core.c @@ -613,7 +613,7 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) mutex_lock(&dev->lock); pm_runtime_get_sync(dev->dev); - INIT_COMPLETION(dev->cmd_complete); + reinit_completion(&dev->cmd_complete); dev->msgs = msgs; dev->msgs_num = num; dev->cmd_err = 0; diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index 1672effbceb..0043ede234c 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -541,7 +541,7 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr, desc->dptr_high = upper_32_bits(dma_addr); } - INIT_COMPLETION(priv->cmp); + reinit_completion(&priv->cmp); /* Add the descriptor */ ismt_submit_desc(priv); diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index b7c85777470..3aedd86a646 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -505,7 +505,7 @@ static int mxs_i2c_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, return err; } } else { - INIT_COMPLETION(i2c->cmd_complete); + reinit_completion(&i2c->cmd_complete); ret = mxs_i2c_dma_setup_xfer(adap, msg, flags); if (ret) return ret; diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index 9967a6f9c2f..a6a891d7970 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -543,7 +543,7 @@ static int omap_i2c_xfer_msg(struct i2c_adapter *adap, w |= OMAP_I2C_BUF_RXFIF_CLR | OMAP_I2C_BUF_TXFIF_CLR; omap_i2c_write_reg(dev, OMAP_I2C_BUF_REG, w); - INIT_COMPLETION(dev->cmd_complete); + reinit_completion(&dev->cmd_complete); dev->cmd_err = 0; w = OMAP_I2C_CON_EN | OMAP_I2C_CON_MST | OMAP_I2C_CON_STT; diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index c457cb447c6..e661edee4d0 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -544,7 +544,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev, i2c_dev->msg_buf_remaining = msg->len; i2c_dev->msg_err = I2C_ERR_NONE; i2c_dev->msg_read = (msg->flags & I2C_M_RD); - INIT_COMPLETION(i2c_dev->msg_complete); + reinit_completion(&i2c_dev->msg_complete); packet_header = (0 << PACKET_HEADER0_HEADER_SIZE_SHIFT) | PACKET_HEADER0_PROTOCOL_I2C | diff --git a/drivers/i2c/busses/i2c-wmt.c b/drivers/i2c/busses/i2c-wmt.c index c65da3d913a..31395fa8121 100644 --- a/drivers/i2c/busses/i2c-wmt.c +++ b/drivers/i2c/busses/i2c-wmt.c @@ -158,7 +158,7 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, writew(val, i2c_dev->base + REG_CR); } - INIT_COMPLETION(i2c_dev->complete); + reinit_completion(&i2c_dev->complete); if (i2c_dev->mode == I2C_MODE_STANDARD) tcr_val = TCR_STANDARD_MODE; @@ -247,7 +247,7 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg, writew(val, i2c_dev->base + REG_CR); } - INIT_COMPLETION(i2c_dev->complete); + reinit_completion(&i2c_dev->complete); if (i2c_dev->mode == I2C_MODE_STANDARD) tcr_val = TCR_STANDARD_MODE; diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index e6fbd3e7098..9a4e0e32a77 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -188,7 +188,7 @@ static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta, spi_bus_lock(sigma_delta->spi->master); sigma_delta->bus_locked = true; - INIT_COMPLETION(sigma_delta->completion); + reinit_completion(&sigma_delta->completion); ret = ad_sigma_delta_set_mode(sigma_delta, mode); if (ret < 0) @@ -259,7 +259,7 @@ int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, spi_bus_lock(sigma_delta->spi->master); sigma_delta->bus_locked = true; - INIT_COMPLETION(sigma_delta->completion); + reinit_completion(&sigma_delta->completion); ad_sigma_delta_set_mode(sigma_delta, AD_SD_MODE_SINGLE); @@ -343,7 +343,7 @@ static int ad_sd_buffer_postdisable(struct iio_dev *indio_dev) { struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev); - INIT_COMPLETION(sigma_delta->completion); + reinit_completion(&sigma_delta->completion); wait_for_completion_timeout(&sigma_delta->completion, HZ); if (!sigma_delta->irq_dis) { diff --git a/drivers/iio/adc/nau7802.c b/drivers/iio/adc/nau7802.c index 54c5babe674..e525aa6475c 100644 --- a/drivers/iio/adc/nau7802.c +++ b/drivers/iio/adc/nau7802.c @@ -190,7 +190,7 @@ static int nau7802_read_irq(struct iio_dev *indio_dev, struct nau7802_state *st = iio_priv(indio_dev); int ret; - INIT_COMPLETION(st->value_ok); + reinit_completion(&st->value_ok); enable_irq(st->client->irq); nau7802_sync(st); diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index dac15b9f9df..c10eab64bc0 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -56,7 +56,7 @@ int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp) ev.id = ev_code; ev.timestamp = timestamp; - copied = kfifo_put(&ev_int->det_events, &ev); + copied = kfifo_put(&ev_int->det_events, ev); if (copied != 0) wake_up_locked_poll(&ev_int->wait, POLLIN); } diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index d53e0b72a40..4204841cdc4 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -242,7 +242,7 @@ static int cyttsp_soft_reset(struct cyttsp *ts) int retval; /* wait for interrupt to set ready completion */ - INIT_COMPLETION(ts->bl_ready); + reinit_completion(&ts->bl_ready); ts->state = CY_BL_STATE; enable_irq(ts->irq); diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c880ebaf155..9fd51e51e78 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -206,7 +206,7 @@ config SHMOBILE_IPMMU_TLB config SHMOBILE_IOMMU bool "IOMMU for Renesas IPMMU/IPMMUI" default n - depends on (ARM && ARCH_SHMOBILE) + depends on ARM || COMPILE_TEST select IOMMU_API select ARM_DMA_USE_IOMMU select SHMOBILE_IPMMU diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 14c1f474cf1..5d58bf16e9e 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_IOMMU_API) += iommu.o +obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 181c9ba929c..1abfb5684ab 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -590,6 +590,9 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev) ret = IRQ_HANDLED; resume = RESUME_RETRY; } else { + dev_err_ratelimited(smmu->dev, + "Unhandled context fault: iova=0x%08lx, fsynr=0x%x, cb=%d\n", + iova, fsynr, root_cfg->cbndx); ret = IRQ_NONE; resume = RESUME_TERMINATE; } @@ -778,7 +781,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain) #ifdef __BIG_ENDIAN reg |= SCTLR_E; #endif - writel(reg, cb_base + ARM_SMMU_CB_SCTLR); + writel_relaxed(reg, cb_base + ARM_SMMU_CB_SCTLR); } static int arm_smmu_init_domain_context(struct iommu_domain *domain, @@ -1212,7 +1215,10 @@ static int arm_smmu_alloc_init_pte(struct arm_smmu_device *smmu, pmd_t *pmd, arm_smmu_flush_pgtable(smmu, page_address(table), ARM_SMMU_PTE_HWTABLE_SIZE); - pgtable_page_ctor(table); + if (!pgtable_page_ctor(table)) { + __free_page(table); + return -ENOMEM; + } pmd_populate(NULL, pmd, table); arm_smmu_flush_pgtable(smmu, pmd, sizeof(*pmd)); } @@ -1559,9 +1565,13 @@ static struct iommu_ops arm_smmu_ops = { static void arm_smmu_device_reset(struct arm_smmu_device *smmu) { void __iomem *gr0_base = ARM_SMMU_GR0(smmu); - void __iomem *sctlr_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB_SCTLR; + void __iomem *cb_base; int i = 0; - u32 scr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sCR0); + u32 reg; + + /* Clear Global FSR */ + reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR); + writel(reg, gr0_base + ARM_SMMU_GR0_sGFSR); /* Mark all SMRn as invalid and all S2CRn as bypass */ for (i = 0; i < smmu->num_mapping_groups; ++i) { @@ -1569,33 +1579,38 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu) writel_relaxed(S2CR_TYPE_BYPASS, gr0_base + ARM_SMMU_GR0_S2CR(i)); } - /* Make sure all context banks are disabled */ - for (i = 0; i < smmu->num_context_banks; ++i) - writel_relaxed(0, sctlr_base + ARM_SMMU_CB(smmu, i)); + /* Make sure all context banks are disabled and clear CB_FSR */ + for (i = 0; i < smmu->num_context_banks; ++i) { + cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, i); + writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR); + writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR); + } /* Invalidate the TLB, just in case */ writel_relaxed(0, gr0_base + ARM_SMMU_GR0_STLBIALL); writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLH); writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH); + reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sCR0); + /* Enable fault reporting */ - scr0 |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE); + reg |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE); /* Disable TLB broadcasting. */ - scr0 |= (sCR0_VMIDPNE | sCR0_PTM); + reg |= (sCR0_VMIDPNE | sCR0_PTM); /* Enable client access, but bypass when no mapping is found */ - scr0 &= ~(sCR0_CLIENTPD | sCR0_USFCFG); + reg &= ~(sCR0_CLIENTPD | sCR0_USFCFG); /* Disable forced broadcasting */ - scr0 &= ~sCR0_FB; + reg &= ~sCR0_FB; /* Don't upgrade barriers */ - scr0 &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT); + reg &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT); /* Push the button */ arm_smmu_tlb_sync(smmu); - writel(scr0, gr0_base + ARM_SMMU_GR0_sCR0); + writel_relaxed(reg, gr0_base + ARM_SMMU_GR0_sCR0); } static int arm_smmu_id_size_to_bits(int size) @@ -1700,13 +1715,12 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu) id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID1); smmu->pagesize = (id & ID1_PAGESIZE) ? SZ_64K : SZ_4K; - /* Check that we ioremapped enough */ + /* Check for size mismatch of SMMU address space from mapped region */ size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1); size *= (smmu->pagesize << 1); - if (smmu->size < size) - dev_warn(smmu->dev, - "device is 0x%lx bytes but only mapped 0x%lx!\n", - size, smmu->size); + if (smmu->size != size) + dev_warn(smmu->dev, "SMMU address space size (0x%lx) differs " + "from mapped region size (0x%lx)!\n", size, smmu->size); smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) & ID1_NUMS2CB_MASK; @@ -1781,15 +1795,10 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev) smmu->dev = dev; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(dev, "missing base address/size\n"); - return -ENODEV; - } - + smmu->base = devm_ioremap_resource(dev, res); + if (IS_ERR(smmu->base)) + return PTR_ERR(smmu->base); smmu->size = resource_size(res); - smmu->base = devm_request_and_ioremap(dev, res); - if (!smmu->base) - return -EADDRNOTAVAIL; if (of_property_read_u32(dev->of_node, "#global-interrupts", &smmu->num_global_irqs)) { @@ -1804,12 +1813,11 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev) smmu->num_context_irqs++; } - if (num_irqs < smmu->num_global_irqs) { - dev_warn(dev, "found %d interrupts but expected at least %d\n", - num_irqs, smmu->num_global_irqs); - smmu->num_global_irqs = num_irqs; + if (!smmu->num_context_irqs) { + dev_err(dev, "found %d interrupts but expected at least %d\n", + num_irqs, smmu->num_global_irqs + 1); + return -ENODEV; } - smmu->num_context_irqs = num_irqs - smmu->num_global_irqs; smmu->irqs = devm_kzalloc(dev, sizeof(*smmu->irqs) * num_irqs, GFP_KERNEL); @@ -1933,7 +1941,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev) free_irq(smmu->irqs[i], smmu); /* Turn the thing off */ - writel(sCR0_CLIENTPD, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_sCR0); + writel_relaxed(sCR0_CLIENTPD, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_sCR0); return 0; } @@ -1981,7 +1989,7 @@ static void __exit arm_smmu_exit(void) return platform_driver_unregister(&arm_smmu_driver); } -module_init(arm_smmu_init); +subsys_initcall(arm_smmu_init); module_exit(arm_smmu_exit); MODULE_DESCRIPTION("IOMMU API for ARM architected SMMU implementations"); diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 90094695023..8b452c9676d 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -403,7 +403,7 @@ dmar_find_matched_drhd_unit(struct pci_dev *dev) dev = pci_physfn(dev); - list_for_each_entry(dmaru, &dmar_drhd_units, list) { + for_each_drhd_unit(dmaru) { drhd = container_of(dmaru->hdr, struct acpi_dmar_hardware_unit, header); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 15e9b57e9cf..43b9bfea48f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -782,7 +782,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, int offset; BUG_ON(!domain->pgd); - BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width); + + if (addr_width < BITS_PER_LONG && pfn >> addr_width) + /* Address beyond IOMMU's addressing capabilities. */ + return NULL; + parent = domain->pgd; while (level > 0) { @@ -3777,11 +3781,10 @@ static void iommu_detach_dependent_devices(struct intel_iommu *iommu, static void domain_remove_one_dev_info(struct dmar_domain *domain, struct pci_dev *pdev) { - struct device_domain_info *info; + struct device_domain_info *info, *tmp; struct intel_iommu *iommu; unsigned long flags; int found = 0; - struct list_head *entry, *tmp; iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number, pdev->devfn); @@ -3789,8 +3792,7 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain, return; spin_lock_irqsave(&device_domain_lock, flags); - list_for_each_safe(entry, tmp, &domain->devices) { - info = list_entry(entry, struct device_domain_info, link); + list_for_each_entry_safe(info, tmp, &domain->devices, link) { if (info->segment == pci_domain_nr(pdev->bus) && info->bus == pdev->bus->number && info->devfn == pdev->devfn) { diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index ab86902fd9f..bab10b1002f 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -525,12 +525,13 @@ static int __init intel_irq_remapping_supported(void) if (disable_irq_remap) return 0; if (irq_remap_broken) { - WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND, - "This system BIOS has enabled interrupt remapping\n" - "on a chipset that contains an erratum making that\n" - "feature unstable. To maintain system stability\n" - "interrupt remapping is being disabled. Please\n" - "contact your BIOS vendor for an update\n"); + printk(KERN_WARNING + "This system BIOS has enabled interrupt remapping\n" + "on a chipset that contains an erratum making that\n" + "feature unstable. To maintain system stability\n" + "interrupt remapping is being disabled. Please\n" + "contact your BIOS vendor for an update\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); disable_irq_remap = 1; return 0; } diff --git a/drivers/iommu/iommu-traces.c b/drivers/iommu/iommu-traces.c new file mode 100644 index 00000000000..bf3b317ff0c --- /dev/null +++ b/drivers/iommu/iommu-traces.c @@ -0,0 +1,27 @@ +/* + * iommu trace points + * + * Copyright (C) 2013 Shuah Khan <shuah.kh@samsung.com> + * + */ + +#include <linux/string.h> +#include <linux/types.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/iommu.h> + +/* iommu_group_event */ +EXPORT_TRACEPOINT_SYMBOL_GPL(add_device_to_group); +EXPORT_TRACEPOINT_SYMBOL_GPL(remove_device_from_group); + +/* iommu_device_event */ +EXPORT_TRACEPOINT_SYMBOL_GPL(attach_device_to_domain); +EXPORT_TRACEPOINT_SYMBOL_GPL(detach_device_from_domain); + +/* iommu_map_unmap */ +EXPORT_TRACEPOINT_SYMBOL_GPL(map); +EXPORT_TRACEPOINT_SYMBOL_GPL(unmap); + +/* iommu_error */ +EXPORT_TRACEPOINT_SYMBOL_GPL(io_page_fault); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index fbe9ca734f8..e5555fcfe70 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -29,6 +29,7 @@ #include <linux/idr.h> #include <linux/notifier.h> #include <linux/err.h> +#include <trace/events/iommu.h> static struct kset *iommu_group_kset; static struct ida iommu_group_ida; @@ -363,6 +364,8 @@ rename: /* Notify any listeners about change to group. */ blocking_notifier_call_chain(&group->notifier, IOMMU_GROUP_NOTIFY_ADD_DEVICE, dev); + + trace_add_device_to_group(group->id, dev); return 0; } EXPORT_SYMBOL_GPL(iommu_group_add_device); @@ -399,6 +402,8 @@ void iommu_group_remove_device(struct device *dev) sysfs_remove_link(group->devices_kobj, device->name); sysfs_remove_link(&dev->kobj, "iommu_group"); + trace_remove_device_from_group(group->id, dev); + kfree(device->name); kfree(device); dev->iommu_group = NULL; @@ -680,10 +685,14 @@ EXPORT_SYMBOL_GPL(iommu_domain_free); int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { + int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; - return domain->ops->attach_dev(domain, dev); + ret = domain->ops->attach_dev(domain, dev); + if (!ret) + trace_attach_device_to_domain(dev); + return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device); @@ -693,6 +702,7 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev) return; domain->ops->detach_dev(domain, dev); + trace_detach_device_from_domain(dev); } EXPORT_SYMBOL_GPL(iommu_detach_device); @@ -807,17 +817,17 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, * size of the smallest page supported by the hardware */ if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { - pr_err("unaligned: iova 0x%lx pa 0x%pa size 0x%zx min_pagesz 0x%x\n", + pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n", iova, &paddr, size, min_pagesz); return -EINVAL; } - pr_debug("map: iova 0x%lx pa 0x%pa size 0x%zx\n", iova, &paddr, size); + pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { size_t pgsize = iommu_pgsize(domain, iova | paddr, size); - pr_debug("mapping: iova 0x%lx pa 0x%pa pgsize 0x%zx\n", + pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", iova, &paddr, pgsize); ret = domain->ops->map(domain, iova, paddr, pgsize, prot); @@ -832,6 +842,8 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, /* unroll mapping in case something went wrong */ if (ret) iommu_unmap(domain, orig_iova, orig_size - size); + else + trace_map(iova, paddr, size); return ret; } @@ -880,6 +892,7 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) unmapped += unmapped_page; } + trace_unmap(iova, 0, size); return unmapped; } EXPORT_SYMBOL_GPL(iommu_unmap); diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 108c0e9c24d..dba1a9fd507 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -252,7 +252,7 @@ static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, spin_lock_irqsave(&gart->pte_lock, flags); pfn = __phys_to_pfn(pa); if (!pfn_valid(pfn)) { - dev_err(gart->dev, "Invalid page: %08x\n", pa); + dev_err(gart->dev, "Invalid page: %pa\n", &pa); spin_unlock_irqrestore(&gart->pte_lock, flags); return -EINVAL; } @@ -295,8 +295,8 @@ static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain, pa = (pte & GART_PAGE_MASK); if (!pfn_valid(__phys_to_pfn(pa))) { - dev_err(gart->dev, "No entry for %08llx:%08x\n", - (unsigned long long)iova, pa); + dev_err(gart->dev, "No entry for %08llx:%pa\n", + (unsigned long long)iova, &pa); gart_dump_table(gart); return -EINVAL; } @@ -351,7 +351,6 @@ static int tegra_gart_probe(struct platform_device *pdev) struct gart_device *gart; struct resource *res, *res_remap; void __iomem *gart_regs; - int err; struct device *dev = &pdev->dev; if (gart_handle) @@ -376,8 +375,7 @@ static int tegra_gart_probe(struct platform_device *pdev) gart_regs = devm_ioremap(dev, res->start, resource_size(res)); if (!gart_regs) { dev_err(dev, "failed to remap GART registers\n"); - err = -ENXIO; - goto fail; + return -ENXIO; } gart->dev = &pdev->dev; @@ -391,8 +389,7 @@ static int tegra_gart_probe(struct platform_device *pdev) gart->savedata = vmalloc(sizeof(u32) * gart->page_count); if (!gart->savedata) { dev_err(dev, "failed to allocate context save area\n"); - err = -ENOMEM; - goto fail; + return -ENOMEM; } platform_set_drvdata(pdev, gart); @@ -401,32 +398,20 @@ static int tegra_gart_probe(struct platform_device *pdev) gart_handle = gart; bus_set_iommu(&platform_bus_type, &gart_iommu_ops); return 0; - -fail: - if (gart_regs) - devm_iounmap(dev, gart_regs); - if (gart && gart->savedata) - vfree(gart->savedata); - devm_kfree(dev, gart); - return err; } static int tegra_gart_remove(struct platform_device *pdev) { struct gart_device *gart = platform_get_drvdata(pdev); - struct device *dev = gart->dev; writel(0, gart->regs + GART_CONFIG); if (gart->savedata) vfree(gart->savedata); - if (gart->regs) - devm_iounmap(dev, gart->regs); - devm_kfree(dev, gart); gart_handle = NULL; return 0; } -const struct dev_pm_ops tegra_gart_pm_ops = { +static const struct dev_pm_ops tegra_gart_pm_ops = { .suspend = tegra_gart_suspend, .resume = tegra_gart_resume, }; diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index e0665603afd..605b5b46a90 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -731,7 +731,7 @@ static int smmu_iommu_map(struct iommu_domain *domain, unsigned long iova, unsigned long pfn = __phys_to_pfn(pa); unsigned long flags; - dev_dbg(as->smmu->dev, "[%d] %08lx:%08x\n", as->asid, iova, pa); + dev_dbg(as->smmu->dev, "[%d] %08lx:%pa\n", as->asid, iova, &pa); if (!pfn_valid(pfn)) return -ENOMEM; @@ -1254,7 +1254,7 @@ static int tegra_smmu_remove(struct platform_device *pdev) return 0; } -const struct dev_pm_ops tegra_smmu_pm_ops = { +static const struct dev_pm_ops tegra_smmu_pm_ops = { .suspend = tegra_smmu_suspend, .resume = tegra_smmu_resume, }; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index b3256ff0d42..d0a1d8a45c8 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -229,7 +229,7 @@ struct lguest_vq_info { * make a hypercall. We hand the physical address of the virtqueue so the Host * knows which virtqueue we're talking about. */ -static void lg_notify(struct virtqueue *vq) +static bool lg_notify(struct virtqueue *vq) { /* * We store our virtqueue information in the "priv" pointer of the @@ -238,6 +238,7 @@ static void lg_notify(struct virtqueue *vq) struct lguest_vq_info *lvq = vq->priv; hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0); + return true; } /* An extern declaration inside a C file is bad form. Don't do it. */ diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 51692392633..922a1acbf65 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -157,7 +157,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * stack, then the address of this call. This stack layout happens to * exactly match the stack layout created by an interrupt... */ - asm volatile("pushf; lcall *lguest_entry" + asm volatile("pushf; lcall *%4" /* * This is how we tell GCC that %eax ("a") and %ebx ("b") * are changed by this routine. The "=" means output. @@ -169,7 +169,9 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * physical address of the Guest's top-level page * directory. */ - : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) + : "0"(pages), + "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)), + "m"(lguest_entry) /* * We tell gcc that all these registers could change, * which means we don't have to save and restore them in diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 50ea7ed24dc..81b0fa66045 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -950,7 +950,7 @@ static int crypt_convert(struct crypt_config *cc, /* async */ case -EBUSY: wait_for_completion(&ctx->restart); - INIT_COMPLETION(ctx->restart); + reinit_completion(&ctx->restart); /* fall through*/ case -EINPROGRESS: this_cc->req = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f8b90684392..7f0e17a27ae 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -293,20 +293,6 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) do_release_stripe(conf, sh); } -static struct llist_node *llist_reverse_order(struct llist_node *head) -{ - struct llist_node *new_head = NULL; - - while (head) { - struct llist_node *tmp = head; - head = head->next; - tmp->next = new_head; - new_head = tmp; - } - - return new_head; -} - /* should hold conf->device_lock already */ static int release_stripe_list(struct r5conf *conf) { diff --git a/drivers/media/platform/blackfin/bfin_capture.c b/drivers/media/platform/blackfin/bfin_capture.c index 4c110597709..28191659143 100644 --- a/drivers/media/platform/blackfin/bfin_capture.c +++ b/drivers/media/platform/blackfin/bfin_capture.c @@ -422,7 +422,7 @@ static int bcap_start_streaming(struct vb2_queue *vq, unsigned int count) return ret; } - INIT_COMPLETION(bcap_dev->comp); + reinit_completion(&bcap_dev->comp); bcap_dev->stop = false; return 0; } diff --git a/drivers/media/radio/radio-wl1273.c b/drivers/media/radio/radio-wl1273.c index 97c2c18803e..9cf6731fb81 100644 --- a/drivers/media/radio/radio-wl1273.c +++ b/drivers/media/radio/radio-wl1273.c @@ -375,7 +375,7 @@ static int wl1273_fm_set_tx_freq(struct wl1273_device *radio, unsigned int freq) if (r) return r; - INIT_COMPLETION(radio->busy); + reinit_completion(&radio->busy); /* wait for the FR IRQ */ r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(2000)); @@ -389,7 +389,7 @@ static int wl1273_fm_set_tx_freq(struct wl1273_device *radio, unsigned int freq) if (r) return r; - INIT_COMPLETION(radio->busy); + reinit_completion(&radio->busy); /* wait for the POWER_ENB IRQ */ r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(1000)); @@ -444,7 +444,7 @@ static int wl1273_fm_set_rx_freq(struct wl1273_device *radio, unsigned int freq) goto err; } - INIT_COMPLETION(radio->busy); + reinit_completion(&radio->busy); r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(2000)); if (!r) { @@ -805,7 +805,7 @@ static int wl1273_fm_set_seek(struct wl1273_device *radio, if (level < SCHAR_MIN || level > SCHAR_MAX) return -EINVAL; - INIT_COMPLETION(radio->busy); + reinit_completion(&radio->busy); dev_dbg(radio->dev, "%s: BUSY\n", __func__); r = core->write(core, WL1273_INT_MASK_SET, radio->irq_flags); @@ -847,7 +847,7 @@ static int wl1273_fm_set_seek(struct wl1273_device *radio, if (r) goto out; - INIT_COMPLETION(radio->busy); + reinit_completion(&radio->busy); dev_dbg(radio->dev, "%s: BUSY\n", __func__); r = core->write(core, WL1273_TUNER_MODE_SET, TUNER_MODE_AUTO_SEEK); diff --git a/drivers/media/radio/si470x/radio-si470x-common.c b/drivers/media/radio/si470x/radio-si470x-common.c index 5c57e5b0f94..0bd25006828 100644 --- a/drivers/media/radio/si470x/radio-si470x-common.c +++ b/drivers/media/radio/si470x/radio-si470x-common.c @@ -218,7 +218,7 @@ static int si470x_set_chan(struct si470x_device *radio, unsigned short chan) goto done; /* wait till tune operation has completed */ - INIT_COMPLETION(radio->completion); + reinit_completion(&radio->completion); retval = wait_for_completion_timeout(&radio->completion, msecs_to_jiffies(tune_timeout)); if (!retval) @@ -341,7 +341,7 @@ static int si470x_set_seek(struct si470x_device *radio, return retval; /* wait till tune operation has completed */ - INIT_COMPLETION(radio->completion); + reinit_completion(&radio->completion); retval = wait_for_completion_timeout(&radio->completion, msecs_to_jiffies(seek_timeout)); if (!retval) diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index 19632b1c219..b53626ba6f4 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -207,7 +207,7 @@ static int iguanair_send(struct iguanair *ir, unsigned size) { int rc; - INIT_COMPLETION(ir->completion); + reinit_completion(&ir->completion); ir->urb_out->transfer_buffer_length = size; rc = usb_submit_urb(ir->urb_out, GFP_KERNEL); diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c index bbf4aea1627..a0547dbf980 100644 --- a/drivers/memstick/core/memstick.c +++ b/drivers/memstick/core/memstick.c @@ -253,7 +253,7 @@ void memstick_new_req(struct memstick_host *host) { if (host->card) { host->retries = cmd_retries; - INIT_COMPLETION(host->card->mrq_complete); + reinit_completion(&host->card->mrq_complete); host->request(host); } } diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 9188ef5d677..24f2f8473de 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -401,10 +401,10 @@ again: sizeof(struct ms_status_register))) return 0; - msb->state = MSB_RP_RECIVE_STATUS_REG; + msb->state = MSB_RP_RECEIVE_STATUS_REG; return 0; - case MSB_RP_RECIVE_STATUS_REG: + case MSB_RP_RECEIVE_STATUS_REG: msb->regs.status = *(struct ms_status_register *)mrq->data; msb->state = MSB_RP_SEND_OOB_READ; /* fallthrough */ diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h index 96e63755098..c75198dbf13 100644 --- a/drivers/memstick/core/ms_block.h +++ b/drivers/memstick/core/ms_block.h @@ -223,7 +223,7 @@ enum msb_readpage_states { MSB_RP_RECEIVE_INT_REQ_RESULT, MSB_RP_SEND_READ_STATUS_REG, - MSB_RP_RECIVE_STATUS_REG, + MSB_RP_RECEIVE_STATUS_REG, MSB_RP_SEND_OOB_READ, MSB_RP_RECEIVE_OOB_READ, diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c index 1b6e9134522..31727bf285d 100644 --- a/drivers/memstick/host/r592.c +++ b/drivers/memstick/host/r592.c @@ -290,7 +290,7 @@ static int r592_transfer_fifo_dma(struct r592_device *dev) dbg_verbose("doing dma transfer"); dev->dma_error = 0; - INIT_COMPLETION(dev->dma_done); + reinit_completion(&dev->dma_done); /* TODO: hidden assumption about nenth beeing always 1 */ sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ? diff --git a/drivers/misc/mic/card/mic_virtio.c b/drivers/misc/mic/card/mic_virtio.c index 914cc9b2caa..8aa42e738ac 100644 --- a/drivers/misc/mic/card/mic_virtio.c +++ b/drivers/misc/mic/card/mic_virtio.c @@ -493,7 +493,7 @@ static int mic_remove_device(struct mic_device_desc __iomem *d, ioread8(&dc->config_change), ioread8(&d->type), mvdev); status = ioread8(&d->status); - INIT_COMPLETION(mvdev->reset_done); + reinit_completion(&mvdev->reset_done); unregister_virtio_device(&mvdev->vdev); mic_free_card_irq(mvdev->virtio_cookie, mvdev); if (status & VIRTIO_CONFIG_S_DRIVER_OK) diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c index b079c65eed6..7558d918643 100644 --- a/drivers/misc/mic/host/mic_boot.c +++ b/drivers/misc/mic/host/mic_boot.c @@ -38,7 +38,7 @@ static void mic_reset(struct mic_device *mdev) #define MIC_RESET_TO (45) - INIT_COMPLETION(mdev->reset_wait); + reinit_completion(&mdev->reset_wait); mdev->ops->reset_fw_ready(mdev); mdev->ops->reset(mdev); diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c index 83907c72059..96853a09788 100644 --- a/drivers/misc/ti-st/st_kim.c +++ b/drivers/misc/ti-st/st_kim.c @@ -218,7 +218,7 @@ static long read_local_version(struct kim_data_s *kim_gdata, char *bts_scr_name) pr_debug("%s", __func__); - INIT_COMPLETION(kim_gdata->kim_rcvd); + reinit_completion(&kim_gdata->kim_rcvd); if (4 != st_int_write(kim_gdata->core_data, read_ver_cmd, 4)) { pr_err("kim: couldn't write 4 bytes"); return -EIO; @@ -229,7 +229,7 @@ static long read_local_version(struct kim_data_s *kim_gdata, char *bts_scr_name) pr_err(" waiting for ver info- timed out "); return -ETIMEDOUT; } - INIT_COMPLETION(kim_gdata->kim_rcvd); + reinit_completion(&kim_gdata->kim_rcvd); /* the positions 12 & 13 in the response buffer provide with the * chip, major & minor numbers */ @@ -362,7 +362,7 @@ static long download_firmware(struct kim_data_s *kim_gdata) /* reinit completion before sending for the * relevant wait */ - INIT_COMPLETION(kim_gdata->kim_rcvd); + reinit_completion(&kim_gdata->kim_rcvd); /* * Free space found in uart buffer, call st_int_write @@ -398,7 +398,7 @@ static long download_firmware(struct kim_data_s *kim_gdata) release_firmware(kim_gdata->fw_entry); return -ETIMEDOUT; } - INIT_COMPLETION(kim_gdata->kim_rcvd); + reinit_completion(&kim_gdata->kim_rcvd); break; case ACTION_DELAY: /* sleep */ pr_info("sleep command in scr"); @@ -474,7 +474,7 @@ long st_kim_start(void *kim_data) gpio_set_value(kim_gdata->nshutdown, GPIO_HIGH); mdelay(100); /* re-initialize the completion */ - INIT_COMPLETION(kim_gdata->ldisc_installed); + reinit_completion(&kim_gdata->ldisc_installed); /* send notification to UIM */ kim_gdata->ldisc_install = 1; pr_info("ldisc_install = 1"); @@ -525,7 +525,7 @@ long st_kim_stop(void *kim_data) kim_gdata->kim_pdev->dev.platform_data; struct tty_struct *tty = kim_gdata->core_data->tty; - INIT_COMPLETION(kim_gdata->ldisc_installed); + reinit_completion(&kim_gdata->ldisc_installed); if (tty) { /* can be called before ldisc is installed */ /* Flush any pending characters in the driver and discipline. */ diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index 4edea7f4462..9dfdb06c508 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -396,7 +396,7 @@ static void wait_op_done(struct mxc_nand_host *host, int useirq) if (useirq) { if (!host->devtype_data->check_int(host)) { - INIT_COMPLETION(host->op_completion); + reinit_completion(&host->op_completion); irq_control(host, 1); wait_for_completion(&host->op_completion); } diff --git a/drivers/mtd/nand/r852.c b/drivers/mtd/nand/r852.c index 9dcf02d22aa..325930db3f0 100644 --- a/drivers/mtd/nand/r852.c +++ b/drivers/mtd/nand/r852.c @@ -181,7 +181,7 @@ static void r852_do_dma(struct r852_device *dev, uint8_t *buf, int do_read) /* Set dma direction */ dev->dma_dir = do_read; dev->dma_stage = 1; - INIT_COMPLETION(dev->dma_done); + reinit_completion(&dev->dma_done); dbg_verbose("doing dma %s ", do_read ? "read" : "write"); diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c index 2362909d20c..6547c84afc3 100644 --- a/drivers/mtd/onenand/omap2.c +++ b/drivers/mtd/onenand/omap2.c @@ -159,7 +159,7 @@ static int omap2_onenand_wait(struct mtd_info *mtd, int state) syscfg = read_reg(c, ONENAND_REG_SYS_CFG1); } - INIT_COMPLETION(c->irq_done); + reinit_completion(&c->irq_done); if (c->gpio_irq) { result = gpio_get_value(c->gpio_irq); if (result == -1) { @@ -349,7 +349,7 @@ static int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area, omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, dma_dst, 0, 0); - INIT_COMPLETION(c->dma_done); + reinit_completion(&c->dma_done); omap_start_dma(c->dma_channel); timeout = jiffies + msecs_to_jiffies(20); @@ -420,7 +420,7 @@ static int omap3_onenand_write_bufferram(struct mtd_info *mtd, int area, omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, dma_dst, 0, 0); - INIT_COMPLETION(c->dma_done); + reinit_completion(&c->dma_done); omap_start_dma(c->dma_channel); timeout = jiffies + msecs_to_jiffies(20); @@ -499,7 +499,7 @@ static int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area, omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, dma_dst, 0, 0); - INIT_COMPLETION(c->dma_done); + reinit_completion(&c->dma_done); omap_start_dma(c->dma_channel); wait_for_completion(&c->dma_done); @@ -544,7 +544,7 @@ static int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area, omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, dma_dst, 0, 0); - INIT_COMPLETION(c->dma_done); + reinit_completion(&c->dma_done); omap_start_dma(c->dma_channel); wait_for_completion(&c->dma_done); diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index b9ed1288ce2..985608634f8 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -686,18 +686,19 @@ static int cfv_probe(struct virtio_device *vdev) goto err; /* Get the CAIF configuration from virtio config space, if available */ -#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \ - ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \ - &_var, \ - FIELD_SIZEOF(struct virtio_caif_transf_config, _f))) - if (vdev->config->get) { - GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom); - GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom); - GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom); - GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom); - GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu); - GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu); + virtio_cread(vdev, struct virtio_caif_transf_config, headroom, + &cfv->tx_hr); + virtio_cread(vdev, struct virtio_caif_transf_config, headroom, + &cfv->rx_hr); + virtio_cread(vdev, struct virtio_caif_transf_config, tailroom, + &cfv->tx_tr); + virtio_cread(vdev, struct virtio_caif_transf_config, tailroom, + &cfv->rx_tr); + virtio_cread(vdev, struct virtio_caif_transf_config, mtu, + &cfv->mtu); + virtio_cread(vdev, struct virtio_caif_transf_config, mtu, + &cfv->mru); } else { cfv->tx_hr = CFV_DEF_HEADROOM; cfv->rx_hr = CFV_DEF_HEADROOM; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index 09810ddd11e..a01a6a74ee3 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -3537,7 +3537,7 @@ int qlcnic_83xx_resume(struct qlcnic_adapter *adapter) void qlcnic_83xx_reinit_mbx_work(struct qlcnic_mailbox *mbx) { - INIT_COMPLETION(mbx->completion); + reinit_completion(&mbx->completion); set_bit(QLC_83XX_MBX_READY, &mbx->status); } diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index 6f10b496472..2cbe1c24999 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -561,7 +561,7 @@ at86rf230_xmit(struct ieee802154_dev *dev, struct sk_buff *skb) spin_lock_irqsave(&lp->lock, flags); lp->is_tx = 1; - INIT_COMPLETION(lp->tx_complete); + reinit_completion(&lp->tx_complete); spin_unlock_irqrestore(&lp->lock, flags); rc = at86rf230_write_fbuf(lp, skb->data, skb->len); diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c index 0632d34905c..c6e46d6e9f7 100644 --- a/drivers/net/ieee802154/mrf24j40.c +++ b/drivers/net/ieee802154/mrf24j40.c @@ -343,7 +343,7 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb) if (ret) goto err; - INIT_COMPLETION(devrec->tx_complete); + reinit_completion(&devrec->tx_complete); /* Set TXNTRIG bit of TXNCON to send packet */ ret = read_short_reg(devrec, REG_TXNCON, &val); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index bf7c734259a..cdc7c90a6a9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -591,7 +591,8 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp) } while (rq->vq->num_free); if (unlikely(rq->num > rq->max)) rq->max = rq->num; - virtqueue_kick(rq->vq); + if (unlikely(!virtqueue_kick(rq->vq))) + return false; return !oom; } @@ -797,7 +798,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) err = xmit_skb(sq, skb); /* This should not happen! */ - if (unlikely(err)) { + if (unlikely(err) || unlikely(!virtqueue_kick(sq->vq))) { dev->stats.tx_fifo_errors++; if (net_ratelimit()) dev_warn(&dev->dev, @@ -806,7 +807,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) kfree_skb(skb); return NETDEV_TX_OK; } - virtqueue_kick(sq->vq); /* Don't wait up for transmitted skbs to be freed. */ skb_orphan(skb); @@ -865,12 +865,14 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC) < 0); - virtqueue_kick(vi->cvq); + if (unlikely(!virtqueue_kick(vi->cvq))) + return status == VIRTIO_NET_OK; /* Spin for a response, the kick causes an ioport write, trapping * into the hypervisor, so the request should be handled immediately. */ - while (!virtqueue_get_buf(vi->cvq, &tmp)) + while (!virtqueue_get_buf(vi->cvq, &tmp) && + !virtqueue_is_broken(vi->cvq)) cpu_relax(); return status == VIRTIO_NET_OK; @@ -898,8 +900,13 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) return -EINVAL; } } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { - vdev->config->set(vdev, offsetof(struct virtio_net_config, mac), - addr->sa_data, dev->addr_len); + unsigned int i; + + /* Naturally, this has an atomicity problem. */ + for (i = 0; i < dev->addr_len; i++) + virtio_cwrite8(vdev, + offsetof(struct virtio_net_config, mac) + + i, addr->sa_data[i]); } eth_commit_mac_addr_change(dev, p); @@ -1281,9 +1288,8 @@ static void virtnet_config_changed_work(struct work_struct *work) if (!vi->config_enable) goto done; - if (virtio_config_val(vi->vdev, VIRTIO_NET_F_STATUS, - offsetof(struct virtio_net_config, status), - &v) < 0) + if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS, + struct virtio_net_config, status, &v) < 0) goto done; if (v & VIRTIO_NET_S_ANNOUNCE) { @@ -1507,9 +1513,9 @@ static int virtnet_probe(struct virtio_device *vdev) u16 max_queue_pairs; /* Find if host supports multiqueue virtio_net device */ - err = virtio_config_val(vdev, VIRTIO_NET_F_MQ, - offsetof(struct virtio_net_config, - max_virtqueue_pairs), &max_queue_pairs); + err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ, + struct virtio_net_config, + max_virtqueue_pairs, &max_queue_pairs); /* We need at least 2 queue's */ if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || @@ -1561,9 +1567,11 @@ static int virtnet_probe(struct virtio_device *vdev) dev->vlan_features = dev->features; /* Configuration may specify what MAC to use. Otherwise random. */ - if (virtio_config_val_len(vdev, VIRTIO_NET_F_MAC, - offsetof(struct virtio_net_config, mac), - dev->dev_addr, dev->addr_len) < 0) + if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + virtio_cread_bytes(vdev, + offsetof(struct virtio_net_config, mac), + dev->dev_addr, dev->addr_len); + else eth_hw_addr_random(dev); /* Set up our device-specific information */ @@ -1704,7 +1712,7 @@ static void virtnet_remove(struct virtio_device *vdev) free_netdev(vi->dev); } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int virtnet_freeze(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; @@ -1795,7 +1803,7 @@ static struct virtio_driver virtio_net_driver = { .probe = virtnet_probe, .remove = virtnet_remove, .config_changed = virtnet_config_changed, -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP .freeze = virtnet_freeze, .restore = virtnet_restore, #endif diff --git a/drivers/net/wireless/ath/ath10k/htc.c b/drivers/net/wireless/ath/ath10k/htc.c index 3118d750673..edae50b5280 100644 --- a/drivers/net/wireless/ath/ath10k/htc.c +++ b/drivers/net/wireless/ath/ath10k/htc.c @@ -534,7 +534,7 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc) u16 credit_count; u16 credit_size; - INIT_COMPLETION(htc->ctl_resp); + reinit_completion(&htc->ctl_resp); status = ath10k_hif_start(htc->ar); if (status) { @@ -669,7 +669,7 @@ int ath10k_htc_connect_service(struct ath10k_htc *htc, req_msg->flags = __cpu_to_le16(flags); req_msg->service_id = __cpu_to_le16(conn_req->service_id); - INIT_COMPLETION(htc->ctl_resp); + reinit_completion(&htc->ctl_resp); status = ath10k_htc_send(htc, ATH10K_HTC_EP_0, skb); if (status) { diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 0b1cc516e77..97ac8c87cba 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -92,7 +92,7 @@ static int ath10k_install_key(struct ath10k_vif *arvif, lockdep_assert_held(&ar->conf_mutex); - INIT_COMPLETION(ar->install_key_done); + reinit_completion(&ar->install_key_done); ret = ath10k_send_key(arvif, key, cmd, macaddr); if (ret) @@ -438,7 +438,7 @@ static int ath10k_vdev_start(struct ath10k_vif *arvif) lockdep_assert_held(&ar->conf_mutex); - INIT_COMPLETION(ar->vdev_setup_done); + reinit_completion(&ar->vdev_setup_done); arg.vdev_id = arvif->vdev_id; arg.dtim_period = arvif->dtim_period; @@ -491,7 +491,7 @@ static int ath10k_vdev_stop(struct ath10k_vif *arvif) lockdep_assert_held(&ar->conf_mutex); - INIT_COMPLETION(ar->vdev_setup_done); + reinit_completion(&ar->vdev_setup_done); ret = ath10k_wmi_vdev_stop(ar, arvif->vdev_id); if (ret) { @@ -1666,7 +1666,7 @@ void ath10k_offchan_tx_work(struct work_struct *work) } spin_lock_bh(&ar->data_lock); - INIT_COMPLETION(ar->offchan_tx_completed); + reinit_completion(&ar->offchan_tx_completed); ar->offchan_tx_skb = skb; spin_unlock_bh(&ar->data_lock); @@ -2476,8 +2476,8 @@ static int ath10k_hw_scan(struct ieee80211_hw *hw, goto exit; } - INIT_COMPLETION(ar->scan.started); - INIT_COMPLETION(ar->scan.completed); + reinit_completion(&ar->scan.started); + reinit_completion(&ar->scan.completed); ar->scan.in_progress = true; ar->scan.aborting = false; ar->scan.is_roc = false; @@ -2832,9 +2832,9 @@ static int ath10k_remain_on_channel(struct ieee80211_hw *hw, goto exit; } - INIT_COMPLETION(ar->scan.started); - INIT_COMPLETION(ar->scan.completed); - INIT_COMPLETION(ar->scan.on_channel); + reinit_completion(&ar->scan.started); + reinit_completion(&ar->scan.completed); + reinit_completion(&ar->scan.on_channel); ar->scan.in_progress = true; ar->scan.aborting = false; ar->scan.is_roc = true; diff --git a/drivers/net/wireless/ath/carl9170/usb.c b/drivers/net/wireless/ath/carl9170/usb.c index 307bc0ddff9..ca115f33746 100644 --- a/drivers/net/wireless/ath/carl9170/usb.c +++ b/drivers/net/wireless/ath/carl9170/usb.c @@ -773,7 +773,7 @@ void carl9170_usb_stop(struct ar9170 *ar) complete_all(&ar->cmd_wait); /* This is required to prevent an early completion on _start */ - INIT_COMPLETION(ar->cmd_wait); + reinit_completion(&ar->cmd_wait); /* * Note: diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c index 0a2844c48a6..fd30cddd588 100644 --- a/drivers/net/wireless/ath/wil6210/main.c +++ b/drivers/net/wireless/ath/wil6210/main.c @@ -250,7 +250,7 @@ int wil_reset(struct wil6210_priv *wil) /* init after reset */ wil->pending_connect_cid = -1; - INIT_COMPLETION(wil->wmi_ready); + reinit_completion(&wil->wmi_ready); /* TODO: release MAC reset */ wil6210_enable_irq(wil); diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index d7a97453290..5b5b952d47b 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c @@ -1148,7 +1148,7 @@ static s32 brcmf_p2p_af_searching_channel(struct brcmf_p2p_info *p2p) pri_vif = p2p->bss_idx[P2PAPI_BSSCFG_PRIMARY].vif; - INIT_COMPLETION(afx_hdl->act_frm_scan); + reinit_completion(&afx_hdl->act_frm_scan); set_bit(BRCMF_P2P_STATUS_FINDING_COMMON_CHANNEL, &p2p->status); afx_hdl->is_active = true; afx_hdl->peer_chan = P2P_INVALID_CHANNEL; @@ -1501,7 +1501,7 @@ static s32 brcmf_p2p_tx_action_frame(struct brcmf_p2p_info *p2p, brcmf_dbg(TRACE, "Enter\n"); - INIT_COMPLETION(p2p->send_af_done); + reinit_completion(&p2p->send_af_done); clear_bit(BRCMF_P2P_STATUS_ACTION_TX_COMPLETED, &p2p->status); clear_bit(BRCMF_P2P_STATUS_ACTION_TX_NOACK, &p2p->status); diff --git a/drivers/net/wireless/rt2x00/rt2800mmio.c b/drivers/net/wireless/rt2x00/rt2800mmio.c index ae152280e07..a8cc736b506 100644 --- a/drivers/net/wireless/rt2x00/rt2800mmio.c +++ b/drivers/net/wireless/rt2x00/rt2800mmio.c @@ -446,7 +446,7 @@ static void rt2800mmio_txstatus_interrupt(struct rt2x00_dev *rt2x00dev) if (!rt2x00_get_field32(status, TX_STA_FIFO_VALID)) break; - if (!kfifo_put(&rt2x00dev->txstatus_fifo, &status)) { + if (!kfifo_put(&rt2x00dev->txstatus_fifo, status)) { rt2x00_warn(rt2x00dev, "TX status FIFO overrun, drop tx status report\n"); break; } diff --git a/drivers/net/wireless/rt2x00/rt2800usb.c b/drivers/net/wireless/rt2x00/rt2800usb.c index 997df03a0c2..a81ceb61d74 100644 --- a/drivers/net/wireless/rt2x00/rt2800usb.c +++ b/drivers/net/wireless/rt2x00/rt2800usb.c @@ -164,7 +164,7 @@ static bool rt2800usb_tx_sta_fifo_read_completed(struct rt2x00_dev *rt2x00dev, valid = rt2x00_get_field32(tx_status, TX_STA_FIFO_VALID); if (valid) { - if (!kfifo_put(&rt2x00dev->txstatus_fifo, &tx_status)) + if (!kfifo_put(&rt2x00dev->txstatus_fifo, tx_status)) rt2x00_warn(rt2x00dev, "TX status FIFO overrun\n"); queue_work(rt2x00dev->workqueue, &rt2x00dev->txdone_work); diff --git a/drivers/net/wireless/zd1211rw/zd_usb.c b/drivers/net/wireless/zd1211rw/zd_usb.c index 7ef0b4a181e..84d94f572a4 100644 --- a/drivers/net/wireless/zd1211rw/zd_usb.c +++ b/drivers/net/wireless/zd1211rw/zd_usb.c @@ -1619,7 +1619,7 @@ static void prepare_read_regs_int(struct zd_usb *usb, atomic_set(&intr->read_regs_enabled, 1); intr->read_regs.req = req; intr->read_regs.req_count = count; - INIT_COMPLETION(intr->read_regs.completion); + reinit_completion(&intr->read_regs.completion); spin_unlock_irq(&intr->lock); } diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c index d4716273651..c864f82bd37 100644 --- a/drivers/parport/parport_ip32.c +++ b/drivers/parport/parport_ip32.c @@ -1331,7 +1331,7 @@ static unsigned int parport_ip32_fwp_wait_interrupt(struct parport *p) break; /* Initialize mutex used to take interrupts into account */ - INIT_COMPLETION(priv->irq_complete); + reinit_completion(&priv->irq_complete); /* Enable serviceIntr */ parport_ip32_frob_econtrol(p, ECR_SERVINTR, 0); @@ -1446,7 +1446,7 @@ static size_t parport_ip32_fifo_write_block_dma(struct parport *p, priv->irq_mode = PARPORT_IP32_IRQ_HERE; parport_ip32_dma_start(DMA_TO_DEVICE, (void *)buf, len); - INIT_COMPLETION(priv->irq_complete); + reinit_completion(&priv->irq_complete); parport_ip32_frob_econtrol(p, ECR_DMAEN | ECR_SERVINTR, ECR_DMAEN); nfault_timeout = min((unsigned long)physport->cad->timeout, diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 85ca36f2136..6b3a958e1be 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -574,7 +574,7 @@ void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, }; spin_lock_irqsave(&aer_recover_ring_lock, flags); - if (kfifo_put(&aer_recover_ring, &entry)) + if (kfifo_put(&aer_recover_ring, entry)) schedule_work(&aer_recover_work); else pr_err("AER recover: Buffer overflow when recovering AER for %04x:%02x:%02x:%x\n", diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c index 8eea2efbbb6..605a9be5512 100644 --- a/drivers/platform/x86/apple-gmux.c +++ b/drivers/platform/x86/apple-gmux.c @@ -289,7 +289,7 @@ static int gmux_switchto(enum vga_switcheroo_client_id id) static int gmux_set_discrete_state(struct apple_gmux_data *gmux_data, enum vga_switcheroo_state state) { - INIT_COMPLETION(gmux_data->powerchange_done); + reinit_completion(&gmux_data->powerchange_done); if (state == VGA_SWITCHEROO_ON) { gmux_write8(gmux_data, GMUX_PORT_DISCRETE_POWER, 1); diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c index 754970717c3..3cb4178e397 100644 --- a/drivers/power/ab8500_fg.c +++ b/drivers/power/ab8500_fg.c @@ -574,8 +574,8 @@ int ab8500_fg_inst_curr_start(struct ab8500_fg *di) } /* Return and WFI */ - INIT_COMPLETION(di->ab8500_fg_started); - INIT_COMPLETION(di->ab8500_fg_complete); + reinit_completion(&di->ab8500_fg_started); + reinit_completion(&di->ab8500_fg_complete); enable_irq(di->irq); /* Note: cc_lock is still locked */ diff --git a/drivers/power/jz4740-battery.c b/drivers/power/jz4740-battery.c index d9686aa9270..6c8931d4ad6 100644 --- a/drivers/power/jz4740-battery.c +++ b/drivers/power/jz4740-battery.c @@ -73,7 +73,7 @@ static long jz_battery_read_voltage(struct jz_battery *battery) mutex_lock(&battery->lock); - INIT_COMPLETION(battery->read_completion); + reinit_completion(&battery->read_completion); enable_irq(battery->irq); battery->cell->enable(battery->pdev); diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index b09c75c21b6..a34b50690b4 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -30,7 +30,7 @@ #include "remoteproc_internal.h" /* kick the remote processor, and let it know which virtqueue to poke at */ -static void rproc_virtio_notify(struct virtqueue *vq) +static bool rproc_virtio_notify(struct virtqueue *vq) { struct rproc_vring *rvring = vq->priv; struct rproc *rproc = rvring->rvdev->rproc; @@ -39,6 +39,7 @@ static void rproc_virtio_notify(struct virtqueue *vq) dev_dbg(&rproc->dev, "kicking vq index: %d\n", notifyid); rproc->ops->kick(rproc, notifyid); + return true; } /** diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 45560ffb038..965a9da7086 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -209,7 +209,7 @@ static int hid_rtc_read_time(struct device *dev, struct rtc_time *tm) platform_get_drvdata(to_platform_device(dev)); int ret; - INIT_COMPLETION(time_state->comp_last_time); + reinit_completion(&time_state->comp_last_time); /* get a report with all values through requesting one value */ sensor_hub_input_attr_get_raw_value(time_state->common_attributes.hsdev, HID_USAGE_SENSOR_TIME, hid_time_addresses[0], @@ -236,7 +236,7 @@ static const struct rtc_class_ops hid_time_rtc_ops = { static int hid_time_probe(struct platform_device *pdev) { int ret = 0; - struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data; + struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev); struct hid_time_state *time_state = devm_kzalloc(&pdev->dev, sizeof(struct hid_time_state), GFP_KERNEL); @@ -281,11 +281,18 @@ static int hid_time_probe(struct platform_device *pdev) goto err_open; } + /* + * Enable HID input processing early in order to be able to read the + * clock already in devm_rtc_device_register(). + */ + hid_device_io_start(hsdev->hdev); + time_state->rtc = devm_rtc_device_register(&pdev->dev, "hid-sensor-time", &hid_time_rtc_ops, THIS_MODULE); if (IS_ERR_OR_NULL(time_state->rtc)) { + hid_device_io_stop(hsdev->hdev); ret = time_state->rtc ? PTR_ERR(time_state->rtc) : -ENODEV; time_state->rtc = NULL; dev_err(&pdev->dev, "rtc device register failed!\n"); @@ -303,7 +310,7 @@ err_open: static int hid_time_remove(struct platform_device *pdev) { - struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data; + struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev); sensor_hub_device_close(hsdev); sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME); diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index af2166fa515..1abd0db2991 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -166,11 +166,15 @@ static void kvm_reset(struct virtio_device *vdev) * make a hypercall. We hand the address of the virtqueue so the Host * knows which virtqueue we're talking about. */ -static void kvm_notify(struct virtqueue *vq) +static bool kvm_notify(struct virtqueue *vq) { + long rc; struct kvm_vqconfig *config = vq->priv; - kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, config->address); + rc = kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, config->address); + if (rc < 0) + return false; + return true; } /* diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 779dc513629..d6297176ab8 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -162,7 +162,7 @@ static inline long do_kvm_notify(struct subchannel_id schid, return __rc; } -static void virtio_ccw_kvm_notify(struct virtqueue *vq) +static bool virtio_ccw_kvm_notify(struct virtqueue *vq) { struct virtio_ccw_vq_info *info = vq->priv; struct virtio_ccw_device *vcdev; @@ -171,6 +171,9 @@ static void virtio_ccw_kvm_notify(struct virtqueue *vq) vcdev = to_vc_device(info->vq->vdev); ccw_device_get_schid(vcdev->cdev, &schid); info->cookie = do_kvm_notify(schid, vq->index, info->cookie); + if (info->cookie < 0) + return false; + return true; } static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 74b88efde6a..c3173dced87 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -224,6 +224,9 @@ static void virtscsi_vq_done(struct virtio_scsi *vscsi, virtqueue_disable_cb(vq); while ((buf = virtqueue_get_buf(vq, &len)) != NULL) fn(vscsi, buf); + + if (unlikely(virtqueue_is_broken(vq))) + break; } while (!virtqueue_enable_cb(vq)); spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); } @@ -710,19 +713,15 @@ static struct scsi_host_template virtscsi_host_template_multi = { #define virtscsi_config_get(vdev, fld) \ ({ \ typeof(((struct virtio_scsi_config *)0)->fld) __val; \ - vdev->config->get(vdev, \ - offsetof(struct virtio_scsi_config, fld), \ - &__val, sizeof(__val)); \ + virtio_cread(vdev, struct virtio_scsi_config, fld, &__val); \ __val; \ }) #define virtscsi_config_set(vdev, fld, val) \ - (void)({ \ + do { \ typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \ - vdev->config->set(vdev, \ - offsetof(struct virtio_scsi_config, fld), \ - &__val, sizeof(__val)); \ - }) + virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \ + } while(0) static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) { @@ -954,7 +953,7 @@ static void virtscsi_remove(struct virtio_device *vdev) scsi_host_put(shost); } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int virtscsi_freeze(struct virtio_device *vdev) { virtscsi_remove_vqs(vdev); @@ -988,7 +987,7 @@ static struct virtio_driver virtio_scsi_driver = { .id_table = id_table, .probe = virtscsi_probe, .scan = virtscsi_scan, -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP .freeze = virtscsi_freeze, .restore = virtscsi_restore, #endif diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c index 4c332143a31..3ed666fe840 100644 --- a/drivers/spi/spi-bcm2835.c +++ b/drivers/spi/spi-bcm2835.c @@ -217,7 +217,7 @@ static int bcm2835_spi_start_transfer(struct spi_device *spi, cs |= spi->chip_select; } - INIT_COMPLETION(bs->done); + reinit_completion(&bs->done); bs->tx_buf = tfr->tx_buf; bs->rx_buf = tfr->rx_buf; bs->len = tfr->len; diff --git a/drivers/spi/spi-clps711x.c b/drivers/spi/spi-clps711x.c index e2a5a426b2e..6f03d7e6435 100644 --- a/drivers/spi/spi-clps711x.c +++ b/drivers/spi/spi-clps711x.c @@ -105,7 +105,7 @@ static int spi_clps711x_transfer_one_message(struct spi_master *master, gpio_set_value(cs, !!(msg->spi->mode & SPI_CS_HIGH)); - INIT_COMPLETION(hw->done); + reinit_completion(&hw->done); hw->count = 0; hw->len = xfer->len; diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c index dd72445ba2e..50b2d88c819 100644 --- a/drivers/spi/spi-davinci.c +++ b/drivers/spi/spi-davinci.c @@ -554,7 +554,7 @@ static int davinci_spi_bufs(struct spi_device *spi, struct spi_transfer *t) clear_io_bits(dspi->base + SPIGCR1, SPIGCR1_POWERDOWN_MASK); set_io_bits(dspi->base + SPIGCR1, SPIGCR1_SPIENA_MASK); - INIT_COMPLETION(dspi->done); + reinit_completion(&dspi->done); if (spicfg->io_type == SPI_IO_TYPE_INTR) set_io_bits(dspi->base + SPIINT, SPIINT_MASKINT); diff --git a/drivers/spi/spi-fsl-espi.c b/drivers/spi/spi-fsl-espi.c index 32200d4f878..80d8f40f7e0 100644 --- a/drivers/spi/spi-fsl-espi.c +++ b/drivers/spi/spi-fsl-espi.c @@ -232,7 +232,7 @@ static int fsl_espi_bufs(struct spi_device *spi, struct spi_transfer *t) mpc8xxx_spi->tx = t->tx_buf; mpc8xxx_spi->rx = t->rx_buf; - INIT_COMPLETION(mpc8xxx_spi->done); + reinit_completion(&mpc8xxx_spi->done); /* Set SPCOM[CS] and SPCOM[TRANLEN] field */ if ((t->len - 1) > SPCOM_TRANLEN_MAX) { diff --git a/drivers/spi/spi-fsl-spi.c b/drivers/spi/spi-fsl-spi.c index 2129fcd1c31..119f7af9453 100644 --- a/drivers/spi/spi-fsl-spi.c +++ b/drivers/spi/spi-fsl-spi.c @@ -339,7 +339,7 @@ static int fsl_spi_bufs(struct spi_device *spi, struct spi_transfer *t, mpc8xxx_spi->tx = t->tx_buf; mpc8xxx_spi->rx = t->rx_buf; - INIT_COMPLETION(mpc8xxx_spi->done); + reinit_completion(&mpc8xxx_spi->done); if (mpc8xxx_spi->flags & SPI_CPM_MODE) ret = fsl_spi_cpm_bufs(mpc8xxx_spi, t, is_dma_mapped); diff --git a/drivers/spi/spi-mpc512x-psc.c b/drivers/spi/spi-mpc512x-psc.c index 58d5ee0e444..9602bbd8d7e 100644 --- a/drivers/spi/spi-mpc512x-psc.c +++ b/drivers/spi/spi-mpc512x-psc.c @@ -167,7 +167,7 @@ static int mpc512x_psc_spi_transfer_rxtx(struct spi_device *spi, } /* have the ISR trigger when the TX FIFO is empty */ - INIT_COMPLETION(mps->txisrdone); + reinit_completion(&mps->txisrdone); out_be32(&fifo->txisr, MPC512x_PSC_FIFO_EMPTY); out_be32(&fifo->tximr, MPC512x_PSC_FIFO_EMPTY); wait_for_completion(&mps->txisrdone); diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c index de333059a9a..73afb56c08c 100644 --- a/drivers/spi/spi-mxs.c +++ b/drivers/spi/spi-mxs.c @@ -202,7 +202,7 @@ static int mxs_spi_txrx_dma(struct mxs_spi *spi, if (!dma_xfer) return -ENOMEM; - INIT_COMPLETION(spi->c); + reinit_completion(&spi->c); /* Chip select was already programmed into CTRL0 */ ctrl0 = readl(ssp->base + HW_SSP_CTRL0); diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index 9e2020df9e0..4c4b0a1219a 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -890,7 +890,7 @@ static int s3c64xx_spi_transfer_one(struct spi_master *master, unsigned long flags; int use_dma; - INIT_COMPLETION(sdd->xfer_completion); + reinit_completion(&sdd->xfer_completion); /* Only BPW and Speed may change across transfers */ bpw = xfer->bits_per_word; diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 2a95435a6a1..c74298cf70e 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -465,7 +465,7 @@ static int sh_msiof_spi_txrx_once(struct sh_msiof_spi_priv *p, ret = ret ? ret : sh_msiof_modify_ctr_wait(p, 0, CTR_TXE); /* start by setting frame bit */ - INIT_COMPLETION(p->done); + reinit_completion(&p->done); ret = ret ? ret : sh_msiof_modify_ctr_wait(p, 0, CTR_TFSE); if (ret) { dev_err(&p->pdev->dev, "failed to start hardware\n"); diff --git a/drivers/spi/spi-sirf.c b/drivers/spi/spi-sirf.c index 592b4aff651..ed5e501c465 100644 --- a/drivers/spi/spi-sirf.c +++ b/drivers/spi/spi-sirf.c @@ -305,8 +305,8 @@ static int spi_sirfsoc_transfer(struct spi_device *spi, struct spi_transfer *t) sspi->tx = t->tx_buf ? t->tx_buf : sspi->dummypage; sspi->rx = t->rx_buf ? t->rx_buf : sspi->dummypage; sspi->left_tx_word = sspi->left_rx_word = t->len / sspi->word_width; - INIT_COMPLETION(sspi->rx_done); - INIT_COMPLETION(sspi->tx_done); + reinit_completion(&sspi->rx_done); + reinit_completion(&sspi->tx_done); writel(SIRFSOC_SPI_INT_MASK_ALL, sspi->base + SIRFSOC_SPI_INT_STATUS); diff --git a/drivers/spi/spi-tegra114.c b/drivers/spi/spi-tegra114.c index 9146bb3c248..aaecfb3ebf5 100644 --- a/drivers/spi/spi-tegra114.c +++ b/drivers/spi/spi-tegra114.c @@ -451,7 +451,7 @@ static void tegra_spi_dma_complete(void *args) static int tegra_spi_start_tx_dma(struct tegra_spi_data *tspi, int len) { - INIT_COMPLETION(tspi->tx_dma_complete); + reinit_completion(&tspi->tx_dma_complete); tspi->tx_dma_desc = dmaengine_prep_slave_single(tspi->tx_dma_chan, tspi->tx_dma_phys, len, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); @@ -470,7 +470,7 @@ static int tegra_spi_start_tx_dma(struct tegra_spi_data *tspi, int len) static int tegra_spi_start_rx_dma(struct tegra_spi_data *tspi, int len) { - INIT_COMPLETION(tspi->rx_dma_complete); + reinit_completion(&tspi->rx_dma_complete); tspi->rx_dma_desc = dmaengine_prep_slave_single(tspi->rx_dma_chan, tspi->rx_dma_phys, len, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); @@ -844,7 +844,7 @@ static int tegra_spi_transfer_one_message(struct spi_master *master, list_for_each_entry(xfer, &msg->transfers, transfer_list) { unsigned long cmd1; - INIT_COMPLETION(tspi->xfer_completion); + reinit_completion(&tspi->xfer_completion); cmd1 = tegra_spi_setup_transfer_one(spi, xfer, is_first_msg); diff --git a/drivers/spi/spi-tegra20-sflash.c b/drivers/spi/spi-tegra20-sflash.c index 79be8ce6a9d..4dc8e812945 100644 --- a/drivers/spi/spi-tegra20-sflash.c +++ b/drivers/spi/spi-tegra20-sflash.c @@ -339,7 +339,7 @@ static int tegra_sflash_transfer_one_message(struct spi_master *master, msg->actual_length = 0; single_xfer = list_is_singular(&msg->transfers); list_for_each_entry(xfer, &msg->transfers, transfer_list) { - INIT_COMPLETION(tsd->xfer_completion); + reinit_completion(&tsd->xfer_completion); ret = tegra_sflash_start_transfer_one(spi, xfer, is_first_msg, single_xfer); if (ret < 0) { diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c index af0a67886ae..e66715ba37e 100644 --- a/drivers/spi/spi-tegra20-slink.c +++ b/drivers/spi/spi-tegra20-slink.c @@ -462,7 +462,7 @@ static void tegra_slink_dma_complete(void *args) static int tegra_slink_start_tx_dma(struct tegra_slink_data *tspi, int len) { - INIT_COMPLETION(tspi->tx_dma_complete); + reinit_completion(&tspi->tx_dma_complete); tspi->tx_dma_desc = dmaengine_prep_slave_single(tspi->tx_dma_chan, tspi->tx_dma_phys, len, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); @@ -481,7 +481,7 @@ static int tegra_slink_start_tx_dma(struct tegra_slink_data *tspi, int len) static int tegra_slink_start_rx_dma(struct tegra_slink_data *tspi, int len) { - INIT_COMPLETION(tspi->rx_dma_complete); + reinit_completion(&tspi->rx_dma_complete); tspi->rx_dma_desc = dmaengine_prep_slave_single(tspi->rx_dma_chan, tspi->rx_dma_phys, len, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); @@ -836,7 +836,7 @@ static int tegra_slink_transfer_one(struct spi_master *master, struct tegra_slink_data *tspi = spi_master_get_devdata(master); int ret; - INIT_COMPLETION(tspi->xfer_completion); + reinit_completion(&tspi->xfer_completion); ret = tegra_slink_start_transfer_one(spi, xfer); if (ret < 0) { dev_err(tspi->dev, diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c index ec3a83f52ea..6d4ce461516 100644 --- a/drivers/spi/spi-xilinx.c +++ b/drivers/spi/spi-xilinx.c @@ -258,7 +258,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t) xspi->tx_ptr = t->tx_buf; xspi->rx_ptr = t->rx_buf; xspi->remaining_bytes = t->len; - INIT_COMPLETION(xspi->done); + reinit_completion(&xspi->done); /* Enable the transmit empty interrupt, which we use to determine diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 927998aa5e7..8d85ddc4601 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -571,7 +571,7 @@ static int spi_transfer_one_message(struct spi_master *master, list_for_each_entry(xfer, &msg->transfers, transfer_list) { trace_spi_transfer_start(msg, xfer); - INIT_COMPLETION(master->xfer_completion); + reinit_completion(&master->xfer_completion); ret = master->transfer_one(master, msg->spi, xfer); if (ret < 0) { diff --git a/drivers/staging/iio/adc/mxs-lradc.c b/drivers/staging/iio/adc/mxs-lradc.c index aeae76b77be..e2dd7830b32 100644 --- a/drivers/staging/iio/adc/mxs-lradc.c +++ b/drivers/staging/iio/adc/mxs-lradc.c @@ -783,7 +783,7 @@ static int mxs_lradc_read_raw(struct iio_dev *iio_dev, if (!ret) return -EBUSY; - INIT_COMPLETION(lradc->completion); + reinit_completion(&lradc->completion); /* * No buffered operation in progress, map the channel and trigger it. diff --git a/drivers/staging/media/solo6x10/solo6x10-p2m.c b/drivers/staging/media/solo6x10/solo6x10-p2m.c index 333594189b8..7f2f2472655 100644 --- a/drivers/staging/media/solo6x10/solo6x10-p2m.c +++ b/drivers/staging/media/solo6x10/solo6x10-p2m.c @@ -87,7 +87,7 @@ int solo_p2m_dma_desc(struct solo_dev *solo_dev, if (mutex_lock_interruptible(&p2m_dev->mutex)) return -EINTR; - INIT_COMPLETION(p2m_dev->completion); + reinit_completion(&p2m_dev->completion); p2m_dev->error = 0; if (desc_cnt > 1 && solo_dev->type != SOLO_DEV_6110 && desc_mode) { diff --git a/drivers/staging/tidspbridge/core/sync.c b/drivers/staging/tidspbridge/core/sync.c index 7bb550acaf4..743ff09d82d 100644 --- a/drivers/staging/tidspbridge/core/sync.c +++ b/drivers/staging/tidspbridge/core/sync.c @@ -72,7 +72,7 @@ int sync_wait_on_multiple_events(struct sync_object **events, spin_lock_bh(&sync_lock); for (i = 0; i < count; i++) { if (completion_done(&events[i]->comp)) { - INIT_COMPLETION(events[i]->comp); + reinit_completion(&events[i]->comp); *index = i; spin_unlock_bh(&sync_lock); status = 0; @@ -92,7 +92,7 @@ int sync_wait_on_multiple_events(struct sync_object **events, spin_lock_bh(&sync_lock); for (i = 0; i < count; i++) { if (completion_done(&events[i]->comp)) { - INIT_COMPLETION(events[i]->comp); + reinit_completion(&events[i]->comp); *index = i; status = 0; } diff --git a/drivers/staging/tidspbridge/include/dspbridge/sync.h b/drivers/staging/tidspbridge/include/dspbridge/sync.h index 58a0d5c5543..fc19b970708 100644 --- a/drivers/staging/tidspbridge/include/dspbridge/sync.h +++ b/drivers/staging/tidspbridge/include/dspbridge/sync.h @@ -59,7 +59,7 @@ static inline void sync_init_event(struct sync_object *event) static inline void sync_reset_event(struct sync_object *event) { - INIT_COMPLETION(event->comp); + reinit_completion(&event->comp); event->multi_comp = NULL; } diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c index 6d04eb48bfb..1aa4a3fd0f1 100644 --- a/drivers/staging/tidspbridge/rmgr/drv_interface.c +++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c @@ -332,7 +332,7 @@ static void bridge_recover(struct work_struct *work) struct dev_object *dev; struct cfg_devnode *dev_node; if (atomic_read(&bridge_cref)) { - INIT_COMPLETION(bridge_comp); + reinit_completion(&bridge_comp); while (!wait_for_completion_timeout(&bridge_comp, msecs_to_jiffies(REC_TIMEOUT))) pr_info("%s:%d handle(s) still opened\n", @@ -348,7 +348,7 @@ static void bridge_recover(struct work_struct *work) void bridge_recover_schedule(void) { - INIT_COMPLETION(bridge_open_comp); + reinit_completion(&bridge_open_comp); recover = true; queue_work(bridge_rec_queue, &bridge_recovery_work); } @@ -389,7 +389,7 @@ static int omap3_bridge_startup(struct platform_device *pdev) #ifdef CONFIG_TIDSPBRIDGE_RECOVERY bridge_rec_queue = create_workqueue("bridge_rec_queue"); INIT_WORK(&bridge_recovery_work, bridge_recover); - INIT_COMPLETION(bridge_comp); + reinit_completion(&bridge_comp); #endif #ifdef CONFIG_PM diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index c193af6a628..636c9baad7a 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -183,7 +183,7 @@ static int dom0_write_console(uint32_t vtermno, const char *str, int len) { int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); if (rc < 0) - return 0; + return rc; return len; } @@ -642,7 +642,22 @@ struct console xenboot_console = { void xen_raw_console_write(const char *str) { - dom0_write_console(0, str, strlen(str)); + ssize_t len = strlen(str); + int rc = 0; + + if (xen_domain()) { + rc = dom0_write_console(0, str, len); +#ifdef CONFIG_X86 + if (rc == -ENOSYS && xen_hvm_domain()) + goto outb_print; + + } else if (xen_cpuid_base()) { + int i; +outb_print: + for (i = 0; i < len; i++) + outb(str[i], 0xe9); +#endif + } } void xen_raw_printk(const char *fmt, ...) diff --git a/drivers/tty/metag_da.c b/drivers/tty/metag_da.c index 0e888621f48..7332e2ca461 100644 --- a/drivers/tty/metag_da.c +++ b/drivers/tty/metag_da.c @@ -495,7 +495,7 @@ static int dashtty_write(struct tty_struct *tty, const unsigned char *buf, count = dport->xmit_cnt; /* xmit buffer no longer empty? */ if (count) - INIT_COMPLETION(dport->xmit_empty); + reinit_completion(&dport->xmit_empty); mutex_unlock(&dport->xmit_lock); if (total) { diff --git a/drivers/usb/c67x00/c67x00-sched.c b/drivers/usb/c67x00/c67x00-sched.c index aa491627a45..892cc96466e 100644 --- a/drivers/usb/c67x00/c67x00-sched.c +++ b/drivers/usb/c67x00/c67x00-sched.c @@ -344,7 +344,7 @@ void c67x00_endpoint_disable(struct usb_hcd *hcd, struct usb_host_endpoint *ep) /* it could happen that we reinitialize this completion, while * somebody was waiting for that completion. The timeout and * while loop handle such cases, but this might be improved */ - INIT_COMPLETION(c67x00->endpoint_disable); + reinit_completion(&c67x00->endpoint_disable); c67x00_sched_kick(c67x00); wait_for_completion_timeout(&c67x00->endpoint_disable, 1 * HZ); diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c index 44cf775a862..774e8b89cdb 100644 --- a/drivers/usb/gadget/f_fs.c +++ b/drivers/usb/gadget/f_fs.c @@ -373,7 +373,7 @@ static int __ffs_ep0_queue_wait(struct ffs_data *ffs, char *data, size_t len) if (req->buf == NULL) req->buf = (void *)0xDEADBABE; - INIT_COMPLETION(ffs->ep0req_completion); + reinit_completion(&ffs->ep0req_completion); ret = usb_ep_queue(ffs->gadget->ep0, req, GFP_ATOMIC); if (unlikely(ret < 0)) diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index 84657e07dc5..439c951f261 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -455,7 +455,7 @@ static int parport_prologue(struct parport *pp) return -1; } mos_parport->msg_pending = true; /* synch usb call pending */ - INIT_COMPLETION(mos_parport->syncmsg_compl); + reinit_completion(&mos_parport->syncmsg_compl); spin_unlock(&release_lock); mutex_lock(&mos_parport->serial->disc_mutex); diff --git a/drivers/video/exynos/exynos_mipi_dsi_common.c b/drivers/video/exynos/exynos_mipi_dsi_common.c index 7eed957b601..85edabfdef5 100644 --- a/drivers/video/exynos/exynos_mipi_dsi_common.c +++ b/drivers/video/exynos/exynos_mipi_dsi_common.c @@ -220,7 +220,7 @@ int exynos_mipi_dsi_wr_data(struct mipi_dsim_device *dsim, unsigned int data_id, case MIPI_DSI_DCS_LONG_WRITE: { unsigned int size, payload = 0; - INIT_COMPLETION(dsim_wr_comp); + reinit_completion(&dsim_wr_comp); size = data_size * 4; @@ -356,7 +356,7 @@ int exynos_mipi_dsi_rd_data(struct mipi_dsim_device *dsim, unsigned int data_id, msleep(20); mutex_lock(&dsim->lock); - INIT_COMPLETION(dsim_rd_comp); + reinit_completion(&dsim_rd_comp); exynos_mipi_dsi_rd_tx_header(dsim, MIPI_DSI_SET_MAXIMUM_RETURN_PACKET_SIZE, req_size); diff --git a/drivers/video/omap2/displays-new/encoder-tpd12s015.c b/drivers/video/omap2/displays-new/encoder-tpd12s015.c index 798ef200b05..d5c936cb217 100644 --- a/drivers/video/omap2/displays-new/encoder-tpd12s015.c +++ b/drivers/video/omap2/displays-new/encoder-tpd12s015.c @@ -69,7 +69,7 @@ static int tpd_connect(struct omap_dss_device *dssdev, dst->src = dssdev; dssdev->dst = dst; - INIT_COMPLETION(ddata->hpd_completion); + reinit_completion(&ddata->hpd_completion); gpio_set_value_cansleep(ddata->ct_cp_hpd_gpio, 1); /* DC-DC converter needs at max 300us to get to 90% of 5V */ diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1f572c00a1b..c444654fc33 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -275,9 +275,8 @@ static inline s64 towards_target(struct virtio_balloon *vb) __le32 v; s64 target; - vb->vdev->config->get(vb->vdev, - offsetof(struct virtio_balloon_config, num_pages), - &v, sizeof(v)); + virtio_cread(vb->vdev, struct virtio_balloon_config, num_pages, &v); + target = le32_to_cpu(v); return target - vb->num_pages; } @@ -286,9 +285,8 @@ static void update_balloon_size(struct virtio_balloon *vb) { __le32 actual = cpu_to_le32(vb->num_pages); - vb->vdev->config->set(vb->vdev, - offsetof(struct virtio_balloon_config, actual), - &actual, sizeof(actual)); + virtio_cwrite(vb->vdev, struct virtio_balloon_config, num_pages, + &actual); } static int balloon(void *_vballoon) @@ -513,7 +511,7 @@ static void virtballoon_remove(struct virtio_device *vdev) kfree(vb); } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int virtballoon_freeze(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; @@ -556,7 +554,7 @@ static struct virtio_driver virtio_balloon_driver = { .probe = virtballoon_probe, .remove = virtballoon_remove, .config_changed = virtballoon_changed, -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP .freeze = virtballoon_freeze, .restore = virtballoon_restore, #endif diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 1ba0d683101..c600ccfd692 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -219,13 +219,14 @@ static void vm_reset(struct virtio_device *vdev) /* Transport interface */ /* the notify function used when creating a virt queue */ -static void vm_notify(struct virtqueue *vq) +static bool vm_notify(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); /* We write the queue's selector into the notification register to * signal the other end */ writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + return true; } /* Notify all virtqueues on an interrupt. */ @@ -470,7 +471,7 @@ static int virtio_mmio_probe(struct platform_device *pdev) /* Check magic value */ magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE); - if (memcmp(&magic, "virt", 4) != 0) { + if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) { dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic); return -ENODEV; } diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 98917fc872a..a37c69941d3 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -197,13 +197,14 @@ static void vp_reset(struct virtio_device *vdev) } /* the notify function used when creating a virt queue */ -static void vp_notify(struct virtqueue *vq) +static bool vp_notify(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); + return true; } /* Handle a configuration change: Tell driver if it wants to know. */ diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6b4a4db4404..28b5338fff7 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -81,7 +81,7 @@ struct vring_virtqueue u16 last_used_idx; /* How to notify other side. FIXME: commonalize hcalls! */ - void (*notify)(struct virtqueue *vq); + bool (*notify)(struct virtqueue *vq); #ifdef DEBUG /* They're supposed to lock for us. */ @@ -173,6 +173,8 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq, head = vq->free_head; vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT; vq->vring.desc[head].addr = virt_to_phys(desc); + /* kmemleak gives a false positive, as it's hidden by virt_to_phys */ + kmemleak_ignore(desc); vq->vring.desc[head].len = i * sizeof(struct vring_desc); /* Update free pointer */ @@ -428,13 +430,22 @@ EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); * @vq: the struct virtqueue * * This does not need to be serialized. + * + * Returns false if host notify failed or queue is broken, otherwise true. */ -void virtqueue_notify(struct virtqueue *_vq) +bool virtqueue_notify(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + if (unlikely(vq->broken)) + return false; + /* Prod other side to tell it about changes. */ - vq->notify(_vq); + if (!vq->notify(_vq)) { + vq->broken = true; + return false; + } + return true; } EXPORT_SYMBOL_GPL(virtqueue_notify); @@ -447,11 +458,14 @@ EXPORT_SYMBOL_GPL(virtqueue_notify); * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). + * + * Returns false if kick failed, otherwise true. */ -void virtqueue_kick(struct virtqueue *vq) +bool virtqueue_kick(struct virtqueue *vq) { if (virtqueue_kick_prepare(vq)) - virtqueue_notify(vq); + return virtqueue_notify(vq); + return true; } EXPORT_SYMBOL_GPL(virtqueue_kick); @@ -742,7 +756,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, struct virtio_device *vdev, bool weak_barriers, void *pages, - void (*notify)(struct virtqueue *), + bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) { @@ -837,4 +851,12 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); +bool virtqueue_is_broken(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->broken; +} +EXPORT_SYMBOL_GPL(virtqueue_is_broken); + MODULE_LICENSE("GPL"); diff --git a/drivers/w1/masters/w1-gpio.c b/drivers/w1/masters/w1-gpio.c index 264ad1c583a..e36b18b2817 100644 --- a/drivers/w1/masters/w1-gpio.c +++ b/drivers/w1/masters/w1-gpio.c @@ -56,7 +56,7 @@ MODULE_DEVICE_TABLE(of, w1_gpio_dt_ids); static int w1_gpio_probe_dt(struct platform_device *pdev) { - struct w1_gpio_platform_data *pdata = pdev->dev.platform_data; + struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev); struct device_node *np = pdev->dev.of_node; int gpio; @@ -92,7 +92,7 @@ static int w1_gpio_probe(struct platform_device *pdev) } } - pdata = pdev->dev.platform_data; + pdata = dev_get_platdata(&pdev->dev); if (!pdata) { dev_err(&pdev->dev, "No configuration data\n"); @@ -154,7 +154,7 @@ static int w1_gpio_probe(struct platform_device *pdev) static int w1_gpio_remove(struct platform_device *pdev) { struct w1_bus_master *master = platform_get_drvdata(pdev); - struct w1_gpio_platform_data *pdata = pdev->dev.platform_data; + struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev); if (pdata->enable_external_pullup) pdata->enable_external_pullup(0); @@ -171,7 +171,7 @@ static int w1_gpio_remove(struct platform_device *pdev) static int w1_gpio_suspend(struct platform_device *pdev, pm_message_t state) { - struct w1_gpio_platform_data *pdata = pdev->dev.platform_data; + struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev); if (pdata->enable_external_pullup) pdata->enable_external_pullup(0); @@ -181,7 +181,7 @@ static int w1_gpio_suspend(struct platform_device *pdev, pm_message_t state) static int w1_gpio_resume(struct platform_device *pdev) { - struct w1_gpio_platform_data *pdata = pdev->dev.platform_data; + struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev); if (pdata->enable_external_pullup) pdata->enable_external_pullup(1); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 23eae5cb69c..c794ea18214 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -140,7 +140,6 @@ config XEN_GRANT_DEV_ALLOC config SWIOTLB_XEN def_bool y - depends on PCI && X86 select SWIOTLB config XEN_TMEM diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index b232908a619..55ea73f7c70 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -596,7 +596,7 @@ static void __init balloon_add_region(unsigned long start_pfn, } } -static int __cpuinit balloon_cpu_notify(struct notifier_block *self, +static int balloon_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (long)hcpu; @@ -616,7 +616,7 @@ static int __cpuinit balloon_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block balloon_cpu_notifier __cpuinitdata = { +static struct notifier_block balloon_cpu_notifier = { .notifier_call = balloon_cpu_notify, }; @@ -641,7 +641,7 @@ static int __init balloon_init(void) balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) - : max_pfn; + : get_num_physpages(); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 8b3a69a06c3..5de2063e16d 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -305,7 +305,7 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) if (rc < 0) goto err; - rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, u->name, evtchn); if (rc < 0) goto err; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index c4d2298893b..62ccf5424ba 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -49,6 +49,7 @@ #include <xen/grant_table.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> #include <asm/xen/hypercall.h> #include <asm/xen/interface.h> @@ -898,8 +899,16 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, &map_ops[i].status, __func__); - if (xen_feature(XENFEAT_auto_translated_physmap)) + /* this is basically a nop on x86 */ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + for (i = 0; i < count; i++) { + if (map_ops[i].status) + continue; + set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT, + map_ops[i].dev_bus_addr >> PAGE_SHIFT); + } return ret; + } if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { arch_enter_lazy_mmu_mode(); @@ -942,8 +951,14 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) + /* this is basically a nop on x86 */ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + for (i = 0; i < count; i++) { + set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } return ret; + } if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { arch_enter_lazy_mmu_mode(); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 18fff88254e..d15f6e80479 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,6 +26,7 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#include <asm/pci_x86.h> static bool __read_mostly pci_seg_supported = true; @@ -192,3 +193,49 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 99db9e1eb8b..2f3528e93cb 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_TRIGGER_RISING, "xen-platform-pci", pdev); } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 1b2277c311d..a224bc74b6b 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -42,12 +42,31 @@ #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h> /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + + return dma_mask; +} +#endif + static char *xen_io_tlb_start, *xen_io_tlb_end; static unsigned long xen_io_tlb_nslabs; /* @@ -56,17 +75,17 @@ static unsigned long xen_io_tlb_nslabs; static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { return phys_to_machine(XPADDR(paddr)).maddr; } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { return machine_to_phys(XMADDR(baddr)).paddr; } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address) { return xen_phys_to_bus(virt_to_phys(address)); } @@ -89,7 +108,7 @@ static int check_pages_physically_contiguous(unsigned long pfn, return 1; } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long pfn = PFN_DOWN(p); unsigned int offset = p & ~PAGE_MASK; @@ -126,6 +145,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { int i, rc; int dma_bits; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -135,9 +156,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) do { rc = xen_create_contiguous_region( - (unsigned long)buf + (i << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), - dma_bits); + dma_bits, &dma_handle); } while (rc && dma_bits++ < max_dma_bits); if (rc) return rc; @@ -263,7 +284,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, void *ret; int order = get_order(size); u64 dma_mask = DMA_BIT_MASK(32); - unsigned long vstart; phys_addr_t phys; dma_addr_t dev_addr; @@ -278,8 +298,12 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) return ret; - vstart = __get_free_pages(flags, order); - ret = (void *)vstart; + /* On ARM this function returns an ioremap'ped virtual address for + * which virt_to_phys doesn't return the corresponding physical + * address. In fact on ARM virt_to_phys only works for kernel direct + * mapped RAM memory. Also see comment below. + */ + ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); if (!ret) return ret; @@ -287,18 +311,21 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (hwdev && hwdev->coherent_dma_mask) dma_mask = dma_alloc_coherent_mask(hwdev, flags); - phys = virt_to_phys(ret); + /* At this point dma_handle is the physical address, next we are + * going to set it to the machine address. + * Do not use virt_to_phys(ret) because on ARM it doesn't correspond + * to *dma_handle. */ + phys = *dma_handle; dev_addr = xen_phys_to_bus(phys); if (((dev_addr + size - 1 <= dma_mask)) && !range_straddles_page_boundary(phys, size)) *dma_handle = dev_addr; else { - if (xen_create_contiguous_region(vstart, order, - fls64(dma_mask)) != 0) { - free_pages(vstart, order); + if (xen_create_contiguous_region(phys, order, + fls64(dma_mask), dma_handle) != 0) { + xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); return NULL; } - *dma_handle = virt_to_machine(ret).maddr; } memset(ret, 0, size); return ret; @@ -319,13 +346,15 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (hwdev && hwdev->coherent_dma_mask) dma_mask = hwdev->coherent_dma_mask; - phys = virt_to_phys(vaddr); + /* do not use virt_to_phys because on ARM it doesn't return you the + * physical address */ + phys = xen_bus_to_phys(dev_addr); if (((dev_addr + size - 1 > dma_mask)) || range_straddles_page_boundary(phys, size)) - xen_destroy_contiguous_region((unsigned long)vaddr, order); + xen_destroy_contiguous_region(phys, order); - free_pages((unsigned long)vaddr, order); + xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -352,16 +381,25 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (dma_capable(dev, dev_addr, size) && - !range_straddles_page_boundary(phys, size) && !swiotlb_force) + !range_straddles_page_boundary(phys, size) && !swiotlb_force) { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(dev, page, offset, size, dir, attrs); return dev_addr; + } /* * Oh well, have to allocate and map a bounce buffer. */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; + xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, size, dir, attrs); dev_addr = xen_phys_to_bus(map); /* @@ -384,12 +422,15 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); * whatever the device wrote there. */ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { phys_addr_t paddr = xen_bus_to_phys(dev_addr); BUG_ON(dir == DMA_NONE); + xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); + /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); @@ -412,7 +453,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - xen_unmap_single(hwdev, dev_addr, size, dir); + xen_unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -435,11 +476,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); + if (target == SYNC_FOR_CPU) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); + /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { + if (is_xen_swiotlb_buffer(dev_addr)) swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; - } + + if (target == SYNC_FOR_DEVICE) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); if (dir != DMA_FROM_DEVICE) return; @@ -502,16 +547,26 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sg->length, dir); if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. */ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); sg_dma_len(sgl) = 0; - return DMA_ERROR_CODE; + return 0; } sg->dma_address = xen_phys_to_bus(map); - } else + } else { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), + paddr & ~PAGE_MASK, + sg->length, + dir, + attrs); sg->dma_address = dev_addr; + } sg_dma_len(sg) = sg->length; } return nelems; @@ -533,7 +588,7 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir); + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); @@ -593,3 +648,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; } EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ + if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a91a6a355cc..1a44e42d602 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -14,4 +14,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o -btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ + tests/extent-buffer-tests.o tests/btrfs-tests.o \ + tests/extent-io-tests.o tests/inode-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index e15d2b0d8d3..0890c83643e 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -229,7 +229,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } else { + } else if (ret < 0) { cache_no_acl(inode); } } else { diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 08cc08f037a..8aec751fa46 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -262,7 +262,7 @@ static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, struct btrfs_work *work = NULL; struct list_head *cur = NULL; - if(!list_empty(prio_head)) + if (!list_empty(prio_head)) cur = prio_head->next; smp_mb(); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0552a599b28..3775947429b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -185,6 +185,9 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, { struct __prelim_ref *ref; + if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID) + return 0; + ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); if (!ref) return -ENOMEM; @@ -323,8 +326,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; while (!eb) { - if (!level) { - WARN_ON(1); + if (WARN_ON(!level)) { ret = 1; goto out; } @@ -1619,7 +1621,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 71f074e1870..ac0b39db27d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -19,6 +19,7 @@ #ifndef __BTRFS_I__ #define __BTRFS_I__ +#include <linux/hash.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" @@ -179,6 +180,25 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode) return container_of(inode, struct btrfs_inode, vfs_inode); } +static inline unsigned long btrfs_inode_hash(u64 objectid, + const struct btrfs_root *root) +{ + u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME); + +#if BITS_PER_LONG == 32 + h = (h >> 32) ^ (h & 0xffffffff); +#endif + + return (unsigned long)h; +} + +static inline void btrfs_insert_inode_hash(struct inode *inode) +{ + unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); + + __insert_inode_hash(inode, h); +} + static inline u64 btrfs_ino(struct inode *inode) { u64 ino = BTRFS_I(inode)->location.objectid; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 1c47be18724..e0aab445697 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1038,7 +1038,7 @@ leaf_item_out_of_bounce_error: disk_item_offset, sizeof(struct btrfs_item)); item_offset = btrfs_stack_item_offset(&disk_item); - item_size = btrfs_stack_item_offset(&disk_item); + item_size = btrfs_stack_item_size(&disk_item); disk_key = &disk_item.key; type = btrfs_disk_key_type(disk_key); @@ -1900,7 +1900,9 @@ again: dev_state, dev_bytenr); } - if (block->logical_bytenr != bytenr) { + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c," @@ -1910,15 +1912,14 @@ again: block->mirror_num, btrfsic_get_block_type(state, block), block->logical_bytenr); - block->logical_bytenr = bytenr; - } else if (state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) + else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); + block->logical_bytenr = bytenr; } else { if (num_pages * PAGE_CACHE_SIZE < state->datablock_size) { @@ -2463,10 +2464,8 @@ static int btrfsic_process_written_superblock( } } - if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { - WARN_ON(1); + if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) btrfsic_dump_tree(state); - } return 0; } @@ -2906,7 +2905,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, btrfsic_release_block_ctx(&block_ctx); } - if (!match) { + if (WARN_ON(!match)) { printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," " buffer->log_bytenr=%llu, submit_bio(bdev=%s," " phys_bytenr=%llu)!\n", @@ -2923,7 +2922,6 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, bytenr, block_ctx.dev->name, block_ctx.dev_bytenr, mirror_num); } - WARN_ON(1); } } diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h deleted file mode 100644 index 7c4503ef6ef..00000000000 --- a/fs/btrfs/compat.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _COMPAT_H_ -#define _COMPAT_H_ - -#define btrfs_drop_nlink(inode) drop_nlink(inode) -#define btrfs_inc_nlink(inode) inc_nlink(inode) - -#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6aad98cb343..1499b27b418 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,7 +32,6 @@ #include <linux/writeback.h> #include <linux/bit_spinlock.h> #include <linux/slab.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -360,7 +359,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - if(!bio) { + if (!bio) { kfree(cb); return -ENOMEM; } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 61b5bcd57b7..316136bd6dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -274,7 +274,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); @@ -996,7 +996,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); @@ -1285,11 +1285,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); blocksize = btrfs_level_size(root, old_root->level); old = read_tree_block(root, logical, blocksize, 0); - if (!old || !extent_buffer_uptodate(old)) { + if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", logical); - WARN_ON(1); } else { eb = btrfs_clone_extent_buffer(old); free_extent_buffer(old); @@ -2758,7 +2757,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, int level; int lowest_unlock = 1; u8 lowest_level = 0; - int prev_cmp; + int prev_cmp = -1; lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); @@ -2769,7 +2768,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, } again: - prev_cmp = -1; b = get_old_root(root, time_seq); level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; @@ -2787,6 +2785,11 @@ again: */ btrfs_unlock_up_safe(p, level + 1); + /* + * Since we can unwind eb's we want to do a real search every + * time. + */ + prev_cmp = -1; ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { @@ -3148,7 +3151,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c), + write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(c, root->fs_info->chunk_tree_uuid, @@ -3287,7 +3290,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); write_extent_buffer(split, root->fs_info->fsid, - btrfs_header_fsid(split), BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(split, root->fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); @@ -3337,8 +3340,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) if (!nr) return 0; btrfs_init_map_token(&token); - start_item = btrfs_item_nr(l, start); - end_item = btrfs_item_nr(l, end); + start_item = btrfs_item_nr(start); + end_item = btrfs_item_nr(end); data_len = btrfs_token_item_offset(l, start_item, &token) + btrfs_token_item_size(l, start_item, &token); data_len = data_len - btrfs_token_item_offset(l, end_item, &token); @@ -3406,7 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(left, i); + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] > i) @@ -3470,7 +3473,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); push_space -= btrfs_token_item_size(right, item, &token); btrfs_set_token_item_offset(right, item, push_space, &token); } @@ -3612,7 +3615,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] < i) @@ -3639,8 +3642,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, ret = 1; goto out; } - if (!empty && push_items == btrfs_header_nritems(right)) - WARN_ON(1); + WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ copy_extent_buffer(left, right, @@ -3663,7 +3665,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(left, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(left, item, &token); btrfs_set_token_item_offset(left, item, @@ -3694,7 +3696,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); push_space = push_space - btrfs_token_item_size(right, item, &token); @@ -3835,7 +3837,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_item_end_nr(l, mid); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); + struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; ioff = btrfs_token_item_offset(right, item, &token); @@ -4016,7 +4018,7 @@ again: data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (data_size && !tried_avoid_double) goto push_for_double; - split = 2 ; + split = 2; } } } @@ -4042,7 +4044,7 @@ again: btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, - btrfs_header_fsid(right), BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(right, root->fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(right), @@ -4177,7 +4179,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(path); - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); @@ -4200,7 +4202,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(leaf, slot); + new_item = btrfs_item_nr(slot); btrfs_set_item_offset(leaf, new_item, orig_offset); btrfs_set_item_size(leaf, new_item, item_size - split_offset); @@ -4339,7 +4341,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, @@ -4387,7 +4389,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, fixup_low_keys(root, path, &disk_key, 1); } - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, new_size); btrfs_mark_buffer_dirty(leaf); @@ -4441,7 +4443,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, @@ -4455,7 +4457,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, data_end = old_data; old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, old_size + data_size); btrfs_mark_buffer_dirty(leaf); @@ -4514,7 +4516,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr( i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, ioff - total_data, &token); @@ -4535,7 +4537,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, for (i = 0; i < nr; i++) { btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); + item = btrfs_item_nr(slot + i); btrfs_set_token_item_offset(leaf, item, data_end - data_size[i], &token); data_end -= data_size[i]; @@ -4730,7 +4732,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, ioff + dsize, &token); @@ -4823,14 +4825,18 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - if (key.offset > 0) + if (key.offset > 0) { key.offset--; - else if (key.type > 0) + } else if (key.type > 0) { key.type--; - else if (key.objectid > 0) + key.offset = (u64)-1; + } else if (key.objectid > 0) { key.objectid--; - else + key.type = (u8)-1; + key.offset = (u64)-1; + } else { return 1; + } btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -4866,7 +4872,6 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * was nothing in the tree that matched the search criteria. */ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, struct btrfs_path *path, u64 min_trans) { @@ -4911,10 +4916,8 @@ again: * If it is too old, old, skip to the next one. */ while (slot < nritems) { - u64 blockptr; u64 gen; - blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); if (gen < min_trans) { slot++; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0506f40ede8..aea4433081d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -47,6 +47,12 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define STATIC noinline +#else +#define STATIC static noinline +#endif + #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 @@ -1580,7 +1586,6 @@ struct btrfs_fs_info { atomic_t scrubs_paused; atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; - struct rw_semaphore scrub_super_lock; int scrub_workers_refcnt; struct btrfs_workers scrub_workers; struct btrfs_workers scrub_wr_completion_workers; @@ -1724,7 +1729,9 @@ struct btrfs_root { int ref_cows; int track_dirty; int in_radix; - +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + int dummy_root; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -2461,8 +2468,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr) sizeof(struct btrfs_item) * nr; } -static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, - int nr) +static inline struct btrfs_item *btrfs_item_nr(int nr) { return (struct btrfs_item *)btrfs_item_nr_offset(nr); } @@ -2475,30 +2481,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb, static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_end(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_offset(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_size(eb, btrfs_item_nr(nr)); } static inline void btrfs_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(eb, nr); + struct btrfs_item *item = btrfs_item_nr(nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(eb, nr); + struct btrfs_item *item = btrfs_item_nr(nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } @@ -2666,7 +2672,7 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_fsid(void) { return offsetof(struct btrfs_header, fsid); } @@ -3308,7 +3314,6 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, struct btrfs_path *path, u64 min_trans); enum btrfs_compare_tree_result { @@ -3675,8 +3680,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, - int delay_iput); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, @@ -3944,9 +3948,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_root *root); -void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); -void btrfs_scrub_continue_super(struct btrfs_root *root); int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); @@ -4028,5 +4030,9 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } +/* Sanity test specific functions */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode); +#endif #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index cbd9523ad09..8d292fbae65 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -108,8 +108,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) return node; } btrfs_inode->delayed_node = node; - atomic_inc(&node->refs); /* can be accessed */ - atomic_inc(&node->refs); /* cached in the inode */ + /* can be accessed and cached in the inode */ + atomic_add(2, &node->refs); spin_unlock(&root->inode_lock); return node; } @@ -138,8 +138,8 @@ again: return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - atomic_inc(&node->refs); /* cached in the btrfs inode */ - atomic_inc(&node->refs); /* can be accessed */ + /* cached in the btrfs inode and can be accessed */ + atomic_add(2, &node->refs); ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) { @@ -649,14 +649,13 @@ static int btrfs_delayed_inode_reserve_metadata( goto out; ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) + if (!WARN_ON(ret)) goto out; /* * Ok this is a problem, let's just steal from the global rsv * since this really shouldn't happen that often. */ - WARN_ON(1); ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, dst_rsv, num_bytes); goto out; @@ -771,13 +770,13 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, */ btrfs_set_path_blocking(path); - keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS); + keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); if (!keys) { ret = -ENOMEM; goto out; } - data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS); + data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS); if (!data_size) { ret = -ENOMEM; goto error; @@ -1174,8 +1173,10 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, mutex_unlock(&delayed_node->mutex); path = btrfs_alloc_path(); - if (!path) + if (!path) { + btrfs_release_delayed_node(delayed_node); return -ENOMEM; + } path->leave_spinning = 1; block_rsv = trans->block_rsv; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9efb94e9585..342f9fd411e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -26,7 +26,6 @@ #include <linux/kthread.h> #include <linux/math64.h> #include <asm/div64.h> -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -38,7 +37,6 @@ #include "rcu-string.h" #include "dev-replace.h" -static u64 btrfs_get_seconds_since_1970(void); static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); static void btrfs_dev_replace_update_device_in_mapping_tree( @@ -296,13 +294,6 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -static u64 btrfs_get_seconds_since_1970(void) -{ - struct timespec t = CURRENT_TIME_SEC; - - return t.tv_sec; -} - int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args) { @@ -390,7 +381,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, * go to the tgtdev as well (refer to btrfs_map_block()). */ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; - dev_replace->time_started = btrfs_get_seconds_since_1970(); + dev_replace->time_started = get_seconds(); dev_replace->cursor_left = 0; dev_replace->committed_cursor_left = 0; dev_replace->cursor_left_last_write_of_item = 0; @@ -400,7 +391,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -470,12 +461,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, -1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -493,7 +484,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; if (scrub_ret) { @@ -650,6 +641,9 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) u64 result; int ret; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { @@ -668,7 +662,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) break; } dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; btrfs_dev_replace_unlock(dev_replace); btrfs_scrub_cancel(fs_info); @@ -703,7 +697,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; pr_info("btrfs: suspending dev_replace for unmount\n"); break; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 79e594e341c..c031ea3fd70 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -58,7 +58,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); BUG_ON(data_size > btrfs_item_size(leaf, item)); ptr += btrfs_item_size(leaf, item) - data_size; @@ -474,8 +474,10 @@ int verify_dir_item(struct btrfs_root *root, } /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ - if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { - printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", + if ((btrfs_dir_data_len(leaf, dir_item) + + btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { + printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n", + (unsigned)btrfs_dir_name_len(leaf, dir_item), (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62176ad8984..4c4ed0bb3da 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -33,7 +33,6 @@ #include <linux/uuid.h> #include <linux/semaphore.h> #include <asm/unaligned.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -64,7 +63,6 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -477,14 +475,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) if (page != eb->pages[0]) return 0; found_start = btrfs_header_bytenr(eb); - if (found_start != start) { - WARN_ON(1); + if (WARN_ON(found_start != start || !PageUptodate(page))) return 0; - } - if (!PageUptodate(page)) { - WARN_ON(1); - return 0; - } csum_tree_block(root, eb, 0); return 0; } @@ -496,7 +488,7 @@ static int check_tree_block_fsid(struct btrfs_root *root, u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { ret = 0; @@ -1105,8 +1097,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, { struct inode *btree_inode = root->fs_info->btree_inode; struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr); return eb; } @@ -1229,14 +1220,18 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->refs, 1); root->log_transid = 0; root->last_log_commit = 0; - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping); + if (fs_info) + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); memset(&root->root_kobj, 0, sizeof(root->root_kobj)); - root->defrag_trans_start = fs_info->generation; + if (fs_info) + root->defrag_trans_start = fs_info->generation; + else + root->defrag_trans_start = 0; init_completion(&root->kobj_unregister); root->defrag_running = 0; root->root_key.objectid = objectid; @@ -1253,6 +1248,22 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) return root; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +/* Should only be used by the testing infrastructure */ +struct btrfs_root *btrfs_alloc_dummy_root(void) +{ + struct btrfs_root *root; + + root = btrfs_alloc_root(NULL); + if (!root) + return ERR_PTR(-ENOMEM); + __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); + root->dummy_root = 1; + + return root; +} +#endif + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 objectid) @@ -1292,7 +1303,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_header_owner(leaf, objectid); root->node = leaf; - write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf), + write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(leaf, fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(leaf), @@ -1379,7 +1390,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->node = leaf; write_extent_buffer(root->node, root->fs_info->fsid, - btrfs_header_fsid(root->node), BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; @@ -1780,6 +1791,9 @@ sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); + if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, + &root->fs_info->fs_state))) + btrfs_cleanup_transaction(root); if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && @@ -2013,50 +2027,28 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_stop_workers(&fs_info->qgroup_rescan_workers); } +static void free_root_extent_buffers(struct btrfs_root *root) +{ + if (root) { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + root->node = NULL; + root->commit_root = NULL; + } +} + /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) { - free_extent_buffer(info->tree_root->node); - free_extent_buffer(info->tree_root->commit_root); - info->tree_root->node = NULL; - info->tree_root->commit_root = NULL; - - if (info->dev_root) { - free_extent_buffer(info->dev_root->node); - free_extent_buffer(info->dev_root->commit_root); - info->dev_root->node = NULL; - info->dev_root->commit_root = NULL; - } - if (info->extent_root) { - free_extent_buffer(info->extent_root->node); - free_extent_buffer(info->extent_root->commit_root); - info->extent_root->node = NULL; - info->extent_root->commit_root = NULL; - } - if (info->csum_root) { - free_extent_buffer(info->csum_root->node); - free_extent_buffer(info->csum_root->commit_root); - info->csum_root->node = NULL; - info->csum_root->commit_root = NULL; - } - if (info->quota_root) { - free_extent_buffer(info->quota_root->node); - free_extent_buffer(info->quota_root->commit_root); - info->quota_root->node = NULL; - info->quota_root->commit_root = NULL; - } - if (info->uuid_root) { - free_extent_buffer(info->uuid_root->node); - free_extent_buffer(info->uuid_root->commit_root); - info->uuid_root->node = NULL; - info->uuid_root->commit_root = NULL; - } - if (chunk_root) { - free_extent_buffer(info->chunk_root->node); - free_extent_buffer(info->chunk_root->commit_root); - info->chunk_root->node = NULL; - info->chunk_root->commit_root = NULL; - } + free_root_extent_buffers(info->tree_root); + + free_root_extent_buffers(info->dev_root); + free_root_extent_buffers(info->extent_root); + free_root_extent_buffers(info->csum_root); + free_root_extent_buffers(info->quota_root); + free_root_extent_buffers(info->uuid_root); + if (chunk_root) + free_root_extent_buffers(info->chunk_root); } static void del_fs_roots(struct btrfs_fs_info *fs_info) @@ -2230,7 +2222,6 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); init_waitqueue_head(&fs_info->scrub_pause_wait); - init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; @@ -2272,7 +2263,7 @@ int open_ctree(struct super_block *sb, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(fs_info->btree_inode)->runtime_flags); - insert_inode_hash(fs_info->btree_inode); + btrfs_insert_inode_hash(fs_info->btree_inode); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2670,6 +2661,7 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); + btrfs_set_root_refs(&tree_root->root_item, 1); location.objectid = BTRFS_EXTENT_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; @@ -3448,10 +3440,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors) { - int ret; - - ret = write_all_supers(root, max_mirrors); - return ret; + return write_all_supers(root, max_mirrors); } /* Drop a fs root from the radix tree and free it. */ @@ -3614,12 +3603,12 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } + del_fs_roots(fs_info); + btrfs_free_block_groups(fs_info); btrfs_stop_all_workers(fs_info); - del_fs_roots(fs_info); - free_root_pointers(fs_info, 1); iput(fs_info->btree_inode); @@ -3669,10 +3658,20 @@ int btrfs_set_buffer_uptodate(struct extent_buffer *buf) void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + struct btrfs_root *root; u64 transid = btrfs_header_generation(buf); int was_dirty; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + /* + * This is a fast path so only do this check if we have sanity tests + * enabled. Normal people shouldn't be marking dummy buffers as dirty + * outside of the sanity tests. + */ + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags))) + return; +#endif + root = BTRFS_I(buf->pages[0]->mapping->host)->root; btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " @@ -3802,7 +3801,8 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); - list_del_init(&root->ordered_root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); btrfs_destroy_ordered_extents(root); @@ -3880,24 +3880,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) -{ - struct btrfs_pending_snapshot *snapshot; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - list_splice_init(&t->pending_snapshots, &splice); - - while (!list_empty(&splice)) { - snapshot = list_entry(splice.next, - struct btrfs_pending_snapshot, - list); - snapshot->error = -ECANCELED; - list_del_init(&snapshot->list); - } -} - static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; @@ -4027,15 +4009,13 @@ again: void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { + btrfs_destroy_ordered_operations(cur_trans, root); + btrfs_destroy_delayed_refs(cur_trans, root); - btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, - cur_trans->dirty_pages.dirty_bytes); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); - btrfs_evict_pending_snapshots(cur_trans); - cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); @@ -4059,63 +4039,51 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, static int btrfs_cleanup_transaction(struct btrfs_root *root) { struct btrfs_transaction *t; - LIST_HEAD(list); mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); - list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); - - while (!list_empty(&list)) { - t = list_entry(list.next, struct btrfs_transaction, list); - - btrfs_destroy_ordered_operations(t, root); - - btrfs_destroy_all_ordered_extents(root->fs_info); - - btrfs_destroy_delayed_refs(t, root); - - /* - * FIXME: cleanup wait for commit - * We needn't acquire the lock here, because we are during - * the umount, there is no other task which will change it. - */ - t->state = TRANS_STATE_COMMIT_START; - smp_mb(); - if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) - wake_up(&root->fs_info->transaction_blocked_wait); - - btrfs_evict_pending_snapshots(t); - - t->state = TRANS_STATE_UNBLOCKED; - smp_mb(); - if (waitqueue_active(&root->fs_info->transaction_wait)) - wake_up(&root->fs_info->transaction_wait); - - btrfs_destroy_delayed_inodes(root); - btrfs_assert_delayed_root_empty(root); - - btrfs_destroy_all_delalloc_inodes(root->fs_info); - - btrfs_destroy_marked_extents(root, &t->dirty_pages, - EXTENT_DIRTY); - - btrfs_destroy_pinned_extent(root, - root->fs_info->pinned_extents); - - t->state = TRANS_STATE_COMPLETED; - smp_mb(); - if (waitqueue_active(&t->commit_wait)) - wake_up(&t->commit_wait); + while (!list_empty(&root->fs_info->trans_list)) { + t = list_first_entry(&root->fs_info->trans_list, + struct btrfs_transaction, list); + if (t->state >= TRANS_STATE_COMMIT_START) { + atomic_inc(&t->use_count); + spin_unlock(&root->fs_info->trans_lock); + btrfs_wait_for_commit(root, t->transid); + btrfs_put_transaction(t); + spin_lock(&root->fs_info->trans_lock); + continue; + } + if (t == root->fs_info->running_transaction) { + t->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + /* + * We wait for 0 num_writers since we don't hold a trans + * handle open currently for this transaction. + */ + wait_event(t->writer_wait, + atomic_read(&t->num_writers) == 0); + } else { + spin_unlock(&root->fs_info->trans_lock); + } + btrfs_cleanup_one_transaction(t, root); - atomic_set(&t->use_count, 0); + spin_lock(&root->fs_info->trans_lock); + if (t == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; list_del_init(&t->list); - memset(t, 0, sizeof(*t)); - kmem_cache_free(btrfs_transaction_cachep, t); - } + spin_unlock(&root->fs_info->trans_lock); + btrfs_put_transaction(t); + trace_btrfs_transaction_commit(root); + spin_lock(&root->fs_info->trans_lock); + } + spin_unlock(&root->fs_info->trans_lock); + btrfs_destroy_all_ordered_extents(root->fs_info); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + btrfs_destroy_all_delalloc_inodes(root->fs_info); mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5ce2a7da8b1..53059df350f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,6 +86,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_root *btrfs_alloc_dummy_root(void); +#endif + /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 4b869160737..41422a3de8e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" -#include "compat.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d58bef130a4..45d98d01028 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -25,7 +25,6 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> -#include "compat.h" #include "hash.h" #include "ctree.h" #include "disk-io.h" @@ -1551,9 +1550,8 @@ again: if (ret && !insert) { err = -ENOENT; goto out; - } else if (ret) { + } else if (WARN_ON(ret)) { err = -EIO; - WARN_ON(1); goto out; } @@ -1979,7 +1977,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_extent_item *item; u64 refs; int ret; - int err = 0; path = btrfs_alloc_path(); if (!path) @@ -1992,14 +1989,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path, bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret == 0) + if (ret != -EAGAIN) goto out; - if (ret != -EAGAIN) { - err = ret; - goto out; - } - leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); @@ -2021,7 +2013,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, @@ -2137,15 +2129,28 @@ again: } if (ret > 0) { if (metadata) { - btrfs_release_path(path); - metadata = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == node->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == node->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; - key.offset = node->num_bytes; - key.type = BTRFS_EXTENT_ITEM_KEY; - goto again; + key.objectid = node->bytenr; + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EIO; + goto out; } - err = -EIO; - goto out; } leaf = path->nodes[0]; @@ -2234,8 +2239,12 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, { int ret = 0; - if (trans->aborted) + if (trans->aborted) { + if (insert_reserved) + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); return 0; + } if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; @@ -2411,6 +2420,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { + /* + * Need to reset must_insert_reserved if + * there was an error so the abort stuff + * can cleanup the reserved space + * properly. + */ + if (must_insert_reserved) + locked_ref->must_insert_reserved = 1; btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); @@ -3197,8 +3214,7 @@ again: if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, path, - inode); + ret = btrfs_truncate_free_space_cache(root, trans, inode); if (ret) goto out_put; } @@ -3318,10 +3334,9 @@ again: last = cache->key.objectid + cache->key.offset; err = write_one_cache_group(trans, root, path, cache); + btrfs_put_block_group(cache); if (err) /* File system offline */ goto out; - - btrfs_put_block_group(cache); } while (1) { @@ -3605,10 +3620,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) /* make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, root->sectorsize); - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { - alloc_chunk = 0; + if (btrfs_is_free_space_inode(inode)) { committed = 1; + ASSERT(current->journal_info); } data_sinfo = fs_info->data_sinfo; @@ -3636,6 +3650,16 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3681,6 +3705,9 @@ commit_trans: goto again; } + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -3989,12 +4016,26 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_all_delalloc_inodes(root->fs_info, 0); + btrfs_start_delalloc_roots(root->fs_info, 0); if (!current->journal_info) - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, -1); } } +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) +{ + u64 bytes; + int nr; + + bytes = btrfs_calc_trans_metadata_size(root, 1); + nr = (int)div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM (256 * 1024) + /* * shrink metadata reservation for delalloc */ @@ -4007,24 +4048,30 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, u64 delalloc_bytes; u64 max_reclaim; long time_left; - unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; - int loops = 0; + unsigned long nr_pages; + int loops; + int items; enum btrfs_reserve_flush_enum flush; + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(root, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; - smp_mb(); delalloc_bytes = percpu_counter_sum_positive( &root->fs_info->delalloc_bytes); if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_all_ordered_extents(root->fs_info); + if (wait_ordered) + btrfs_wait_ordered_roots(root->fs_info, items); return; } + loops = 0; while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; @@ -4033,9 +4080,19 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, * We need to wait for the async pages to actually start before * we do anything. */ - wait_event(root->fs_info->async_submit_wait, - !atomic_read(&root->fs_info->async_delalloc_pages)); + max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); + if (!max_reclaim) + goto skip_async; + + if (max_reclaim <= nr_pages) + max_reclaim = 0; + else + max_reclaim -= nr_pages; + wait_event(root->fs_info->async_submit_wait, + atomic_read(&root->fs_info->async_delalloc_pages) <= + (int)max_reclaim); +skip_async: if (!trans) flush = BTRFS_RESERVE_FLUSH_ALL; else @@ -4049,13 +4106,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_all_ordered_extents(root->fs_info); + btrfs_wait_ordered_roots(root->fs_info, items); } else { time_left = schedule_timeout_killable(1); if (time_left) break; } - smp_mb(); delalloc_bytes = percpu_counter_sum_positive( &root->fs_info->delalloc_bytes); } @@ -4140,16 +4196,11 @@ static int flush_space(struct btrfs_root *root, switch (state) { case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: - if (state == FLUSH_DELAYED_ITEMS_NR) { - u64 bytes = btrfs_calc_trans_metadata_size(root, 1); - - nr = (int)div64_u64(num_bytes, bytes); - if (!nr) - nr = 1; - nr *= 2; - } else { + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(root, num_bytes) * 2; + else nr = -1; - } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -4332,6 +4383,10 @@ out: !block_rsv_use_bytes(global_rsv, orig_bytes)) ret = 0; } + if (ret == -ENOSPC) + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + space_info->flags, orig_bytes, 1); if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; @@ -4986,7 +5041,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) - trace_btrfs_space_reservation(root->fs_info,"delalloc", + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); @@ -5264,6 +5319,8 @@ static int pin_down_extent(struct btrfs_root *root, set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + if (reserved) + trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -5718,9 +5775,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } extent_slot = path->slots[0]; } - } else if (ret == -ENOENT) { + } else if (WARN_ON(ret == -ENOENT)) { btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); btrfs_err(info, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, @@ -5967,6 +6023,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } out: @@ -6594,8 +6651,6 @@ again: } } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); - return ret; } @@ -6707,6 +6762,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ins->objectid, ins->offset); BUG(); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); return ret; } @@ -6731,13 +6787,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, size += sizeof(*block_info); path = btrfs_alloc_path(); - if (!path) + if (!path) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); return -ENOMEM; + } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); btrfs_free_path(path); return ret; } @@ -6779,6 +6840,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ins->objectid, ins->offset); BUG(); } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); return ret; } @@ -7983,7 +8046,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) spin_lock(&sinfo->lock); - for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) if (!list_empty(&sinfo->block_groups[i])) free_bytes += __btrfs_get_ro_block_group_free_space( &sinfo->block_groups[i]); @@ -8271,15 +8334,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) release_global_block_rsv(info); - while(!list_empty(&info->space_info)) { + while (!list_empty(&info->space_info)) { space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { - if (space_info->bytes_pinned > 0 || + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); + space_info->bytes_may_use > 0)) { dump_space_info(space_info, 0, 0); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 51731b76900..856bc2b2192 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,13 +13,13 @@ #include <linux/cleancache.h> #include "extent_io.h" #include "extent_map.h" -#include "compat.h" #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" #include "locking.h" #include "rcu-string.h" +#include "backref.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1597,11 +1597,10 @@ done: * * 1 is returned if we find something, 0 if nothing was in the tree */ -static noinline u64 find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, - u64 *start, u64 *end, - u64 max_bytes) +STATIC u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) { u64 delalloc_start; u64 delalloc_end; @@ -1740,10 +1739,8 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 last = 0; int found = 0; - if (search_end <= cur_start) { - WARN_ON(1); + if (WARN_ON(search_end <= cur_start)) return 0; - } spin_lock(&tree->lock); if (cur_start == 0 && bits == EXTENT_DIRTY) { @@ -3569,9 +3566,8 @@ retry: * but no sense in crashing the users box for something * we can survive anyway. */ - if (!eb) { + if (WARN_ON(!eb)) { spin_unlock(&mapping->private_lock); - WARN_ON(1); continue; } @@ -4038,7 +4034,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (offset >= last) return NULL; - while(1) { + while (1) { len = last - offset; if (len == 0) break; @@ -4062,6 +4058,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx) +{ + unsigned long cnt = *((unsigned long *)ctx); + + cnt++; + *((unsigned long *)ctx) = cnt; + + /* Now we're sure that the extent is shared. */ + if (cnt > 1) + return 1; + return 0; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4128,7 +4137,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last = found_key.offset; last_for_get_extent = last + 1; } - btrfs_free_path(path); + btrfs_release_path(path); /* * we might have some extents allocated but more delalloc past those @@ -4198,7 +4207,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { + unsigned long ref_cnt = 0; + disko = em->block_start + offset_in_extent; + + /* + * As btrfs supports shared space, this information + * can be exported to userspace tools via + * flag FIEMAP_EXTENT_SHARED. + */ + ret = iterate_inodes_from_logical( + em->block_start, + BTRFS_I(inode)->root->fs_info, + path, count_ext_ref, &ref_cnt); + if (ret < 0 && ret != -ENOENT) + goto out_free; + + if (ref_cnt > 1) + flags |= FIEMAP_EXTENT_SHARED; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; @@ -4230,6 +4256,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: + btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; @@ -4455,6 +4482,23 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) } } +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_extent_buffer_accessed(eb); + return eb; + } + rcu_read_unlock(); + + return NULL; +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len) { @@ -4468,14 +4512,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, int uptodate = 1; int ret; - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + + eb = find_extent_buffer(tree, start); + if (eb) return eb; - } - rcu_read_unlock(); eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); if (!eb) @@ -4534,24 +4574,17 @@ again: spin_lock(&tree->buffer_lock); ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); if (ret == -EEXIST) { - exists = radix_tree_lookup(&tree->buffer, - start >> PAGE_CACHE_SHIFT); - if (!atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - exists = NULL; + exists = find_extent_buffer(tree, start); + if (exists) + goto free_eb; + else goto again; - } - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - mark_extent_buffer_accessed(exists); - goto free_eb; } /* add one reference for the tree */ check_buffer_tree_ref(eb); - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); /* * there is a race where release page may have @@ -4582,23 +4615,6 @@ free_eb: return exists; } -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len) -{ - struct extent_buffer *eb; - - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_extent_buffer_accessed(eb); - return eb; - } - rcu_read_unlock(); - - return NULL; -} - static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) { struct extent_buffer *eb = @@ -5062,23 +5078,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -static void move_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - if (dst_page == src_page) { - memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); - } else { - char *src_kaddr = page_address(src_page); - char *p = dst_kaddr + dst_off + len; - char *s = src_kaddr + src_off + len; - - while (len--) - *--p = *--s; - } -} - static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { unsigned long distance = (src > dst) ? src - dst : dst - src; @@ -5189,7 +5188,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), + copy_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6dbc645f1f3..19620c58f09 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -271,7 +271,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len); + u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 @@ -345,4 +345,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, int end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes); +#endif #endif diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 61adc44b780..93fba716d7f 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,10 +3,10 @@ #include <linux/rbtree.h> -#define EXTENT_MAP_LAST_BYTE (u64)-4 -#define EXTENT_MAP_HOLE (u64)-3 -#define EXTENT_MAP_INLINE (u64)-2 -#define EXTENT_MAP_DELALLOC (u64)-1 +#define EXTENT_MAP_LAST_BYTE ((u64)-4) +#define EXTENT_MAP_HOLE ((u64)-3) +#define EXTENT_MAP_INLINE ((u64)-2) +#define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 4f53159bdb9..6f384886028 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -329,6 +329,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, u64 csum_end; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + ASSERT(start == ALIGN(start, root->sectorsize) && + (end + 1) == ALIGN(end + 1, root->sectorsize)); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -846,10 +849,8 @@ insert: path->leave_spinning = 0; if (ret < 0) goto fail_unlock; - if (ret != 0) { - WARN_ON(1); + if (WARN_ON(ret != 0)) goto fail_unlock; - } leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 72da4df53c9..82d0342763c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -39,7 +39,6 @@ #include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "compat.h" #include "volumes.h" static struct kmem_cache *btrfs_inode_defrag_cachep; @@ -370,7 +369,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); - while(1) { + while (1) { /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) @@ -1281,6 +1280,7 @@ again: } wait_on_page_writeback(pages[i]); } + faili = num_pages - 1; err = 0; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; @@ -1299,8 +1299,10 @@ again: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); + err = btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + if (err) + goto fail; goto again; } if (ordered) @@ -1809,8 +1811,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (full_sync) - btrfs_wait_ordered_range(inode, start, end - start + 1); + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } atomic_inc(&root->log_batch); /* @@ -1876,27 +1883,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) mutex_unlock(&inode->i_mutex); if (ret != BTRFS_NO_LOG_SYNC) { - if (ret > 0) { - /* - * If we didn't already wait for ordered extents we need - * to do that now. - */ - if (!full_sync) - btrfs_wait_ordered_range(inode, start, - end - start + 1); - ret = btrfs_commit_transaction(trans, root); - } else { + if (!ret) { ret = btrfs_sync_log(trans, root); - if (ret == 0) { + if (!ret) { ret = btrfs_end_transaction(trans, root); - } else { - if (!full_sync) - btrfs_wait_ordered_range(inode, start, - end - - start + 1); - ret = btrfs_commit_transaction(trans, root); + goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, + end - start + 1); + if (ret) + goto out; + } + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_end_transaction(trans, root); } @@ -2067,7 +2067,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); - btrfs_wait_ordered_range(inode, offset, len); + ret = btrfs_wait_ordered_range(inode, offset, len); + if (ret) + return ret; mutex_lock(&inode->i_mutex); /* @@ -2136,8 +2138,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); - btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } path = btrfs_alloc_path(); @@ -2308,7 +2314,10 @@ static long btrfs_fallocate(struct file *file, int mode, * wait for ordered IO before we have any locks. We'll loop again * below with the locks held. */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; locked_end = alloc_end - 1; while (1) { @@ -2332,8 +2341,10 @@ static long btrfs_fallocate(struct file *file, int mode, * we can't wait on the range with the transaction * running or with the extent lock held */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; } else { if (ordered) btrfs_put_ordered_extent(ordered); @@ -2405,14 +2416,12 @@ out_reserve_fail: static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 lockstart = *offset; u64 lockend = i_size_read(inode); u64 start = *offset; - u64 orig_start = *offset; u64 len = i_size_read(inode); - u64 last_end = 0; int ret = 0; lockend = max_t(u64, root->sectorsize, lockend); @@ -2429,89 +2438,35 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, &cached_state); - /* - * Delalloc is such a pain. If we have a hole and we have pending - * delalloc for a portion of the hole we will get back a hole that - * exists for the entire range since it hasn't been actually written - * yet. So to take care of this case we need to look for an extent just - * before the position we want in case there is outstanding delalloc - * going on here. - */ - if (whence == SEEK_HOLE && start != 0) { - if (start <= root->sectorsize) - em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, - root->sectorsize, 0); - else - em = btrfs_get_extent_fiemap(inode, NULL, 0, - start - root->sectorsize, - root->sectorsize, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - last_end = em->start + em->len; - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - free_extent_map(em); - } - - while (1) { + while (start < inode->i_size) { em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); + em = NULL; break; } - if (em->block_start == EXTENT_MAP_HOLE) { - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - if (last_end <= orig_start) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - if (whence == SEEK_HOLE) { - *offset = start; - free_extent_map(em); - break; - } - } else { - if (whence == SEEK_DATA) { - if (em->block_start == EXTENT_MAP_DELALLOC) { - if (start >= inode->i_size) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - if (!test_bit(EXTENT_FLAG_PREALLOC, - &em->flags)) { - *offset = start; - free_extent_map(em); - break; - } - } - } + if (whence == SEEK_HOLE && + (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; + else if (whence == SEEK_DATA && + (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; start = em->start + em->len; - last_end = em->start + em->len; - - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - free_extent_map(em); - ret = -ENXIO; - break; - } free_extent_map(em); + em = NULL; cond_resched(); } - if (!ret) - *offset = min(*offset, inode->i_size); -out: + free_extent_map(em); + if (!ret) { + if (whence == SEEK_DATA && start >= inode->i_size) + ret = -ENXIO; + else + *offset = min_t(loff_t, start, inode->i_size); + } unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); return ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b4f9904c4c6..057be95b1e1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -218,7 +218,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path, struct inode *inode) { int ret = 0; @@ -1009,8 +1008,13 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out; - - btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto out; + } key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -2276,7 +2280,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, goto out; entry = rb_entry(node, struct btrfs_free_space, offset_index); - while(1) { + while (1) { if (entry->bytes < bytes && entry->bytes > *max_extent_size) *max_extent_size = entry->bytes; @@ -2967,19 +2971,15 @@ out: int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path) + struct btrfs_path *path, + struct inode *inode) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct inode *inode; int ret; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); if (ret) { btrfs_delalloc_release_metadata(inode, inode->i_size); @@ -2990,7 +2990,6 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, #endif } - iput(inode); return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index e737f92cf6d..0cf4977ef70 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -58,7 +58,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); @@ -76,7 +75,8 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root); int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_path *path); + struct btrfs_path *path, + struct inode *inode); void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index e0b7034d634..ec82fae0709 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -369,7 +369,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, goto out; leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); ptr += btrfs_item_size(leaf, item) - ins_len; extref = (struct btrfs_inode_extref *)ptr; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2c66ddbbe67..ab485e57b6f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -78,10 +78,8 @@ again: btrfs_transaction_in_commit(fs_info)) { leaf = path->nodes[0]; - if (btrfs_header_nritems(leaf) == 0) { - WARN_ON(1); + if (WARN_ON(btrfs_header_nritems(leaf) == 0)) break; - } /* * Save the key so we can advances forward @@ -237,7 +235,7 @@ again: start_caching(root); if (objectid <= root->cache_progress || - objectid > root->highest_objectid) + objectid >= root->highest_objectid) __btrfs_add_free_space(ctl, objectid, 1); else __btrfs_add_free_space(pinned, objectid, 1); @@ -412,8 +410,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, return 0; /* Don't save inode cache if we are deleting this root */ - if (btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) + if (btrfs_root_refs(&root->root_item) == 0) return 0; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) @@ -467,7 +464,7 @@ again: } if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + ret = btrfs_truncate_free_space_cache(root, trans, inode); if (ret) { if (ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); @@ -504,7 +501,7 @@ again: } btrfs_free_reserved_data_space(inode, prealloc); - ret = btrfs_write_out_ino_cache(root, trans, path); + ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 51e3afa7835..da8d2f696ac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,7 +43,6 @@ #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -844,7 +843,10 @@ static noinline int cow_file_range(struct inode *inode, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; - BUG_ON(btrfs_is_free_space_inode(inode)); + if (btrfs_is_free_space_inode(inode)) { + WARN_ON_ONCE(1); + return -EINVAL; + } num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); @@ -1178,10 +1180,8 @@ static noinline int run_delalloc_nocow(struct inode *inode, while (1) { ret = btrfs_lookup_file_extent(trans, root, path, ino, cur_offset, 0); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + if (ret < 0) goto error; - } if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1195,10 +1195,8 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + if (ret < 0) goto error; - } if (ret > 0) break; leaf = path->nodes[0]; @@ -1289,10 +1287,8 @@ out_check: ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret) goto error; - } cow_start = (u64)-1; } @@ -1339,10 +1335,8 @@ out_check: BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret) goto error; - } } extent_clear_unlock_delalloc(inode, cur_offset, @@ -1364,10 +1358,8 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret) goto error; - } } error: @@ -1551,7 +1543,13 @@ static void btrfs_clear_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } - if (*bits & EXTENT_DO_ACCOUNTING) + /* + * We don't reserve metadata space for space cache inodes so we + * don't need to call dellalloc_release_metadata if there is an + * error. + */ + if (*bits & EXTENT_DO_ACCOUNTING && + root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID @@ -2041,10 +2039,8 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - WARN_ON(1); + if (WARN_ON(ret < 0)) return ret; - } ret = 0; while (1) { @@ -2367,10 +2363,23 @@ out_unlock: return ret; } +static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) +{ + struct old_sa_defrag_extent *old, *tmp; + + if (!new) + return; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } + kfree(new); +} + static void relink_file_extents(struct new_sa_defrag_extent *new) { struct btrfs_path *path; - struct old_sa_defrag_extent *old, *tmp; struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; struct inode *inode; @@ -2413,16 +2422,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) kfree(prev); btrfs_free_path(path); - - list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); - kfree(old); - } out: + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); wake_up(&root->fs_info->transaction_wait); - - kfree(new); } static struct new_sa_defrag_extent * @@ -2432,7 +2436,7 @@ record_old_file_extents(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct btrfs_key key; - struct old_sa_defrag_extent *old, *tmp; + struct old_sa_defrag_extent *old; struct new_sa_defrag_extent *new; int ret; @@ -2480,7 +2484,7 @@ record_old_file_extents(struct inode *inode, if (slot >= btrfs_header_nritems(l)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out_free_list; + goto out_free_path; else if (ret > 0) break; continue; @@ -2509,7 +2513,7 @@ record_old_file_extents(struct inode *inode, old = kmalloc(sizeof(*old), GFP_NOFS); if (!old) - goto out_free_list; + goto out_free_path; offset = max(new->file_pos, key.offset); end = min(new->file_pos + new->len, key.offset + num_bytes); @@ -2531,15 +2535,10 @@ next: return new; -out_free_list: - list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); - kfree(old); - } out_free_path: btrfs_free_path(path); out_kfree: - kfree(new); + free_sa_defrag_extent(new); return NULL; } @@ -2710,8 +2709,14 @@ out: btrfs_remove_ordered_extent(inode, ordered_extent); /* for snapshot-aware defrag */ - if (new) - relink_file_extents(new); + if (new) { + if (ret) { + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); + } else { + relink_file_extents(new); + } + } /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -2969,6 +2974,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret) { + atomic_dec(&root->orphan_inodes); if (reserve) { clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &BTRFS_I(inode)->runtime_flags); @@ -3018,14 +3024,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, release_rsv = 1; spin_unlock(&root->orphan_lock); - if (trans && delete_item) - ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - - if (release_rsv) { - btrfs_orphan_release_metadata(inode); + if (delete_item) { atomic_dec(&root->orphan_inodes); + if (trans) + ret = btrfs_del_orphan_item(trans, root, + btrfs_ino(inode)); } + if (release_rsv) + btrfs_orphan_release_metadata(inode); + return ret; } @@ -3172,8 +3180,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { - if (!S_ISREG(inode->i_mode)) { - WARN_ON(1); + if (WARN_ON(!S_ISREG(inode->i_mode))) { iput(inode); continue; } @@ -3636,7 +3643,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int ret; ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) { - btrfs_drop_nlink(inode); + drop_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } return ret; @@ -4230,15 +4237,16 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { struct btrfs_ordered_extent *ordered; - btrfs_wait_ordered_range(inode, hole_start, - block_end - hole_start); + lock_extent_bits(io_tree, hole_start, block_end - 1, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, hole_start); + ordered = btrfs_lookup_ordered_range(inode, hole_start, + block_end - hole_start); if (!ordered) break; unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -4472,8 +4480,10 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); truncate_inode_pages(&inode->i_data, 0); - if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || - btrfs_is_free_space_inode(inode))) + if (inode->i_nlink && + ((btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_is_free_space_inode(inode))) goto no_delete; if (is_bad_inode(inode)) { @@ -4490,7 +4500,8 @@ void btrfs_evict_inode(struct inode *inode) } if (inode->i_nlink > 0) { - BUG_ON(btrfs_root_refs(&root->root_item) != 0); + BUG_ON(btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); goto no_delete; } @@ -4731,14 +4742,7 @@ static void inode_tree_del(struct inode *inode) } spin_unlock(&root->inode_lock); - /* - * Free space cache has inodes in the tree root, but the tree root has a - * root_refs of 0, so this could end up dropping the tree root as a - * snapshot, so we need the extra !root->fs_info->tree_root check to - * make sure we don't drop it. - */ - if (empty && btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) { + if (empty && btrfs_root_refs(&root->root_item) == 0) { synchronize_srcu(&root->fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); @@ -4831,10 +4835,12 @@ static struct inode *btrfs_iget_locked(struct super_block *s, { struct inode *inode; struct btrfs_iget_args args; + unsigned long hashval = btrfs_inode_hash(objectid, root); + args.ino = objectid; args.root = root; - inode = iget5_locked(s, objectid, btrfs_find_actor, + inode = iget5_locked(s, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; @@ -5048,7 +5054,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) continue; } - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) @@ -5454,7 +5460,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - insert_inode_hash(inode); + btrfs_insert_inode_hash(inode); inode_tree_add(inode); trace_btrfs_inode_new(inode); @@ -5730,7 +5736,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } - btrfs_inc_nlink(inode); + inc_nlink(inode); inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -5860,7 +5866,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, - btrfs_item_nr(leaf, path->slots[0])); + btrfs_item_nr(path->slots[0])); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; @@ -5974,7 +5980,14 @@ again: found_type = btrfs_key_type(&found_key); if (found_key.objectid != objectid || found_type != BTRFS_EXTENT_DATA_KEY) { - goto not_found; + /* + * If we backup past the first extent we want to move forward + * and see if there is an extent in front of us, otherwise we'll + * say there is a hole for our whole search range which can + * cause problems. + */ + extent_end = start; + goto next; } found_type = btrfs_file_extent_type(leaf, item); @@ -5989,7 +6002,7 @@ again: size = btrfs_file_extent_inline_len(leaf, item); extent_end = ALIGN(extent_start + size, root->sectorsize); } - +next: if (start >= extent_end) { path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -6249,7 +6262,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag /* adjust the range_start to make sure it doesn't * go backwards from the start they passed in */ - range_start = max(start,range_start); + range_start = max(start, range_start); found = found_end - range_start; if (found > 0) { @@ -7053,7 +7066,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } } else { submit_len += bvec->bv_len; - nr_pages ++; + nr_pages++; bvec++; } } @@ -7222,7 +7235,9 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, * outstanding dirty pages are on disk. */ count = iov_length(iov, nr_segs); - btrfs_wait_ordered_range(inode, offset, count); + ret = btrfs_wait_ordered_range(inode, offset, count); + if (ret) + return ret; if (rw & WRITE) { /* @@ -7563,7 +7578,10 @@ static int btrfs_truncate(struct inode *inode) u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + (u64)-1); + if (ret) + return ret; /* * Yes ladies and gentelment, this is indeed ugly. The fact is we have @@ -7787,6 +7805,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return inode; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode) +{ + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} +#endif + static void btrfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -7857,8 +7883,7 @@ int btrfs_drop_inode(struct inode *inode) return 1; /* the snap/subvol tree is on deleting */ - if (btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) + if (btrfs_root_refs(&root->root_item) == 0) return 1; else return generic_drop_inode(inode); @@ -7995,8 +8020,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ - if (!new_inode) { - WARN_ON(1); + if (WARN_ON(!new_inode)) { return ret; } } else { @@ -8144,18 +8168,24 @@ out_notrans: static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; + struct inode *inode; delalloc_work = container_of(work, struct btrfs_delalloc_work, work); - if (delalloc_work->wait) - btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); - else - filemap_flush(delalloc_work->inode->i_mapping); + inode = delalloc_work->inode; + if (delalloc_work->wait) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } if (delalloc_work->delay_iput) - btrfs_add_delayed_iput(delalloc_work->inode); + btrfs_add_delayed_iput(inode); else - iput(delalloc_work->inode); + iput(inode); complete(&delalloc_work->completion); } @@ -8276,8 +8306,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return ret; } -int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, - int delay_iput) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) { struct btrfs_root *root; struct list_head splice; @@ -8337,14 +8366,14 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, int err; int drop_inode = 0; u64 objectid; - u64 index = 0 ; + u64 index = 0; int name_len; int datasize; unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; - name_len = strlen(symname) + 1; + name_len = strlen(symname); if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; @@ -8432,7 +8461,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_symlink_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); - btrfs_i_size_write(inode, name_len - 1); + btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); if (err) drop_inode = 1; @@ -8491,6 +8520,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset); btrfs_abort_transaction(trans, root, ret); if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9d46f60cb94..1d04b5559e6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -44,7 +44,6 @@ #include <linux/uuid.h> #include <linux/btrfs.h> #include <linux/uaccess.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -369,9 +368,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) int btrfs_is_empty_uuid(u8 *uuid) { - static char empty_uuid[BTRFS_UUID_SIZE] = {0}; - - return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE); + BUILD_BUG_ON(BTRFS_UUID_SIZE > PAGE_SIZE); + return !memcmp(uuid, empty_zero_page, BTRFS_UUID_SIZE); } static noinline int create_subvol(struct inode *dir, @@ -436,7 +434,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); - write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf), + write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(leaf), @@ -574,7 +572,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) return ret; - btrfs_wait_ordered_extents(root); + btrfs_wait_ordered_extents(root, -1); pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) @@ -688,7 +686,7 @@ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) * nfs_async_unlink(). */ -static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) { int error; @@ -842,7 +840,6 @@ static int find_new_extents(struct btrfs_root *root, { struct btrfs_path *path; struct btrfs_key min_key; - struct btrfs_key max_key; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; int type; @@ -857,15 +854,10 @@ static int find_new_extents(struct btrfs_root *root, min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - max_key.objectid = ino; - max_key.type = (u8)-1; - max_key.offset = (u64)-1; - path->keep_locks = 1; - while(1) { - ret = btrfs_search_forward(root, &min_key, &max_key, - path, newer_than); + while (1) { + ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; if (min_key.objectid != ino) @@ -1206,7 +1198,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * max_cluster, + pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1893,7 +1885,6 @@ static noinline int search_ioctl(struct inode *inode, { struct btrfs_root *root; struct btrfs_key key; - struct btrfs_key max_key; struct btrfs_path *path; struct btrfs_ioctl_search_key *sk = &args->key; struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; @@ -1925,15 +1916,10 @@ static noinline int search_ioctl(struct inode *inode, key.type = sk->min_type; key.offset = sk->min_offset; - max_key.objectid = sk->max_objectid; - max_key.type = sk->max_type; - max_key.offset = sk->max_offset; - path->keep_locks = 1; - while(1) { - ret = btrfs_search_forward(root, &key, &max_key, path, - sk->min_transid); + while (1) { + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) ret = 0; @@ -2018,7 +2004,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - while(1) { + while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -2047,7 +2033,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, } *(ptr + len) = '/'; - read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); + read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); if (key.offset == BTRFS_FIRST_FREE_OBJECTID) break; @@ -2058,7 +2044,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, dirid = key.objectid; } memmove(name, ptr, total_len); - name[total_len]='\0'; + name[total_len] = '\0'; ret = 0; out: btrfs_free_path(path); @@ -2144,7 +2130,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, inode = dentry->d_inode; dest = BTRFS_I(inode)->root; - if (!capable(CAP_SYS_ADMIN)){ + if (!capable(CAP_SYS_ADMIN)) { /* * Regular user. Only allow this with a special mount * option, when the user has write+exec access to the @@ -2727,15 +2713,10 @@ static long btrfs_ioctl_file_extent_same(struct file *file, size = sizeof(tmp) + tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info); - same = kmalloc(size, GFP_NOFS); - if (!same) { - ret = -EFAULT; - goto out; - } + same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size); - if (copy_from_user(same, - (struct btrfs_ioctl_same_args __user *)argp, size)) { - ret = -EFAULT; + if (IS_ERR(same)) { + ret = PTR_ERR(same); goto out; } @@ -3679,9 +3660,10 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - + if (root->fs_info->sb->s_flags & MS_RDONLY) { + ret = -EROFS; + goto out; + } if (atomic_xchg( &root->fs_info->mutually_exclusive_operation_running, 1)) { @@ -3707,7 +3689,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) if (copy_to_user(arg, p, sizeof(*p))) ret = -EFAULT; - +out: kfree(p); return ret; } @@ -4557,9 +4539,15 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); - case BTRFS_IOC_SYNC: - btrfs_sync_fs(file->f_dentry->d_sb, 1); - return 0; + case BTRFS_IOC_SYNC: { + int ret; + + ret = btrfs_start_delalloc_roots(root->fs_info, 0); + if (ret) + return ret; + ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); + return ret; + } case BTRFS_IOC_START_SYNC: return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c702cb62f78..25a8f3812f1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -537,7 +537,9 @@ void btrfs_remove_ordered_extent(struct inode *inode, */ if (RB_EMPTY_ROOT(&tree->tree) && !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); } if (!root->nr_ordered_extents) { @@ -563,10 +565,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -void btrfs_wait_ordered_extents(struct btrfs_root *root) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) { struct list_head splice, works; struct btrfs_ordered_extent *ordered, *next; + int count = 0; INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); @@ -574,7 +577,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->ordered_extent_lock); list_splice_init(&root->ordered_extents, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); list_move_tail(&ordered->root_extent_list, @@ -589,7 +592,11 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root) cond_resched(); spin_lock(&root->ordered_extent_lock); + if (nr != -1) + nr--; + count++; } + list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); list_for_each_entry_safe(ordered, next, &works, work_list) { @@ -599,18 +606,21 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root) cond_resched(); } mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return count; } -void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info) +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_root *root; struct list_head splice; + int done; INIT_LIST_HEAD(&splice); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); root = btrfs_grab_fs_root(root); @@ -619,10 +629,14 @@ void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - btrfs_wait_ordered_extents(root); + done = btrfs_wait_ordered_extents(root, nr); btrfs_put_fs_root(root); spin_lock(&fs_info->ordered_root_lock); + if (nr != -1) { + nr -= done; + WARN_ON(nr < 0); + } } spin_unlock(&fs_info->ordered_root_lock); } @@ -734,8 +748,9 @@ void btrfs_start_ordered_extent(struct inode *inode, /* * Used to wait on ordered extents across a large range of bytes. */ -void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { + int ret = 0; u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; @@ -751,8 +766,9 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - + ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + if (ret) + return ret; /* * So with compression we will find and lock a dirty page and clear the * first one as dirty, setup an async extent, and immediately return @@ -768,10 +784,15 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) * right and you are wrong. */ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + orig_end); + if (ret) + return ret; + } + ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + if (ret) + return ret; end = orig_end; while (1) { @@ -788,11 +809,14 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) } btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + ret = -EIO; btrfs_put_ordered_extent(ordered); - if (end == 0 || end == start) + if (ret || end == 0 || end == start) break; end--; } + return ret; } /* @@ -1076,7 +1100,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * if this file hasn't been changed since the last transaction * commit, we can safely return without doing anything */ - if (last_mod < root->fs_info->last_trans_committed) + if (last_mod <= root->fs_info->last_trans_committed) return; spin_lock(&root->fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0c0b35612d7..9b0450f7ac2 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -180,7 +180,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); -void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, @@ -195,8 +195,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root); -void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0088bedc863..417053b1718 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -193,7 +193,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { - item = btrfs_item_nr(l, i); + item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); type = btrfs_key_type(&key); printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d0ecfbd9cc9..24ac21840a9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -33,7 +33,6 @@ #include <linux/raid/xor.h> #include <linux/vmalloc.h> #include <asm/div64.h> -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4a355726151..ce459a7cb16 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1383,6 +1383,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct reloc_control *rc = root->fs_info->reloc_ctl; + struct btrfs_block_rsv *rsv; int clear_rsv = 0; int ret; @@ -1396,13 +1397,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) return 0; - if (!trans->block_rsv) { + if (!trans->reloc_reserved) { + rsv = trans->block_rsv; trans->block_rsv = rc->block_rsv; clear_rsv = 1; } reloc_root = create_reloc_root(trans, root, root->root_key.objectid); if (clear_rsv) - trans->block_rsv = NULL; + trans->block_rsv = rsv; ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); @@ -1775,8 +1777,7 @@ again: new_ptr_gen = 0; } - if (new_bytenr > 0 && new_bytenr == old_bytenr) { - WARN_ON(1); + if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { ret = level; break; } @@ -2058,7 +2059,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, LIST_HEAD(inode_list); struct btrfs_key key; struct btrfs_key next_key; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; struct btrfs_path *path; @@ -2107,18 +2108,19 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, memset(&next_key, 0, sizeof(next_key)); while (1) { - trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, BTRFS_RESERVE_FLUSH_ALL); if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; + err = ret; + goto out; } + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + goto out; + } + trans->block_rsv = rc->block_rsv; replaced = 0; max_level = level; @@ -2164,6 +2166,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, root_item->drop_level = level; btrfs_end_transaction_throttle(trans, root); + trans = NULL; btrfs_btree_balance_dirty(root); @@ -2192,7 +2195,8 @@ out: btrfs_update_reloc_root(trans, root); } - btrfs_end_transaction_throttle(trans, root); + if (trans) + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root); @@ -3258,7 +3262,7 @@ static int add_tree_block(struct reloc_control *rc, struct rb_node *rb_node; u32 item_size; int level = -1; - int generation; + u64 generation; eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -3407,7 +3411,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct inode *inode, u64 ino) { struct btrfs_key key; - struct btrfs_path *path; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int ret = 0; @@ -3432,22 +3435,14 @@ truncate: if (ret) goto out; - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - btrfs_free_path(path); ret = PTR_ERR(trans); goto out; } - ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + ret = btrfs_truncate_free_space_cache(root, trans, inode); - btrfs_free_path(path); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); out: @@ -3549,10 +3544,8 @@ static int find_data_references(struct reloc_control *rc, err = ret; goto out; } - if (ret > 0) { - WARN_ON(1); + if (WARN_ON(ret > 0)) goto out; - } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -3572,11 +3565,9 @@ static int find_data_references(struct reloc_control *rc, } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) { - WARN_ON(1); + if (WARN_ON(key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY)) break; - } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -4001,16 +3992,6 @@ restart: } } - ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); - if (ret < 0) { - if (ret != -ENOSPC) { - err = ret; - WARN_ON(1); - break; - } - rc->commit_transaction = 1; - } - if (rc->commit_transaction) { rc->commit_transaction = 0; ret = btrfs_commit_transaction(trans, rc->extent_root); @@ -4241,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_all_delalloc_inodes(fs_info, 0); + ret = btrfs_start_delalloc_roots(fs_info, 0); if (ret < 0) { err = ret; goto out; } - btrfs_wait_all_ordered_extents(fs_info); + btrfs_wait_ordered_roots(fs_info, -1); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4264,7 +4245,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(rc->data_inode, 0, + (u64)-1); + if (ret) { + err = ret; + goto out; + } invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; @@ -4481,6 +4467,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 disk_bytenr; + u64 new_bytenr; LIST_HEAD(list); ordered = btrfs_lookup_ordered_extent(inode, file_pos); @@ -4492,13 +4479,24 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) if (ret) goto out; - disk_bytenr = ordered->start; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del_init(&sums->list); - sums->bytenr = disk_bytenr; - disk_bytenr += sums->len; + /* + * We need to offset the new_bytenr based on where the csum is. + * We need to do this because we will read in entire prealloc + * extents but we may have written to say the middle of the + * prealloc extent, so we need to make sure the csum goes with + * the right disk offset. + * + * We can do this because the data reloc inode refers strictly + * to the on disk bytes, so we don't have to worry about + * disk_len vs real len like with real inodes since it's all + * disk length. + */ + new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); + sums->bytenr = new_bytenr; btrfs_add_ordered_sum(inode, ordered, sums); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a18e0e23f6a..2544805544f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2717,8 +2717,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; btrfs_put_block_group(cache); if (ret) break; @@ -2732,6 +2730,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + key.offset = found_key.offset + length; btrfs_release_path(path); } @@ -2783,7 +2784,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, { int ret = 0; - mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, @@ -2813,21 +2813,17 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, } ++fs_info->scrub_workers_refcnt; out: - mutex_unlock(&fs_info->scrub_lock); - return ret; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { - mutex_lock(&fs_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) { btrfs_stop_workers(&fs_info->scrub_workers); btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); btrfs_stop_workers(&fs_info->scrub_nocow_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); - mutex_unlock(&fs_info->scrub_lock); } int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, @@ -2888,23 +2884,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - ret = scrub_workers_get(fs_info, is_dev_replace); - if (ret) - return ret; mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (dev->missing && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -ENODEV; } - mutex_lock(&fs_info->scrub_lock); + mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -EIO; } @@ -2915,10 +2906,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, btrfs_dev_replace_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -EINPROGRESS; } btrfs_dev_replace_unlock(&fs_info->dev_replace); + + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return ret; + } + sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); @@ -2931,13 +2929,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (!is_dev_replace) { - down_read(&fs_info->scrub_super_lock); + /* + * by holding device list mutex, we can + * kick off writing super in log tree sync. + */ ret = scrub_supers(sctx, dev); - up_read(&fs_info->scrub_super_lock); } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end, @@ -2954,10 +2954,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->scrub_lock); dev->scrub_device = NULL; + scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); scrub_free_ctx(sctx); - scrub_workers_put(fs_info); return ret; } @@ -2987,16 +2987,6 @@ void btrfs_scrub_continue(struct btrfs_root *root) wake_up(&fs_info->scrub_pause_wait); } -void btrfs_scrub_pause_super(struct btrfs_root *root) -{ - down_write(&root->fs_info->scrub_super_lock); -} - -void btrfs_scrub_continue_super(struct btrfs_root *root) -{ - up_write(&root->fs_info->scrub_super_lock); -} - int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e46e0ed7492..6837fe87f3a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -121,7 +121,6 @@ struct send_ctx { struct list_head name_cache_list; int name_cache_size; - struct file *cur_inode_filp; char *read_buf; }; @@ -565,10 +564,8 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) { struct btrfs_cmd_header *hdr; - if (!sctx->send_buf) { - WARN_ON(1); + if (WARN_ON(!sctx->send_buf)) return -EINVAL; - } BUG_ON(sctx->send_size); @@ -791,7 +788,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, if (found_key->type == BTRFS_INODE_REF_KEY) { ptr = (unsigned long)btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); total = btrfs_item_size(eb, item); elem_size = sizeof(*iref); } else { @@ -905,7 +902,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, eb = path->nodes[0]; slot = path->slots[0]; - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); cur = 0; len = 0; @@ -2120,77 +2117,6 @@ out: } /* - * Called for regular files when sending extents data. Opens a struct file - * to read from the file. - */ -static int open_cur_inode_file(struct send_ctx *sctx) -{ - int ret = 0; - struct btrfs_key key; - struct path path; - struct inode *inode; - struct dentry *dentry; - struct file *filp; - int new = 0; - - if (sctx->cur_inode_filp) - goto out; - - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root, - &new); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto out; - } - - dentry = d_obtain_alias(inode); - inode = NULL; - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - goto out; - } - - path.mnt = sctx->mnt; - path.dentry = dentry; - filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred()); - dput(dentry); - dentry = NULL; - if (IS_ERR(filp)) { - ret = PTR_ERR(filp); - goto out; - } - sctx->cur_inode_filp = filp; - -out: - /* - * no xxxput required here as every vfs op - * does it by itself on failure - */ - return ret; -} - -/* - * Closes the struct file that was created in open_cur_inode_file - */ -static int close_cur_inode_file(struct send_ctx *sctx) -{ - int ret = 0; - - if (!sctx->cur_inode_filp) - goto out; - - ret = filp_close(sctx->cur_inode_filp, NULL); - sctx->cur_inode_filp = NULL; - -out: - return ret; -} - -/* * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace */ static int send_subvol_begin(struct send_ctx *sctx) @@ -3622,6 +3548,72 @@ out: return ret; } +static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct page *page; + char *addr; + struct btrfs_key key; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t last_index; + unsigned pg_offset = offset & ~PAGE_CACHE_MASK; + ssize_t ret = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (offset + len > i_size_read(inode)) { + if (offset > i_size_read(inode)) + len = 0; + else + len = offset - i_size_read(inode); + } + if (len == 0) + goto out; + + last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + while (index <= last_index) { + unsigned cur_len = min_t(unsigned, len, + PAGE_CACHE_SIZE - pg_offset); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + break; + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + + addr = kmap(page); + memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + kunmap(page); + unlock_page(page); + page_cache_release(page); + index++; + pg_offset = 0; + len -= cur_len; + ret += cur_len; + } +out: + iput(inode); + return ret; +} + /* * Read some bytes from the current inode/file and send a write command to * user space. @@ -3630,35 +3622,20 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) { int ret = 0; struct fs_path *p; - loff_t pos = offset; - int num_read = 0; - mm_segment_t old_fs; + ssize_t num_read = 0; p = fs_path_alloc(); if (!p) return -ENOMEM; - /* - * vfs normally only accepts user space buffers for security reasons. - * we only read from the file and also only provide the read_buf buffer - * to vfs. As this buffer does not come from a user space call, it's - * ok to temporary allow kernel space buffers. - */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); - ret = open_cur_inode_file(sctx); - if (ret < 0) - goto out; - - ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); - if (ret < 0) - goto out; - num_read = ret; - if (!num_read) + num_read = fill_read_buf(sctx, offset, len); + if (num_read <= 0) { + if (num_read < 0) + ret = num_read; goto out; + } ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) @@ -3677,7 +3654,6 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); tlv_put_failure: out: fs_path_free(p); - set_fs(old_fs); if (ret < 0) return ret; return num_read; @@ -3926,16 +3902,16 @@ static int is_extent_unchanged(struct send_ctx *sctx, while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); - right_offset = btrfs_file_extent_offset(eb, ei); - right_gen = btrfs_file_extent_generation(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { ret = 0; goto out; } + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_len = btrfs_file_extent_num_bytes(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + /* * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. @@ -4222,10 +4198,6 @@ static int changed_inode(struct send_ctx *sctx, u64 left_gen = 0; u64 right_gen = 0; - ret = close_cur_inode_file(sctx); - if (ret < 0) - goto out; - sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; @@ -4686,11 +4658,6 @@ static int send_subvol(struct send_ctx *sctx) } out: - if (!ret) - ret = close_cur_inode_file(sctx); - else - close_cur_inode_file(sctx); - free_recorded_refs(sctx); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e913328d0f2..2d8ac1bf0cf 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -42,7 +42,6 @@ #include <linux/cleancache.h> #include <linux/ratelimit.h> #include <linux/btrfs.h> -#include "compat.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" @@ -921,7 +920,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_all_ordered_extents(fs_info); + btrfs_wait_ordered_roots(fs_info, -1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1330,6 +1329,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. */ + + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); + sb->s_flags |= MS_RDONLY; btrfs_dev_replace_suspend_for_unmount(fs_info); @@ -1465,7 +1470,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); - devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return -ENOMEM; @@ -1789,7 +1794,25 @@ static void btrfs_print_info(void) static int btrfs_run_sanity_tests(void) { - return btrfs_test_free_space_cache(); + int ret; + + ret = btrfs_init_test_fs(); + if (ret) + return ret; + + ret = btrfs_test_free_space_cache(); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(); + if (ret) + goto out; + ret = btrfs_test_extent_io(); + if (ret) + goto out; + ret = btrfs_test_inodes(); +out: + btrfs_destroy_test_fs(); + return ret; } static int __init init_btrfs_fs(void) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c new file mode 100644 index 00000000000..757ef00a75a --- /dev/null +++ b/fs/btrfs/tests/btrfs-tests.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/magic.h> +#include "btrfs-tests.h" +#include "../ctree.h" + +static struct vfsmount *test_mnt = NULL; + +static const struct super_operations btrfs_test_super_ops = { + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_test_destroy_inode, +}; + +static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, + NULL, BTRFS_TEST_MAGIC); +} + +static struct file_system_type test_type = { + .name = "btrfs_test_fs", + .mount = btrfs_test_mount, + .kill_sb = kill_anon_super, +}; + +struct inode *btrfs_new_test_inode(void) +{ + return new_inode(test_mnt->mnt_sb); +} + +int btrfs_init_test_fs(void) +{ + int ret; + + ret = register_filesystem(&test_type); + if (ret) { + printk(KERN_ERR "btrfs: cannot register test file system\n"); + return ret; + } + + test_mnt = kern_mount(&test_type); + if (IS_ERR(test_mnt)) { + printk(KERN_ERR "btrfs: cannot mount test file system\n"); + unregister_filesystem(&test_type); + return ret; + } + return 0; +} + +void btrfs_destroy_test_fs(void) +{ + kern_unmount(test_mnt); + unregister_filesystem(&test_type); +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 58087762577..b353bc806ca 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,11 +24,36 @@ #define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) int btrfs_test_free_space_cache(void); +int btrfs_test_extent_buffer_operations(void); +int btrfs_test_extent_io(void); +int btrfs_test_inodes(void); +int btrfs_init_test_fs(void); +void btrfs_destroy_test_fs(void); +struct inode *btrfs_new_test_inode(void); #else static inline int btrfs_test_free_space_cache(void) { return 0; } +static inline int btrfs_test_extent_buffer_operations(void) +{ + return 0; +} +static inline int btrfs_init_test_fs(void) +{ + return 0; +} +static inline void btrfs_destroy_test_fs(void) +{ +} +static inline int btrfs_test_extent_io(void) +{ + return 0; +} +static inline int btrfs_test_inodes(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c new file mode 100644 index 00000000000..cc286ce97d1 --- /dev/null +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/slab.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../extent_io.h" +#include "../disk-io.h" + +static int test_btrfs_split_item(void) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + struct btrfs_item *item; + char *value = "mary had a little lamb"; + char *split1 = "mary had a little"; + char *split2 = " lamb"; + char *split3 = "mary"; + char *split4 = " had a little"; + char buf[32]; + struct btrfs_key key; + u32 value_len = strlen(value); + int ret = 0; + + test_msg("Running btrfs_split_item tests\n"); + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Could not allocate root\n"); + return PTR_ERR(root); + } + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Could not allocate path\n"); + kfree(root); + return -ENOMEM; + } + + path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); + if (!eb) { + test_msg("Could not allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + path->slots[0] = 0; + + key.objectid = 0; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = 0; + + setup_items_for_insert(root, path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + item = btrfs_item_nr(0); + write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), + value_len); + + key.offset = 3; + + /* + * Passing NULL trans here should be safe because we have plenty of + * space in this leaf to split the item without having to split the + * leaf. + */ + ret = btrfs_split_item(NULL, root, path, &key, 17); + if (ret) { + test_msg("Split item failed %d\n", ret); + goto out; + } + + /* + * Read the first slot, it should have the original key and contain only + * 'mary had a little' + */ + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split1)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split1)); + if (memcmp(buf, split1, strlen(split1))) { + test_msg("Data in the buffer doesn't match what it should " + "in the first split have='%.*s' want '%s'\n", + (int)strlen(split1), buf, split1); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the second split\n"); + ret = -EINVAL; + goto out; + } + + key.offset = 1; + /* Do it again so we test memmoving the other items in the leaf */ + ret = btrfs_split_item(NULL, root, path, &key, 4); + if (ret) { + test_msg("Second split item failed %d\n", ret); + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split3)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split3)); + if (memcmp(buf, split3, strlen(split3))) { + test_msg("Data in the buffer doesn't match what it should " + "in the third split"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 1) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split4)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split4)); + if (memcmp(buf, split4, strlen(split4))) { + test_msg("Data in the buffer doesn't match what it should " + "in the fourth split\n"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 2); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 2\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(2); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the last chunk\n"); + ret = -EINVAL; + goto out; + } +out: + btrfs_free_path(path); + kfree(root); + return ret; +} + +int btrfs_test_extent_buffer_operations(void) +{ + test_msg("Running extent buffer operation tests"); + return test_btrfs_split_item(); +} diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c new file mode 100644 index 00000000000..7e99c2f98dd --- /dev/null +++ b/fs/btrfs/tests/extent-io-tests.c @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/pagemap.h> +#include <linux/sched.h> +#include "btrfs-tests.h" +#include "../extent_io.h" + +#define PROCESS_UNLOCK (1 << 0) +#define PROCESS_RELEASE (1 << 1) +#define PROCESS_TEST_LOCKED (1 << 2) + +static noinline int process_page_range(struct inode *inode, u64 start, u64 end, + unsigned long flags) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int count = 0; + int loops = 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (flags & PROCESS_TEST_LOCKED && + !PageLocked(pages[i])) + count++; + if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) + unlock_page(pages[i]); + page_cache_release(pages[i]); + if (flags & PROCESS_RELEASE) + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + loops++; + if (loops > 100000) { + printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret); + break; + } + } + return count; +} + +static int test_find_delalloc(void) +{ + struct inode *inode; + struct extent_io_tree tmp; + struct page *page; + struct page *locked_page = NULL; + unsigned long index = 0; + u64 total_dirty = 256 * 1024 * 1024; + u64 max_bytes = 128 * 1024 * 1024; + u64 start, end, test_start; + u64 found; + int ret = -EINVAL; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Failed to allocate test inode\n"); + return -ENOMEM; + } + + extent_io_tree_init(&tmp, &inode->i_data); + + /* + * First go through and create and mark all of our pages dirty, we pin + * everything to make sure our pages don't get evicted and screw up our + * test. + */ + for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + test_msg("Failed to allocate test page\n"); + ret = -ENOMEM; + goto out; + } + SetPageDirty(page); + if (index) { + unlock_page(page); + } else { + page_cache_get(page); + locked_page = page; + } + } + + /* Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + start = 0; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Should have found at least one delalloc\n"); + goto out_bits; + } + if (start != 0 || end != 4095) { + test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", + start, end); + goto out_bits; + } + unlock_extent(&tmp, start, end); + unlock_page(locked_page); + page_cache_release(locked_page); + + /* + * Test this scenario + * + * |--- delalloc ---| + * |--- search ---| + */ + test_start = 64 * 1024 * 1024; + locked_page = find_lock_page(inode->i_mapping, + test_start >> PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Couldn't find the locked page\n"); + goto out_bits; + } + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Couldn't find delalloc in our range\n"); + goto out_bits; + } + if (start != test_start || end != max_bytes - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu, end " + "%Lu\n", test_start, max_bytes - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("There were unlocked pages in the range\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + /* locked_page was unlocked above */ + page_cache_release(locked_page); + + /* + * Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + test_start = max_bytes + 4096; + locked_page = find_lock_page(inode->i_mapping, test_start >> + PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Could'nt find the locked page\n"); + goto out_bits; + } + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (found) { + test_msg("Found range when we shouldn't have\n"); + goto out_bits; + } + if (end != (u64)-1) { + test_msg("Did not return the proper end offset\n"); + goto out_bits; + } + + /* + * Test this scenario + * [------- delalloc -------| + * [max_bytes]|-- search--| + * + * We are re-using our test_start from above since it works out well. + */ + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start || end != total_dirty - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, total_dirty - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + + /* + * Now to test where we run into a page that is no longer dirty in the + * range we want to find. + */ + page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) + >> PAGE_CACHE_SHIFT); + if (!page) { + test_msg("Couldn't find our page\n"); + goto out_bits; + } + ClearPageDirty(page); + page_cache_release(page); + + /* We unlocked it in the previous test */ + lock_page(locked_page); + start = test_start; + end = 0; + /* + * Currently if we fail to find dirty pages in the delalloc range we + * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If + * this changes at any point in the future we will need to fix this + * tests expected behavior. + */ + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, test_start + PAGE_CACHE_SIZE - 1, start, + end); + goto out_bits; + } + if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | + PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + ret = 0; +out_bits: + clear_extent_bits(&tmp, 0, total_dirty - 1, + (unsigned long)-1, GFP_NOFS); +out: + if (locked_page) + page_cache_release(locked_page); + process_page_range(inode, 0, total_dirty - 1, + PROCESS_UNLOCK | PROCESS_RELEASE); + iput(inode); + return ret; +} + +int btrfs_test_extent_io(void) +{ + test_msg("Running find delalloc tests\n"); + return test_find_delalloc(); +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c new file mode 100644 index 00000000000..397d1f99a8e --- /dev/null +++ b/fs/btrfs/tests/inode-tests.c @@ -0,0 +1,955 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../btrfs_inode.h" +#include "../disk-io.h" +#include "../extent_io.h" +#include "../volumes.h" + +static struct btrfs_fs_info *alloc_dummy_fs_info(void) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_NOFS); + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_NOFS); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + return fs_info; +} +static void free_dummy_root(struct btrfs_root *root) +{ + if (!root) + return; + if (root->fs_info) { + kfree(root->fs_info->fs_devices); + kfree(root->fs_info); + } + if (root->node) + free_extent_buffer(root->node); + kfree(root); +} + +static void insert_extent(struct btrfs_root *root, u64 start, u64 len, + u64 ram_bytes, u64 offset, u64 disk_bytenr, + u64 disk_len, u32 type, u8 compression, int slot) +{ + struct btrfs_path path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = sizeof(struct btrfs_file_extent_item); + + if (type == BTRFS_FILE_EXTENT_INLINE) + value_len += len; + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = slot; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, 1); + btrfs_set_file_extent_type(leaf, fi, type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len); + btrfs_set_file_extent_offset(leaf, fi, offset); + btrfs_set_file_extent_num_bytes(leaf, fi, len); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); +} + +static void insert_inode_item_key(struct btrfs_root *root) +{ + struct btrfs_path path; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = 0; + + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = 0; + + key.objectid = BTRFS_INODE_ITEM_KEY; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); +} + +/* + * Build the most complicated map of extents the earth has ever seen. We want + * this so we can test all of the corner cases of btrfs_get_extent. Here is a + * diagram of how the extents will look though this may not be possible we still + * want to make sure everything acts normally (the last number is not inclusive) + * + * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] + * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * + * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] + * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * + * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] + * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * + * [81920-86016] + * [ regular ] + */ +static void setup_file_extents(struct btrfs_root *root) +{ + int slot = 0; + u64 disk_bytenr = 1 * 1024 * 1024; + u64 offset = 0; + + /* First we want a hole */ + insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 5; + + /* + * Now we want an inline extent, I don't think this is possible but hey + * why not? Also keep in mind if we have an inline extent it counts as + * the whole first page. If we were to expand it we would have to cow + * and we wouldn't have an inline extent anymore. + */ + insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + slot); + slot++; + offset = 4096; + + /* Now another hole */ + insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 4; + + /* Now for a regular extent */ + insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + disk_bytenr += 4096; + offset += 4095; + + /* + * Now for 3 extents that were split from a hole punch so we test + * offsets properly. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, + 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now for a unwritten prealloc extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + + /* + * We want to jack up disk_bytenr a little more so the em stuff doesn't + * merge our records. + */ + disk_bytenr += 8192; + + /* + * Now for a partially written prealloc extent, basically the same as + * the hole punch example above. Ram_bytes never changes when you mark + * extents written btw. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now a normal compressed extent */ + insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + /* No merges */ + disk_bytenr += 8192; + + /* Now a split compressed extent */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + disk_bytenr += 8192; + + /* Now extents that have a hole but no hole extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 16384; + disk_bytenr += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); +} + +static unsigned long prealloc_only = 0; +static unsigned long compressed_only = 0; +static unsigned long vacancy_only = 0; + +static noinline int test_btrfs_get_extent(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + u64 orig_start; + u64 disk_bytenr; + u64 offset; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + /* + * We do this since btrfs_get_extent wants to assign em->bdev to + * root->fs_info->fs_devices->latest_bdev. + */ + root->fs_info = alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(0, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + /* + * We will just free a dummy node if it's ref count is 2 so we need an + * extra ref so our searches don't accidently release our page. + */ + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + ret = -EINVAL; + + /* First with no extents */ + BTRFS_I(inode)->root = root; + em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + if (IS_ERR(em)) { + em = NULL; + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + test_msg("Vacancy flag wasn't set properly\n"); + goto out; + } + free_extent_map(em); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + + /* + * All of the magic numbers are based on the mapping setup in + * setup_file_extents, so if you change anything there you need to + * update the comment and update the expected values below. + */ + setup_file_extents(root); + + em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 5) { + test_msg("Unexpected extent wanted start 0 len 5, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_INLINE) { + test_msg("Expected an inline, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4091) { + test_msg("Unexpected extent wanted start %llu len 1, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + /* + * We don't test anything else for inline since it doesn't get set + * unless we have a page for it to write into. Maybe we should change + * this? + */ + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4) { + test_msg("Unexpected extent wanted start %llu len 4, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Regular extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4095) { + test_msg("Unexpected extent wanted start %llu len 4095, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are split extents */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + disk_bytenr += (em->start - orig_start); + if (em->block_start != disk_bytenr) { + test_msg("Wrong block start, want %llu, have %llu\n", + disk_bytenr, em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are a half written prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_HOLE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Unexpected orig offset, wanted %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start, + em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Now for the compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Split compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != disk_bytenr) { + test_msg("Block start does not match, want %llu got %llu\n", + disk_bytenr, em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* A hole between regular extents but no hole extent */ + em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole extent, got %llu\n", em->block_start); + goto out; + } + /* + * Currently we just return a length that we requested rather than the + * length of the actual hole, if this changes we'll have to change this + * test. + */ + if (em->start != offset || em->len != 12288) { + test_msg("Unexpected extent wanted start %llu len 12288, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + vacancy_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + free_dummy_root(root); + return ret; +} + +static int test_hole_first(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(0, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + BTRFS_I(inode)->root = root; + ret = -EINVAL; + + /* + * Need a blank inode item here just so we don't confuse + * btrfs_get_extent. + */ + insert_inode_item_key(root); + insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 4096) { + test_msg("Unexpected extent wanted start 0 len 4096, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only, + em->flags); + goto out; + } + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != 4096) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != 4096 || em->len != 4096) { + test_msg("Unexpected extent wanted start 4096 len 4096, got " + "start %llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, wanted 0 got %lu\n", + em->flags); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + free_dummy_root(root); + return ret; +} + +int btrfs_test_inodes(void) +{ + int ret; + + set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); + set_bit(EXTENT_FLAG_VACANCY, &vacancy_only); + set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + + test_msg("Running btrfs_get_extent tests\n"); + ret = test_btrfs_get_extent(); + if (ret) + return ret; + test_msg("Running hole first btrfs_get_extent test\n"); + return test_hole_first(); +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8c81bdc1ef9..57c16b46afb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -57,7 +57,7 @@ static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { __TRANS_JOIN_NOLOCK), }; -static void put_transaction(struct btrfs_transaction *transaction) +void btrfs_put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { @@ -332,7 +332,7 @@ static void wait_current_trans(struct btrfs_root *root) wait_event(root->fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || cur_trans->aborted); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); } else { spin_unlock(&root->fs_info->trans_lock); } @@ -353,6 +353,17 @@ static int may_wait_transaction(struct btrfs_root *root, int type) return 0; } +static inline bool need_reserve_reloc_root(struct btrfs_root *root) +{ + if (!root->fs_info->reloc_ctl || + !root->ref_cows || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + root->reloc_root) + return false; + + return true; +} + static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) @@ -360,8 +371,9 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; - int ret; u64 qgroup_reserved = 0; + bool reloc_reserved = false; + int ret; if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); @@ -390,6 +402,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, } num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + /* + * Do the reservation for the relocation root creation + */ + if (unlikely(need_reserve_reloc_root(root))) { + num_bytes += root->nodesize; + reloc_reserved = true; + } + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, num_bytes, flush); @@ -451,6 +471,7 @@ again: h->delayed_ref_elem.seq = 0; h->type = type; h->allocating_chunk = false; + h->reloc_reserved = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -466,6 +487,7 @@ again: h->transid, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; + h->reloc_reserved = reloc_reserved; } h->qgroup_reserved = qgroup_reserved; @@ -610,7 +632,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) } wait_for_commit(root, cur_trans); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); out: return ret; } @@ -735,7 +757,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); if (current->journal_info == trans) current->journal_info = NULL; @@ -744,8 +766,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); if (trans->aborted || - test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + wake_up_process(info->transaction_kthread); err = -EIO; + } assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -948,16 +972,19 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, return ret; ret = btrfs_run_dev_stats(trans, root->fs_info); - WARN_ON(ret); + if (ret) + return ret; ret = btrfs_run_dev_replace(trans, root->fs_info); - WARN_ON(ret); - + if (ret) + return ret; ret = btrfs_run_qgroups(trans, root->fs_info); - BUG_ON(ret); + if (ret) + return ret; /* run_qgroups might have added some more refs */ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; @@ -1510,7 +1537,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, if (current->journal_info == trans) current->journal_info = NULL; - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); return 0; } @@ -1552,8 +1579,10 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->trans_lock); - put_transaction(cur_trans); - put_transaction(cur_trans); + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); trace_btrfs_transaction_commit(root); @@ -1571,15 +1600,19 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_items(trans, root); - if (ret) - return ret; - /* * running the delayed items may have added new refs. account * them now so that they hinder processing of more delayed refs * as little as possible. */ - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + if (ret) { + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + return ret; + } + + ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; /* * rename don't use btrfs_join_transaction, so, once we @@ -1596,14 +1629,14 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - return btrfs_start_all_delalloc_inodes(fs_info, 1); + return btrfs_start_delalloc_roots(fs_info, 1); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_all_ordered_extents(fs_info); + btrfs_wait_ordered_roots(fs_info, -1); } int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -1669,7 +1702,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, cur_trans); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); return ret; } @@ -1686,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, prev_trans); - put_transaction(prev_trans); + btrfs_put_transaction(prev_trans); } else { spin_unlock(&root->fs_info->trans_lock); } @@ -1885,8 +1918,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, list_del_init(&cur_trans->list); spin_unlock(&root->fs_info->trans_lock); - put_transaction(cur_trans); - put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5c2af849162..7657d115067 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -92,6 +92,7 @@ struct btrfs_trans_handle { short aborted; short adding_csums; bool allocating_chunk; + bool reloc_reserved; unsigned int type; /* * this root is only needed to validate that the root passed to @@ -166,4 +167,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +void btrfs_put_transaction(struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 94e05c1f118..76928ca9774 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -37,7 +37,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int ret = 0; int wret; int level; - int is_extent = 0; int next_key_ret = 0; u64 last_ret = 0; u64 min_trans = 0; @@ -50,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } - if (root->ref_cows == 0 && !is_extent) + if (root->ref_cows == 0) goto out; if (btrfs_test_opt(root, SSD)) @@ -85,7 +84,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->keep_locks = 1; - ret = btrfs_search_forward(root, &key, NULL, path, min_trans); + ret = btrfs_search_forward(root, &key, path, min_trans); if (ret < 0) goto out; if (ret > 0) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 79f057c0619..744553c83fe 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -26,7 +26,6 @@ #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "compat.h" #include "tree-log.h" #include "hash.h" @@ -936,7 +935,7 @@ again: parent_objectid, victim_name, victim_name_len)) { - btrfs_inc_nlink(inode); + inc_nlink(inode); btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, dir, @@ -1006,7 +1005,7 @@ again: victim_parent = read_one_inode(root, parent_objectid); if (victim_parent) { - btrfs_inc_nlink(inode); + inc_nlink(inode); btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, @@ -1113,11 +1112,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir; - struct inode *inode; + struct inode *dir = NULL; + struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - char *name; + char *name = NULL; int namelen; int ret; int search_done = 0; @@ -1150,13 +1149,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * care of the rest */ dir = read_one_inode(root, parent_objectid); - if (!dir) - return -ENOENT; + if (!dir) { + ret = -ENOENT; + goto out; + } inode = read_one_inode(root, inode_objectid); if (!inode) { - iput(dir); - return -EIO; + ret = -EIO; + goto out; } while (ref_ptr < ref_end) { @@ -1169,14 +1170,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ if (!dir) dir = read_one_inode(root, parent_objectid); - if (!dir) - return -ENOENT; + if (!dir) { + ret = -ENOENT; + goto out; + } } else { ret = ref_get_fields(eb, ref_ptr, &namelen, &name, &ref_index); } if (ret) - return ret; + goto out; /* if we already have a perfect match, we're done */ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), @@ -1196,12 +1199,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, parent_objectid, ref_index, name, namelen, &search_done); - if (ret == 1) { - ret = 0; + if (ret) { + if (ret == 1) + ret = 0; goto out; } - if (ret) - goto out; } /* insert our name */ @@ -1215,6 +1217,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); + name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; @@ -1225,6 +1228,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); + kfree(name); iput(dir); iput(inode); return ret; @@ -1307,6 +1311,7 @@ static int count_inode_refs(struct btrfs_root *root, break; path->slots[0]--; } +process_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != ino || @@ -1327,6 +1332,10 @@ static int count_inode_refs(struct btrfs_root *root, if (key.offset == 0) break; + if (path->slots[0] > 0) { + path->slots[0]--; + goto process_slot; + } key.offset--; btrfs_release_path(path); } @@ -1480,7 +1489,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, if (!inode->i_nlink) set_nlink(inode, 1); else - btrfs_inc_nlink(inode); + inc_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; @@ -1823,7 +1832,7 @@ again: dir_key->offset, name, name_len, 0); } - if (IS_ERR_OR_NULL(log_di)) { + if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -1841,7 +1850,7 @@ again: goto out; } - btrfs_inc_nlink(inode); + inc_nlink(inode); ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) @@ -1860,6 +1869,9 @@ again: goto again; ret = 0; goto out; + } else if (IS_ERR(log_di)) { + kfree(name); + return PTR_ERR(log_di); } btrfs_release_path(log_path); kfree(name); @@ -2118,8 +2130,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; - if (btrfs_header_level(cur) != *level) - WARN_ON(1); + WARN_ON(btrfs_header_level(cur) != *level); if (path->slots[*level] >= btrfs_header_nritems(cur)) @@ -2151,11 +2162,13 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); @@ -2227,11 +2240,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(root, @@ -2301,11 +2316,13 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, log, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, log, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); @@ -2571,9 +2588,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - btrfs_scrub_pause_super(root); ret = write_ctree_super(trans, root->fs_info->tree_root, 1); - btrfs_scrub_continue_super(root); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; @@ -2608,13 +2623,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - if (trans) { - ret = walk_log_tree(trans, log, &wc); - - /* I don't think this can happen but just in case */ - if (ret) - btrfs_abort_transaction(trans, log, ret); - } + ret = walk_log_tree(trans, log, &wc); + /* I don't think this can happen but just in case */ + if (ret) + btrfs_abort_transaction(trans, log, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -2867,7 +2879,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; - struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src; int err = 0; @@ -2879,9 +2890,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); log = root->log_root; - max_key.objectid = ino; - max_key.offset = (u64)-1; - max_key.type = key_type; min_key.objectid = ino; min_key.type = key_type; @@ -2889,8 +2897,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->keep_locks = 1; - ret = btrfs_search_forward(root, &min_key, &max_key, - path, trans->transid); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); /* * we didn't find anything from this transaction, see if there @@ -2943,10 +2950,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, /* find the first key from this transaction again */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret != 0) { - WARN_ON(1); + if (WARN_ON(ret != 0)) goto done; - } /* * we have a block from this transaction, log every item in it @@ -3172,11 +3177,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_key key; int ret; - memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); - ret = btrfs_insert_empty_item(trans, log, path, &key, + ret = btrfs_insert_empty_item(trans, log, path, + &BTRFS_I(inode)->location, sizeof(*inode_item)); if (ret && ret != -EEXIST) return ret; @@ -3375,7 +3379,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_token_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG, &token); - if (em->block_start == 0) + if (em->block_start == EXTENT_MAP_HOLE) skip_csum = true; } @@ -3417,11 +3421,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; - if (em->compress_type) { - csum_offset = 0; - csum_len = block_len; - } - /* * First check and see if our csums are on our outstanding ordered * extents. @@ -3505,8 +3504,13 @@ unlocked: if (!mod_len || ret) return ret; - csum_offset = mod_start - em->start; - csum_len = mod_len; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } else { + csum_offset = mod_start - em->start; + csum_len = mod_len; + } /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, @@ -3719,7 +3723,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, while (1) { ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, &max_key, + ret = btrfs_search_forward(root, &min_key, path, trans->transid); if (ret != 0) break; @@ -3769,14 +3773,14 @@ next_slot: } btrfs_release_path(path); - if (min_key.offset < (u64)-1) + if (min_key.offset < (u64)-1) { min_key.offset++; - else if (min_key.type < (u8)-1) + } else if (min_key.type < max_key.type) { min_key.type++; - else if (min_key.objectid < (u64)-1) - min_key.objectid++; - else + min_key.offset = 0; + } else { break; + } } if (ins_nr) { ret = copy_items(trans, inode, dst_path, src, ins_start_slot, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index dd0dea3766f..fbda90004fe 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -260,7 +260,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; - struct btrfs_key max_key; struct btrfs_path *path; int ret = 0; struct extent_buffer *leaf; @@ -277,13 +276,10 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, key.objectid = 0; key.type = 0; key.offset = 0; - max_key.objectid = (u64)-1; - max_key.type = (u8)-1; - max_key.offset = (u64)-1; again_search_slot: path->keep_locks = 1; - ret = btrfs_search_forward(root, &key, &max_key, path, 0); + ret = btrfs_search_forward(root, &key, path, 0); if (ret) { if (ret > 0) ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 043b215769c..0db63709786 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -28,7 +28,6 @@ #include <linux/raid/pq.h> #include <linux/semaphore.h> #include <asm/div64.h> -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -666,7 +665,8 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->bdev) fs_devices->open_devices--; - if (device->writeable && !device->is_tgtdev_for_dev_replace) { + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } @@ -2041,6 +2041,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->in_fs_metadata = 1; device->is_tgtdev_for_dev_replace = 0; device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; set_blocksize(device->bdev, 4096); if (seeding_dev) { @@ -2208,6 +2209,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->in_fs_metadata = 1; device->is_tgtdev_for_dev_replace = 1; device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; set_blocksize(device->bdev, 4096); device->fs_devices = fs_info->fs_devices; list_add(&device->dev_list, &fs_info->fs_devices->devices); @@ -2550,8 +2552,7 @@ again: failed = 0; retried = true; goto again; - } else if (failed && retried) { - WARN_ON(1); + } else if (WARN_ON(failed && retried)) { ret = -ENOSPC; } error: @@ -3423,6 +3424,9 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); @@ -3488,7 +3492,7 @@ static int btrfs_uuid_scan_kthread(void *data) path->keep_locks = 1; while (1) { - ret = btrfs_search_forward(root, &key, &max_key, path, 0); + ret = btrfs_search_forward(root, &key, path, 0); if (ret) { if (ret > 0) ret = 0; @@ -4488,6 +4492,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " "%Lu-%Lu\n", logical, logical+len, em->start, em->start + em->len); + free_extent_map(em); return 1; } @@ -4668,6 +4673,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " "found %Lu-%Lu\n", logical, em->start, em->start + em->len); + free_extent_map(em); return -EINVAL; } @@ -4895,7 +4901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = map->num_stripes; max_errors = nr_parity_stripes(map); - raid_map = kmalloc(sizeof(u64) * num_stripes, + raid_map = kmalloc_array(num_stripes, sizeof(u64), GFP_NOFS); if (!raid_map) { ret = -ENOMEM; @@ -5395,10 +5401,8 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, .bi_rw = bio->bi_rw, }; - if (bio->bi_vcnt == 0) { - WARN_ON(1); + if (WARN_ON(bio->bi_vcnt == 0)) return 1; - } prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (bio_sectors(bio) > max_sectors) @@ -5631,10 +5635,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, struct btrfs_device *dev; u64 tmp; - if (!devid && !fs_info) { - WARN_ON(1); + if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - } dev = __alloc_device(); if (IS_ERR(dev)) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b72f540c8b2..8b3cd142b37 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -43,9 +43,8 @@ struct btrfs_device { /* WRITE_SYNC bios */ struct btrfs_pending_bios pending_sync_bios; - int running_pending; u64 generation; - + int running_pending; int writeable; int in_fs_metadata; int missing; @@ -53,11 +52,11 @@ struct btrfs_device { int is_tgtdev_for_dev_replace; spinlock_t io_lock; + /* the mode sent to blkdev_get */ + fmode_t mode; struct block_device *bdev; - /* the mode sent to blkdev_get */ - fmode_t mode; struct rcu_string *name; @@ -78,16 +77,21 @@ struct btrfs_device { /* optimal io width for this device */ u32 io_width; + /* type and info about this device */ + u64 type; /* minimal io size for this device */ u32 sector_size; - /* type and info about this device */ - u64 type; /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + /* for sending down flush barriers */ + int nobarriers; + struct bio *flush_bio; + struct completion flush_wait; + /* per-device scrub information */ struct scrub_ctx *scrub_device; @@ -103,10 +107,6 @@ struct btrfs_device { struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; - /* for sending down flush barriers */ - struct bio *flush_bio; - struct completion flush_wait; - int nobarriers; /* disk I/O failure stats. For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ @@ -132,7 +132,9 @@ struct btrfs_fs_devices { /* all of the devices in the FS, protected by a mutex * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code + * worrying about add/remove by the multi-device code. + * Scrubbing super can kick off supers writing by holding + * this mutex lock. */ struct mutex device_list_mutex; struct list_head devices; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 000eae2782b..2f6735dbf1a 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -392,7 +392,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, wait_for_completion(&ecr->completion); rc = ecr->rc; - INIT_COMPLETION(ecr->completion); + reinit_completion(&ecr->completion); } out: ablkcipher_request_free(req); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index efc85b1377c..3c6136f98c7 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -129,7 +129,7 @@ static int can_set_xattr(struct inode *inode, const char *name, static void hfsplus_init_header_node(struct inode *attr_file, u32 clump_size, - char *buf, size_t node_size) + char *buf, u16 node_size) { struct hfs_bnode_desc *desc; struct hfs_btree_header_rec *head; @@ -139,8 +139,9 @@ static void hfsplus_init_header_node(struct inode *attr_file, char *bmp; u32 used_nodes; u32 used_bmp_bytes; + loff_t tmp; - hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %zu\n", + hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", clump_size, node_size); /* The end of the node contains list of record offsets */ @@ -154,7 +155,9 @@ static void hfsplus_init_header_node(struct inode *attr_file, head = (struct hfs_btree_header_rec *)(buf + offset); head->node_size = cpu_to_be16(node_size); - head->node_count = cpu_to_be32(i_size_read(attr_file) / node_size); + tmp = i_size_read(attr_file); + do_div(tmp, node_size); + head->node_count = cpu_to_be32(tmp); head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1); head->clump_size = cpu_to_be32(clump_size); head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c8e729deb4f..74a7e12e10d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -244,7 +244,7 @@ static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); spin_lock(&tbl->slot_tbl_lock); if (tbl->highest_used_slotid != NFS4_NO_SLOT) { - INIT_COMPLETION(tbl->complete); + reinit_completion(&tbl->complete); spin_unlock(&tbl->slot_tbl_lock); return wait_for_completion_interruptible(&tbl->complete); } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3a44a648dae..3407b2c62b2 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1304,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) { wait_for_completion(&mw->mw_complete); /* Re-arm the completion in case we want to wait on it again */ - INIT_COMPLETION(mw->mw_complete); + reinit_completion(&mw->mw_complete); return mw->mw_status; } @@ -1355,7 +1355,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, else ret = mw->mw_status; /* Re-arm the completion in case we want to wait on it again */ - INIT_COMPLETION(mw->mw_complete); + reinit_completion(&mw->mw_complete); return ret; } diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index b701eaa482b..51942d5abce 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -29,7 +29,6 @@ static int show_console_dev(struct seq_file *m, void *v) char flags[ARRAY_SIZE(con_flags) + 1]; struct console *con = v; unsigned int a; - int len; dev_t dev = 0; if (con->device) { @@ -47,11 +46,10 @@ static int show_console_dev(struct seq_file *m, void *v) con_flags[a].name : ' '; flags[a] = 0; - seq_printf(m, "%s%d%n", con->name, con->index, &len); - len = 21 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-', + seq_setwidth(m, 21 - 1); + seq_printf(m, "%s%d", con->name, con->index); + seq_pad(m, ' '); + seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con->write ? 'W' : '-', con->unblank ? 'U' : '-', flags); if (dev) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c805d5b69ba..a77d2b29919 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -1,8 +1,8 @@ #include <linux/fs.h> -#include <linux/hugetlb.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> #include <linux/proc_fs.h> diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index ccfd99bd1c5..5f9bc8a746c 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -39,7 +39,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) unsigned long ino = 0; struct file *file; dev_t dev = 0; - int flags, len; + int flags; flags = region->vm_flags; file = region->vm_file; @@ -50,8 +50,9 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) ino = inode->i_ino; } + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, - "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", region->vm_start, region->vm_end, flags & VM_READ ? 'r' : '-', @@ -59,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', ((loff_t)region->vm_pgoff) << PAGE_SHIFT, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); if (file) { - len = 25 + sizeof(void *) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); + seq_pad(m, ' '); seq_path(m, &file->f_path, ""); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index abbe825d20f..fb52b548080 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -62,7 +62,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + (PTRS_PER_PTE * sizeof(pte_t) * + atomic_long_read(&mm->nr_ptes)) >> 10, swap << (PAGE_SHIFT-10)); } @@ -83,14 +84,6 @@ unsigned long task_statm(struct mm_struct *mm, return mm->total_vm; } -static void pad_len_spaces(struct seq_file *m, int len) -{ - len = 25 + sizeof(void*) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); -} - #ifdef CONFIG_NUMA /* * These functions are for numa_maps but called in generic **maps seq_file @@ -268,7 +261,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; - int len; const char *name = NULL; if (file) { @@ -286,7 +278,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) if (stack_guard_page_end(vma, end)) end -= PAGE_SIZE; - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", start, end, flags & VM_READ ? 'r' : '-', @@ -294,14 +287,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', pgoff, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); /* * Print the dentry name for named mappings, and a * special [heap] marker for the heap: */ if (file) { - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_path(m, &file->f_path, "\n"); goto done; } @@ -333,7 +326,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = "[stack]"; } else { /* Thread stack in /proc/PID/maps */ - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_printf(m, "[stack:%d]", tid); } } @@ -341,7 +334,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) done: if (name) { - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_puts(m, name); } seq_putc(m, '\n'); @@ -505,9 +498,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -998,13 +991,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { struct vm_area_struct *vma; struct pagemapread *pm = walk->private; + spinlock_t *ptl; pte_t *pte; int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { int pmd_flags2; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) @@ -1022,7 +1016,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (err) break; } - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return err; } @@ -1324,7 +1318,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, md = walk->private; - if (pmd_trans_huge_lock(pmd, md->vma) == 1) { + if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; @@ -1332,7 +1326,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (page) gather_stats(page, md, pte_dirty(huge_pte), HPAGE_PMD_SIZE/PAGE_SIZE); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return 0; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 56123a6f462..678455d2d68 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,14 +123,6 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static void pad_len_spaces(struct seq_file *m, int len) -{ - len = 25 + sizeof(void*) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); -} - /* * display a single VMA to a sequenced file */ @@ -142,7 +134,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, unsigned long ino = 0; struct file *file; dev_t dev = 0; - int flags, len; + int flags; unsigned long long pgoff = 0; flags = vma->vm_flags; @@ -155,8 +147,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, - "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', @@ -164,16 +157,16 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', pgoff, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); if (file) { - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_path(m, &file->f_path, ""); } else if (mm) { pid_t tid = vm_is_stack(priv->task, vma, is_pid); if (tid != 0) { - pad_len_spaces(m, len); + seq_pad(m, ' '); /* * Thread stack in /proc/PID/task/TID/maps or * the main process stack. diff --git a/fs/seq_file.c b/fs/seq_file.c index a290157265e..1cd2388ca5b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -766,6 +766,21 @@ int seq_write(struct seq_file *seq, const void *data, size_t len) } EXPORT_SYMBOL(seq_write); +/** + * seq_pad - write padding spaces to buffer + * @m: seq_file identifying the buffer to which data should be written + * @c: the byte to append after padding if non-zero + */ +void seq_pad(struct seq_file *m, char c) +{ + int size = m->pad_until - m->count; + if (size > 0) + seq_printf(m, "%*s", size, ""); + if (c) + seq_putc(m, c); +} +EXPORT_SYMBOL(seq_pad); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 83e2c31e8b0..bc2121fa913 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -473,6 +473,7 @@ #define KERNEL_CTORS() . = ALIGN(8); \ VMLINUX_SYMBOL(__ctors_start) = .; \ *(.ctors) \ + *(.init_array) \ VMLINUX_SYMBOL(__ctors_end) = .; #else #define KERNEL_CTORS() diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h index 98e892ef6d5..a0f9280421e 100644 --- a/include/linux/cmdline-parser.h +++ b/include/linux/cmdline-parser.h @@ -8,6 +8,8 @@ #define CMDLINEPARSEH #include <linux/blkdev.h> +#include <linux/fs.h> +#include <linux/slab.h> /* partition flags */ #define PF_RDONLY 0x01 /* Device is read only */ diff --git a/include/linux/completion.h b/include/linux/completion.h index 22c33e35bcb..5d5aaae3af4 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -19,8 +19,8 @@ * * See also: complete(), wait_for_completion() (and friends _timeout, * _interruptible, _interruptible_timeout, and _killable), init_completion(), - * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and - * INIT_COMPLETION(). + * reinit_completion(), and macros DECLARE_COMPLETION(), + * DECLARE_COMPLETION_ONSTACK(). */ struct completion { unsigned int done; @@ -65,7 +65,7 @@ struct completion { /** * init_completion - Initialize a dynamically allocated completion - * @x: completion structure that is to be initialized + * @x: pointer to completion structure that is to be initialized * * This inline function will initialize a dynamically created completion * structure. @@ -76,6 +76,18 @@ static inline void init_completion(struct completion *x) init_waitqueue_head(&x->wait); } +/** + * reinit_completion - reinitialize a completion structure + * @x: pointer to completion structure that is to be reinitialized + * + * This inline function should be used to reinitialize a completion structure so it can + * be reused. This is especially important after complete_all() is used. + */ +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} + extern void wait_for_completion(struct completion *); extern void wait_for_completion_io(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); @@ -94,14 +106,4 @@ extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); -/** - * INIT_COMPLETION - reinitialize a completion structure - * @x: completion structure to be reinitialized - * - * This macro should be used to reinitialize a completion structure so it can - * be reused. This is especially important after complete_all() is used. - */ -#define INIT_COMPLETION(x) ((x).done = 0) - - #endif diff --git a/include/linux/export.h b/include/linux/export.h index 412cd509eff..3f2793d5189 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -43,7 +43,7 @@ extern struct module __this_module; /* Mark the CRC weak since genksyms apparently decides not to * generate a checksums for some symbols */ #define __CRC_SYMBOL(sym, sec) \ - extern void *__crc_##sym __attribute__((weak)); \ + extern __visible void *__crc_##sym __attribute__((weak)); \ static const unsigned long __kcrctab_##sym \ __used \ __attribute__((section("___kcrctab" sec "+" #sym), unused)) \ @@ -59,7 +59,7 @@ extern struct module __this_module; static const char __kstrtab_##sym[] \ __attribute__((section("__ksymtab_strings"), aligned(1))) \ = VMLINUX_SYMBOL_STR(sym); \ - static const struct kernel_symbol __ksymtab_##sym \ + __visible const struct kernel_symbol __ksymtab_##sym \ __used \ __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ = { (unsigned long)&sym, __kstrtab_##sym } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 3935428c57c..91672e2deec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -54,7 +54,8 @@ enum page_check_address_pmd_flag { extern pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag); + enum page_check_address_pmd_flag flag, + spinlock_t **ptl); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) @@ -129,15 +130,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern int __pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma); +extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl); /* mmap_sem must be held on entry */ -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); if (pmd_trans_huge(*pmd)) - return __pmd_trans_huge_lock(pmd, vma); + return __pmd_trans_huge_lock(pmd, vma, ptl); else return 0; } @@ -215,8 +216,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { return 0; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0393270466c..acd2010328f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -392,6 +392,15 @@ static inline int hugepage_migration_support(struct hstate *h) return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); } +static inline spinlock_t *huge_pte_lockptr(struct hstate *h, + struct mm_struct *mm, pte_t *pte) +{ + if (huge_page_size(h) == PMD_SIZE) + return pmd_lockptr(mm, (pmd_t *) pte); + VM_BUG_ON(huge_page_size(h) == PAGE_SIZE); + return &mm->page_table_lock; +} + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -401,6 +410,7 @@ struct hstate {}; #define hstate_sizelog(s) NULL #define hstate_vma(v) NULL #define hstate_inode(i) NULL +#define page_hstate(page) NULL #define huge_page_size(h) PAGE_SIZE #define huge_page_mask(h) PAGE_MASK #define vma_kernel_pagesize(v) PAGE_SIZE @@ -421,6 +431,22 @@ static inline pgoff_t basepage_index(struct page *page) #define dissolve_free_huge_pages(s, e) do {} while (0) #define pmd_huge_support() 0 #define hugepage_migration_support(h) 0 + +static inline spinlock_t *huge_pte_lockptr(struct hstate *h, + struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} #endif /* CONFIG_HUGETLB_PAGE */ +static inline spinlock_t *huge_pte_lock(struct hstate *h, + struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl; + + ptl = huge_pte_lockptr(h, mm, pte); + spin_lock(ptl); + return ptl; +} + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c9e831dc80b..db43b58a335 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -11,8 +11,6 @@ #include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/irqflags.h> -#include <linux/smp.h> -#include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/kref.h> #include <linux/workqueue.h> @@ -392,15 +390,6 @@ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); -/* This is the worklist that queues up per-cpu softirq work. - * - * send_remote_sendirq() adds work to these lists, and - * the softirq handler itself dequeues from them. The queues - * are protected by disabling local cpu interrupts and they must - * only be accessed by the local cpu that they are for. - */ -DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); - DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) @@ -408,17 +397,6 @@ static inline struct task_struct *this_cpu_ksoftirqd(void) return this_cpu_read(ksoftirqd); } -/* Try to send a softirq to a remote cpu. If this cannot be done, the - * work will be queued to the local cpu. - */ -extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq); - -/* Like send_remote_softirq(), but the caller must disable local cpu interrupts - * and compute the current cpu, passed in as 'this_cpu'. - */ -extern void __send_remote_softirq(struct call_single_data *cp, int cpu, - int this_cpu, int softirq); - /* Tasklets --- multithreaded analogue of BHs. Main feature differing them of generic softirqs: tasklet diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ea319e95b4..a444c790fa7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -22,6 +22,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/types.h> +#include <trace/events/iommu.h> #define IOMMU_READ (1) #define IOMMU_WRITE (2) @@ -227,6 +228,7 @@ static inline int report_iommu_fault(struct iommu_domain *domain, ret = domain->handler(domain, dev, iova, flags, domain->handler_token); + trace_io_page_fault(dev, iova, flags); return ret; } diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 10308c6a3d1..552d51efb42 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -1,7 +1,7 @@ /* * A generic kernel FIFO implementation * - * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> + * Copyright (C) 2013 Stefani Seibold <stefani@seibold.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -67,9 +67,10 @@ struct __kfifo { union { \ struct __kfifo kfifo; \ datatype *type; \ + const datatype *const_type; \ char (*rectype)[recsize]; \ ptrtype *ptr; \ - const ptrtype *ptr_const; \ + ptrtype const *ptr_const; \ } #define __STRUCT_KFIFO(type, size, recsize, ptrtype) \ @@ -386,16 +387,12 @@ __kfifo_int_must_check_helper( \ #define kfifo_put(fifo, val) \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((val) + 1) __val = (val); \ + typeof(*__tmp->const_type) __val = (val); \ unsigned int __ret; \ - const size_t __recsize = sizeof(*__tmp->rectype); \ + size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \ - __dummy = (typeof(__val))NULL; \ - } \ if (__recsize) \ - __ret = __kfifo_in_r(__kfifo, __val, sizeof(*__val), \ + __ret = __kfifo_in_r(__kfifo, &__val, sizeof(__val), \ __recsize); \ else { \ __ret = !kfifo_is_full(__tmp); \ @@ -404,7 +401,7 @@ __kfifo_int_must_check_helper( \ ((typeof(__tmp->type))__kfifo->data) : \ (__tmp->buf) \ )[__kfifo->in & __tmp->kfifo.mask] = \ - *(typeof(__tmp->type))__val; \ + (typeof(*__tmp->type))__val; \ smp_wmb(); \ __kfifo->in++; \ } \ @@ -415,7 +412,7 @@ __kfifo_int_must_check_helper( \ /** * kfifo_get - get data from the fifo * @fifo: address of the fifo to be used - * @val: the var where to store the data to be added + * @val: address where to store the data * * This macro reads the data from the fifo. * It returns 0 if the fifo was empty. Otherwise it returns the number @@ -428,12 +425,10 @@ __kfifo_int_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((val) + 1) __val = (val); \ + typeof(__tmp->ptr) __val = (val); \ unsigned int __ret; \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) \ - __val = (typeof(__tmp->ptr))0; \ if (__recsize) \ __ret = __kfifo_out_r(__kfifo, __val, sizeof(*__val), \ __recsize); \ @@ -456,7 +451,7 @@ __kfifo_uint_must_check_helper( \ /** * kfifo_peek - get data from the fifo without removing * @fifo: address of the fifo to be used - * @val: the var where to store the data to be added + * @val: address where to store the data * * This reads the data from the fifo without removing it from the fifo. * It returns 0 if the fifo was empty. Otherwise it returns the number @@ -469,12 +464,10 @@ __kfifo_uint_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((val) + 1) __val = (val); \ + typeof(__tmp->ptr) __val = (val); \ unsigned int __ret; \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) \ - __val = (typeof(__tmp->ptr))NULL; \ if (__recsize) \ __ret = __kfifo_out_peek_r(__kfifo, __val, sizeof(*__val), \ __recsize); \ @@ -508,14 +501,10 @@ __kfifo_uint_must_check_helper( \ #define kfifo_in(fifo, buf, n) \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((buf) + 1) __buf = (buf); \ + typeof(__tmp->ptr_const) __buf = (buf); \ unsigned long __n = (n); \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \ - __dummy = (typeof(__buf))NULL; \ - } \ (__recsize) ?\ __kfifo_in_r(__kfifo, __buf, __n, __recsize) : \ __kfifo_in(__kfifo, __buf, __n); \ @@ -561,14 +550,10 @@ __kfifo_uint_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((buf) + 1) __buf = (buf); \ + typeof(__tmp->ptr) __buf = (buf); \ unsigned long __n = (n); \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr) __dummy = NULL; \ - __buf = __dummy; \ - } \ (__recsize) ?\ __kfifo_out_r(__kfifo, __buf, __n, __recsize) : \ __kfifo_out(__kfifo, __buf, __n); \ @@ -773,14 +758,10 @@ __kfifo_uint_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((buf) + 1) __buf = (buf); \ + typeof(__tmp->ptr) __buf = (buf); \ unsigned long __n = (n); \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr) __dummy __attribute__ ((unused)) = NULL; \ - __buf = __dummy; \ - } \ (__recsize) ? \ __kfifo_out_peek_r(__kfifo, __buf, __n, __recsize) : \ __kfifo_out_peek(__kfifo, __buf, __n); \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0fbbc7aa02c..9523d2ad753 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -142,7 +142,7 @@ struct kvm; struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; -extern raw_spinlock_t kvm_lock; +extern spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_io_range { @@ -189,8 +189,7 @@ struct kvm_async_pf { gva_t gva; unsigned long addr; struct kvm_arch_async_pf arch; - struct page *page; - bool done; + bool wakeup_all; }; void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); @@ -508,9 +507,10 @@ int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages); void kvm_arch_memslots_updated(struct kvm *kvm); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -671,6 +671,25 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif +#ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA +void kvm_arch_register_noncoherent_dma(struct kvm *kvm); +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm); +#else +static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm) +{ +} + +static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) +{ +} + +static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) +{ + return false; +} +#endif + static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP @@ -747,9 +766,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -/* For vcpu->arch.iommu_flags */ -#define KVM_IOMMU_CACHE_COHERENCY 0x1 - #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); @@ -789,7 +805,7 @@ static inline void kvm_guest_enter(void) /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspase from rcu point of view. In + * is very similar to exiting to userspace from rcu point of view. In * addition CPU may stay in a guest mode for quite a long time (up to * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. @@ -842,13 +858,6 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) return gfn_to_memslot(kvm, gfn)->id; } -static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) -{ - /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ - return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); -} - static inline gfn_t hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) { @@ -1066,6 +1075,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; +extern struct kvm_device_ops kvm_vfio_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/linux/llist.h b/include/linux/llist.h index 8828a78dec9..fbf10a0bc09 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -195,4 +195,6 @@ static inline struct llist_node *llist_del_all(struct llist_head *head) extern struct llist_node *llist_del_first(struct llist_head *head); +struct llist_node *llist_reverse_order(struct llist_node *head); + #endif /* LLIST_H */ diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 13dfd36a329..c8929c3832d 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -15,10 +15,15 @@ */ #include <linux/spinlock.h> +#include <generated/bounds.h> + +#define USE_CMPXCHG_LOCKREF \ + (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \ + IS_ENABLED(CONFIG_SMP) && !BLOATED_SPINLOCKS) struct lockref { union { -#ifdef CONFIG_CMPXCHG_LOCKREF +#if USE_CMPXCHG_LOCKREF aligned_u64 lock_count; #endif struct { diff --git a/include/linux/mm.h b/include/linux/mm.h index 42a35d94b82..0548eb201e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1316,32 +1316,85 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ -#if USE_SPLIT_PTLOCKS -/* - * We tuck a spinlock to guard each pagetable page into its struct page, - * at page->private, with BUILD_BUG_ON to make sure that this will not - * overflow into the next struct page (as it might with DEBUG_SPINLOCK). - * When freeing, reset page->mapping so free_pages_check won't complain. - */ -#define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) -#define pte_lock_deinit(page) ((page)->mapping = NULL) -#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) -#else /* !USE_SPLIT_PTLOCKS */ +#if USE_SPLIT_PTE_PTLOCKS +#if BLOATED_SPINLOCKS +void __init ptlock_cache_init(void); +extern bool ptlock_alloc(struct page *page); +extern void ptlock_free(struct page *page); + +static inline spinlock_t *ptlock_ptr(struct page *page) +{ + return page->ptl; +} +#else /* BLOATED_SPINLOCKS */ +static inline void ptlock_cache_init(void) {} +static inline bool ptlock_alloc(struct page *page) +{ + return true; +} + +static inline void ptlock_free(struct page *page) +{ +} + +static inline spinlock_t *ptlock_ptr(struct page *page) +{ + return &page->ptl; +} +#endif /* BLOATED_SPINLOCKS */ + +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return ptlock_ptr(pmd_page(*pmd)); +} + +static inline bool ptlock_init(struct page *page) +{ + /* + * prep_new_page() initialize page->private (and therefore page->ptl) + * with 0. Make sure nobody took it in use in between. + * + * It can happen if arch try to use slab for page table allocation: + * slab code uses page->slab_cache and page->first_page (for tail + * pages), which share storage with page->ptl. + */ + VM_BUG_ON(*(unsigned long *)&page->ptl); + if (!ptlock_alloc(page)) + return false; + spin_lock_init(ptlock_ptr(page)); + return true; +} + +/* Reset page->mapping so free_pages_check won't complain. */ +static inline void pte_lock_deinit(struct page *page) +{ + page->mapping = NULL; + ptlock_free(page); +} + +#else /* !USE_SPLIT_PTE_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ -#define pte_lock_init(page) do {} while (0) -#define pte_lock_deinit(page) do {} while (0) -#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) -#endif /* USE_SPLIT_PTLOCKS */ +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} +static inline void ptlock_cache_init(void) {} +static inline bool ptlock_init(struct page *page) { return true; } +static inline void pte_lock_deinit(struct page *page) {} +#endif /* USE_SPLIT_PTE_PTLOCKS */ + +static inline void pgtable_init(void) +{ + ptlock_cache_init(); + pgtable_cache_init(); +} -static inline void pgtable_page_ctor(struct page *page) +static inline bool pgtable_page_ctor(struct page *page) { - pte_lock_init(page); inc_zone_page_state(page, NR_PAGETABLE); + return ptlock_init(page); } static inline void pgtable_page_dtor(struct page *page) @@ -1378,6 +1431,52 @@ static inline void pgtable_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) +#if USE_SPLIT_PMD_PTLOCKS + +static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return ptlock_ptr(virt_to_page(pmd)); +} + +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page->pmd_huge_pte = NULL; +#endif + return ptlock_init(page); +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(page->pmd_huge_pte); +#endif + ptlock_free(page); +} + +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) + +#else + +static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} + +static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; } +static inline void pgtable_pmd_page_dtor(struct page *page) {} + +#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) + +#endif + +static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + return ptl; +} + extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a3198e5aaf4..10f5a7272b8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,7 +23,9 @@ struct address_space; -#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) +#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) +#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ + IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) /* * Each physical page in the system has a struct page associated with @@ -63,6 +65,9 @@ struct page { * this page is only used to * free other pages. */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + pgtable_t pmd_huge_pte; /* protected by page->ptl */ +#endif }; union { @@ -141,9 +146,13 @@ struct page { * indicates order in the buddy * system if PG_buddy is set. */ -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS +#if BLOATED_SPINLOCKS + spinlock_t *ptl; +#else spinlock_t ptl; #endif +#endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ }; @@ -309,14 +318,14 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTLOCKS && defined(CONFIG_MMU) +#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) #define SPLIT_RSS_COUNTING /* per-thread cached information, */ struct task_rss_stat { int events; /* for synchronization threshold */ int count[NR_MM_COUNTERS]; }; -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; @@ -339,6 +348,7 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + atomic_long_t nr_ptes; /* Page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ @@ -360,7 +370,6 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; - unsigned long nr_ptes; /* Page table pages */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; @@ -406,7 +415,7 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/include/linux/module.h b/include/linux/module.h index 05f2447f8c1..15cd6b1b211 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -367,9 +367,6 @@ struct module /* What modules do I depend on? */ struct list_head target_list; - /* Who is waiting for us to be unloaded */ - struct task_struct *waiter; - /* Destruction function. */ void (*exit)(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index f7efc860465..6f7ffa46008 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -286,6 +286,14 @@ static inline void lockup_detector_init(void) } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +void reset_hung_task_detector(void); +#else +static inline void reset_hung_task_detector(void) +{ +} +#endif + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 4e32edc8f50..52e0097f61f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -20,6 +20,7 @@ struct seq_file { size_t size; size_t from; size_t count; + size_t pad_until; loff_t index; loff_t read_pos; u64 version; @@ -79,6 +80,20 @@ static inline void seq_commit(struct seq_file *m, int num) } } +/** + * seq_setwidth - set padding width + * @m: the seq_file handle + * @size: the max number of bytes to pad. + * + * Call seq_setwidth() for setting max width, then call seq_printf() etc. and + * finally call seq_pad() to pad the remaining bytes. + */ +static inline void seq_setwidth(struct seq_file *m, size_t size) +{ + m->pad_until = m->count + size; +} +void seq_pad(struct seq_file *m, char c); + char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); diff --git a/include/linux/smp.h b/include/linux/smp.h index 731f5237d5f..5da22ee42e1 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -49,6 +49,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); +void __smp_call_function_single(int cpuid, struct call_single_data *data, + int wait); + #ifdef CONFIG_SMP #include <linux/preempt.h> @@ -95,9 +98,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); -void __smp_call_function_single(int cpuid, struct call_single_data *data, - int wait); - int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); @@ -106,14 +106,10 @@ void kick_all_cpus_sync(void); /* * Generic and arch helpers */ -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt -#else -static inline void call_function_init(void) { } -#endif /* * Mark the boot cpu "online" so that it can call console drivers in @@ -155,12 +151,6 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } -static inline void __smp_call_function_single(int cpuid, - struct call_single_data *data, int wait) -{ - on_each_cpu(data->func, data->info, wait); -} - #endif /* !SMP */ /* diff --git a/include/linux/srcu.h b/include/linux/srcu.h index c114614ed17..9b058eecd40 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -237,4 +237,18 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __srcu_read_unlock(sp, idx); } +/** + * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock + * + * Converts the preceding srcu_read_unlock into a two-way memory barrier. + * + * Call this after srcu_read_unlock, to guarantee that all memory operations + * that occur after smp_mb__after_srcu_read_unlock will appear to happen after + * the preceding srcu_read_unlock. + */ +static inline void smp_mb__after_srcu_read_unlock(void) +{ + /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */ +} + #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 8d4fa82bfb9..c0f75261a72 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -139,7 +139,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte); #else #define make_migration_entry(page, write) swp_entry(0, 0) @@ -151,8 +152,8 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } -static inline void migration_entry_wait_huge(struct mm_struct *mm, - pte_t *pte) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte) { } static inline int is_write_migration_entry(swp_entry_t entry) { return 0; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 36d36cc8932..e4abb84199b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -51,11 +51,11 @@ int virtqueue_add_sgs(struct virtqueue *vq, void *data, gfp_t gfp); -void virtqueue_kick(struct virtqueue *vq); +bool virtqueue_kick(struct virtqueue *vq); bool virtqueue_kick_prepare(struct virtqueue *vq); -void virtqueue_notify(struct virtqueue *vq); +bool virtqueue_notify(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); @@ -73,6 +73,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq); unsigned int virtqueue_get_vring_size(struct virtqueue *vq); +bool virtqueue_is_broken(struct virtqueue *vq); + /** * virtio_device - representation of a device using virtio * @index: unique position on the virtio bus diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 29b9104232b..e8f8f71e843 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -96,33 +96,6 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, return test_bit(fbit, vdev->features); } -/** - * virtio_config_val - look for a feature and get a virtio config entry. - * @vdev: the virtio device - * @fbit: the feature bit - * @offset: the type to search for. - * @v: a pointer to the value to fill in. - * - * The return value is -ENOENT if the feature doesn't exist. Otherwise - * the config value is copied into whatever is pointed to by v. */ -#define virtio_config_val(vdev, fbit, offset, v) \ - virtio_config_buf((vdev), (fbit), (offset), (v), sizeof(*v)) - -#define virtio_config_val_len(vdev, fbit, offset, v, len) \ - virtio_config_buf((vdev), (fbit), (offset), (v), (len)) - -static inline int virtio_config_buf(struct virtio_device *vdev, - unsigned int fbit, - unsigned int offset, - void *buf, unsigned len) -{ - if (!virtio_has_feature(vdev, fbit)) - return -ENOENT; - - vdev->config->get(vdev, offset, buf, len); - return 0; -} - static inline struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *c, const char *n) @@ -162,5 +135,139 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu) return 0; } +/* Config space accessors. */ +#define virtio_cread(vdev, structname, member, ptr) \ + do { \ + /* Must match the member's type, and be integer */ \ + if (!typecheck(typeof((((structname*)0)->member)), *(ptr))) \ + (*ptr) = 1; \ + \ + switch (sizeof(*ptr)) { \ + case 1: \ + *(ptr) = virtio_cread8(vdev, \ + offsetof(structname, member)); \ + break; \ + case 2: \ + *(ptr) = virtio_cread16(vdev, \ + offsetof(structname, member)); \ + break; \ + case 4: \ + *(ptr) = virtio_cread32(vdev, \ + offsetof(structname, member)); \ + break; \ + case 8: \ + *(ptr) = virtio_cread64(vdev, \ + offsetof(structname, member)); \ + break; \ + default: \ + BUG(); \ + } \ + } while(0) + +/* Config space accessors. */ +#define virtio_cwrite(vdev, structname, member, ptr) \ + do { \ + /* Must match the member's type, and be integer */ \ + if (!typecheck(typeof((((structname*)0)->member)), *(ptr))) \ + BUG_ON((*ptr) == 1); \ + \ + switch (sizeof(*ptr)) { \ + case 1: \ + virtio_cwrite8(vdev, \ + offsetof(structname, member), \ + *(ptr)); \ + break; \ + case 2: \ + virtio_cwrite16(vdev, \ + offsetof(structname, member), \ + *(ptr)); \ + break; \ + case 4: \ + virtio_cwrite32(vdev, \ + offsetof(structname, member), \ + *(ptr)); \ + break; \ + case 8: \ + virtio_cwrite64(vdev, \ + offsetof(structname, member), \ + *(ptr)); \ + break; \ + default: \ + BUG(); \ + } \ + } while(0) + +static inline u8 virtio_cread8(struct virtio_device *vdev, unsigned int offset) +{ + u8 ret; + vdev->config->get(vdev, offset, &ret, sizeof(ret)); + return ret; +} + +static inline void virtio_cread_bytes(struct virtio_device *vdev, + unsigned int offset, + void *buf, size_t len) +{ + vdev->config->get(vdev, offset, buf, len); +} + +static inline void virtio_cwrite8(struct virtio_device *vdev, + unsigned int offset, u8 val) +{ + vdev->config->set(vdev, offset, &val, sizeof(val)); +} + +static inline u16 virtio_cread16(struct virtio_device *vdev, + unsigned int offset) +{ + u16 ret; + vdev->config->get(vdev, offset, &ret, sizeof(ret)); + return ret; +} + +static inline void virtio_cwrite16(struct virtio_device *vdev, + unsigned int offset, u16 val) +{ + vdev->config->set(vdev, offset, &val, sizeof(val)); +} + +static inline u32 virtio_cread32(struct virtio_device *vdev, + unsigned int offset) +{ + u32 ret; + vdev->config->get(vdev, offset, &ret, sizeof(ret)); + return ret; +} + +static inline void virtio_cwrite32(struct virtio_device *vdev, + unsigned int offset, u32 val) +{ + vdev->config->set(vdev, offset, &val, sizeof(val)); +} + +static inline u64 virtio_cread64(struct virtio_device *vdev, + unsigned int offset) +{ + u64 ret; + vdev->config->get(vdev, offset, &ret, sizeof(ret)); + return ret; +} + +static inline void virtio_cwrite64(struct virtio_device *vdev, + unsigned int offset, u64 val) +{ + vdev->config->set(vdev, offset, &val, sizeof(val)); +} + +/* Conditional config space accessors. */ +#define virtio_cread_feature(vdev, fbit, structname, member, ptr) \ + ({ \ + int _r = 0; \ + if (!virtio_has_feature(vdev, fbit)) \ + _r = -ENOENT; \ + else \ + virtio_cread((vdev), structname, member, ptr); \ + _r; \ + }) #endif /* _LINUX_VIRTIO_CONFIG_H */ diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index b300787af8e..67e06fe18c0 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -71,7 +71,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, struct virtio_device *vdev, bool weak_barriers, void *pages, - void (*notify)(struct virtqueue *vq), + bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); void vring_del_virtqueue(struct virtqueue *vq); diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h new file mode 100644 index 00000000000..a8f5c32d174 --- /dev/null +++ b/include/trace/events/iommu.h @@ -0,0 +1,162 @@ +/* + * iommu trace points + * + * Copyright (C) 2013 Shuah Khan <shuah.kh@samsung.com> + * + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iommu + +#if !defined(_TRACE_IOMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IOMMU_H + +#include <linux/tracepoint.h> +#include <linux/pci.h> + +struct device; + +DECLARE_EVENT_CLASS(iommu_group_event, + + TP_PROTO(int group_id, struct device *dev), + + TP_ARGS(group_id, dev), + + TP_STRUCT__entry( + __field(int, gid) + __string(device, dev_name(dev)) + ), + + TP_fast_assign( + __entry->gid = group_id; + __assign_str(device, dev_name(dev)); + ), + + TP_printk("IOMMU: groupID=%d device=%s", + __entry->gid, __get_str(device) + ) +); + +DEFINE_EVENT(iommu_group_event, add_device_to_group, + + TP_PROTO(int group_id, struct device *dev), + + TP_ARGS(group_id, dev) + +); + +DEFINE_EVENT(iommu_group_event, remove_device_from_group, + + TP_PROTO(int group_id, struct device *dev), + + TP_ARGS(group_id, dev) +); + +DECLARE_EVENT_CLASS(iommu_device_event, + + TP_PROTO(struct device *dev), + + TP_ARGS(dev), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + ), + + TP_printk("IOMMU: device=%s", __get_str(device) + ) +); + +DEFINE_EVENT(iommu_device_event, attach_device_to_domain, + + TP_PROTO(struct device *dev), + + TP_ARGS(dev) +); + +DEFINE_EVENT(iommu_device_event, detach_device_from_domain, + + TP_PROTO(struct device *dev), + + TP_ARGS(dev) +); + +DECLARE_EVENT_CLASS(iommu_map_unmap, + + TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size), + + TP_ARGS(iova, paddr, size), + + TP_STRUCT__entry( + __field(u64, iova) + __field(u64, paddr) + __field(int, size) + ), + + TP_fast_assign( + __entry->iova = iova; + __entry->paddr = paddr; + __entry->size = size; + ), + + TP_printk("IOMMU: iova=0x%016llx paddr=0x%016llx size=0x%x", + __entry->iova, __entry->paddr, __entry->size + ) +); + +DEFINE_EVENT(iommu_map_unmap, map, + + TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size), + + TP_ARGS(iova, paddr, size) +); + +DEFINE_EVENT_PRINT(iommu_map_unmap, unmap, + + TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size), + + TP_ARGS(iova, paddr, size), + + TP_printk("IOMMU: iova=0x%016llx size=0x%x", + __entry->iova, __entry->size + ) +); + +DECLARE_EVENT_CLASS(iommu_error, + + TP_PROTO(struct device *dev, unsigned long iova, int flags), + + TP_ARGS(dev, iova, flags), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __string(driver, dev_driver_string(dev)) + __field(u64, iova) + __field(int, flags) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __assign_str(driver, dev_driver_string(dev)); + __entry->iova = iova; + __entry->flags = flags; + ), + + TP_printk("IOMMU:%s %s iova=0x%016llx flags=0x%04x", + __get_str(driver), __get_str(device), + __entry->iova, __entry->flags + ) +); + +DEFINE_EVENT(iommu_error, io_page_fault, + + TP_PROTO(struct device *dev, unsigned long iova, int flags), + + TP_ARGS(dev, iova, flags) +); +#endif /* _TRACE_IOMMU_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 7005d1109ec..131a0bda7ae 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -296,23 +296,21 @@ DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, TRACE_EVENT( kvm_async_pf_completed, - TP_PROTO(unsigned long address, struct page *page, u64 gva), - TP_ARGS(address, page, gva), + TP_PROTO(unsigned long address, u64 gva), + TP_ARGS(address, gva), TP_STRUCT__entry( __field(unsigned long, address) - __field(pfn_t, pfn) __field(u64, gva) ), TP_fast_assign( __entry->address = address; - __entry->pfn = page ? page_to_pfn(page) : 0; __entry->gva = gva; ), - TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva, - __entry->address, __entry->pfn) + TP_printk("gva %#llx address %#lx", __entry->gva, + __entry->address) ); #endif diff --git a/include/trace/events/swiotlb.h b/include/trace/events/swiotlb.h new file mode 100644 index 00000000000..7ea4c5e7c44 --- /dev/null +++ b/include/trace/events/swiotlb.h @@ -0,0 +1,46 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM swiotlb + +#if !defined(_TRACE_SWIOTLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SWIOTLB_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(swiotlb_bounced, + + TP_PROTO(struct device *dev, + dma_addr_t dev_addr, + size_t size, + int swiotlb_force), + + TP_ARGS(dev, dev_addr, size, swiotlb_force), + + TP_STRUCT__entry( + __string( dev_name, dev_name(dev) ) + __field( u64, dma_mask ) + __field( dma_addr_t, dev_addr ) + __field( size_t, size ) + __field( int, swiotlb_force ) + ), + + TP_fast_assign( + __assign_str(dev_name, dev_name(dev)); + __entry->dma_mask = (dev->dma_mask ? *dev->dma_mask : 0); + __entry->dev_addr = dev_addr; + __entry->size = size; + __entry->swiotlb_force = swiotlb_force; + ), + + TP_printk("dev_name: %s dma_mask=%llx dev_addr=%llx " + "size=%zu %s", + __get_str(dev_name), + __entry->dma_mask, + (unsigned long long)__entry->dev_addr, + __entry->size, + __entry->swiotlb_force ? "swiotlb_force" : "" ) +); + +#endif /* _TRACE_SWIOTLB_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 99c25338ede..902f1246187 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -518,6 +518,10 @@ struct kvm_ppc_smmu_info { /* machine type bits, to be used as argument to KVM_CREATE_VM */ #define KVM_VM_S390_UCONTROL 1 +/* on ppc, 0 indicate default, 1 should force HV and 2 PR */ +#define KVM_VM_PPC_HV 1 +#define KVM_VM_PPC_PR 2 + #define KVM_S390_SIE_PAGE_OFFSET 1 /* @@ -541,6 +545,7 @@ struct kvm_ppc_smmu_info { #define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 #define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 #define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 +#define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) /* * Extension capability list. @@ -668,6 +673,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IRQ_XICS 92 #define KVM_CAP_ARM_EL1_32BIT 93 #define KVM_CAP_SPAPR_MULTITCE 94 +#define KVM_CAP_EXT_EMUL_CPUID 95 #ifdef KVM_CAP_IRQ_ROUTING @@ -843,6 +849,10 @@ struct kvm_device_attr { #define KVM_DEV_TYPE_FSL_MPIC_20 1 #define KVM_DEV_TYPE_FSL_MPIC_42 2 #define KVM_DEV_TYPE_XICS 3 +#define KVM_DEV_TYPE_VFIO 4 +#define KVM_DEV_VFIO_GROUP 1 +#define KVM_DEV_VFIO_GROUP_ADD 1 +#define KVM_DEV_VFIO_GROUP_DEL 2 /* * ioctls for VM fds @@ -1012,6 +1022,7 @@ struct kvm_s390_ucas_mapping { /* VM is being stopped by host */ #define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad) #define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init) +#define KVM_ARM_PREFERRED_TARGET _IOR(KVMIO, 0xaf, struct kvm_vcpu_init) #define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 2944278a8ba..77c60311a6c 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -71,6 +71,6 @@ #define USBDEVICE_SUPER_MAGIC 0x9fa2 #define MTD_INODE_FS_MAGIC 0x11307854 #define ANON_INODE_FS_MAGIC 0x09041934 - +#define BTRFS_TEST_MAGIC 0x73727279 #endif /* __LINUX_MAGIC_H__ */ diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h index 7000bb1f6e9..42721d13a10 100644 --- a/include/xen/interface/physdev.h +++ b/include/xen/interface/physdev.h @@ -231,6 +231,17 @@ struct physdev_get_free_pirq { #define XEN_PCI_DEV_VIRTFN 0x2 #define XEN_PCI_DEV_PXM 0x4 +#define XEN_PCI_MMCFG_RESERVED 0x1 + +#define PHYSDEVOP_pci_mmcfg_reserved 24 +struct physdev_pci_mmcfg_reserved { + uint64_t address; + uint16_t segment; + uint8_t start_bus; + uint8_t end_bus; + uint32_t flags; +}; + #define PHYSDEVOP_pci_device_add 25 struct physdev_pci_device_add { /* IN */ diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index de8bcc641c4..8b2eb93ae8b 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -1,6 +1,7 @@ #ifndef __LINUX_SWIOTLB_XEN_H #define __LINUX_SWIOTLB_XEN_H +#include <linux/dma-direction.h> #include <linux/swiotlb.h> extern int xen_swiotlb_init(int verbose, bool early); @@ -55,4 +56,6 @@ xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); extern int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask); +extern int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask); #endif /* __LINUX_SWIOTLB_XEN_H */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index d6fe062cad6..fb2ea8f2655 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -19,10 +19,11 @@ void xen_arch_resume(void); int xen_setup_shutdown_event(void); extern unsigned long *xen_contiguous_bitmap; -int xen_create_contiguous_region(unsigned long vstart, unsigned int order, - unsigned int address_bits); +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle); -void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); struct vm_area_struct; int xen_remap_domain_mfn_range(struct vm_area_struct *vma, diff --git a/init/main.c b/init/main.c index 6ad1a533a8c..01573fdfa18 100644 --- a/init/main.c +++ b/init/main.c @@ -131,6 +131,8 @@ char __initdata boot_command_line[COMMAND_LINE_SIZE]; char *saved_command_line; /* Command line for parameter parsing */ static char *static_command_line; +/* Command line for per-initcall parameter parsing */ +static char *initcall_command_line; static char *execute_command; static char *ramdisk_execute_command; @@ -354,6 +356,7 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } static void __init setup_command_line(char *command_line) { saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); + initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1); static_command_line = alloc_bootmem(strlen (command_line)+1); strcpy (saved_command_line, boot_command_line); strcpy (static_command_line, command_line); @@ -473,7 +476,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); percpu_init_late(); - pgtable_cache_init(); + pgtable_init(); vmalloc_init(); } @@ -751,9 +754,9 @@ static void __init do_initcall_level(int level) extern const struct kernel_param __start___param[], __stop___param[]; initcall_t *fn; - strcpy(static_command_line, saved_command_line); + strcpy(initcall_command_line, saved_command_line); parse_args(initcall_level_names[level], - static_command_line, __start___param, + initcall_command_line, __start___param, __stop___param - __start___param, level, level, &repair_env_string); diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 94fabd534b0..2a202a84675 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) + def_bool HIGH_RES_TIMERS diff --git a/kernel/bounds.c b/kernel/bounds.c index e8ca97b5c38..578782ef6ae 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -11,6 +11,7 @@ #include <linux/kbuild.h> #include <linux/page_cgroup.h> #include <linux/log2.h> +#include <linux/spinlock.h> void foo(void) { @@ -21,5 +22,6 @@ void foo(void) #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif + DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); /* End of constants */ } diff --git a/kernel/fork.c b/kernel/fork.c index f6d11fc67f7..728d5be9548 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - mm->nr_ptes = 0; + atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif if (!mm_init(mm, tsk)) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8807061ca00..9328b80eaf1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -207,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +static atomic_t reset_hung_task = ATOMIC_INIT(0); + +void reset_hung_task_detector(void) +{ + atomic_set(&reset_hung_task, 1); +} +EXPORT_SYMBOL_GPL(reset_hung_task_detector); + /* * kthread which checks for tasks stuck in D state */ @@ -220,6 +228,9 @@ static int watchdog(void *dummy) while (schedule_timeout_interruptible(timeout_jiffies(timeout))) timeout = sysctl_hung_task_timeout_secs; + if (atomic_xchg(&reset_hung_task, 0)) + continue; + check_hung_uninterruptible_tasks(timeout); } diff --git a/kernel/module.c b/kernel/module.c index af5ebd21d77..f5a3b1e8ec5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -641,8 +641,6 @@ static int module_unload_init(struct module *mod) /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); - /* Backwards compatibility macros put refcount during init. */ - mod->waiter = current; return 0; } @@ -768,16 +766,9 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - if (flags & O_NONBLOCK) { - struct stopref sref = { mod, flags, forced }; + struct stopref sref = { mod, flags, forced }; - return stop_machine(__try_stop_module, &sref, NULL); - } else { - /* We don't need to stop the machine for this. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - return 0; - } + return stop_machine(__try_stop_module, &sref, NULL); } unsigned long module_refcount(struct module *mod) @@ -810,21 +801,6 @@ EXPORT_SYMBOL(module_refcount); /* This exists whether we can unload or not */ static void free_module(struct module *mod); -static void wait_for_zero_refcount(struct module *mod) -{ - /* Since we might sleep for some time, release the mutex first */ - mutex_unlock(&module_mutex); - for (;;) { - pr_debug("Looking at refcount...\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - if (module_refcount(mod) == 0) - break; - schedule(); - } - current->state = TASK_RUNNING; - mutex_lock(&module_mutex); -} - SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { @@ -839,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; + if (!(flags & O_NONBLOCK)) { + printk(KERN_WARNING + "waiting module removal not supported: please upgrade"); + } + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -856,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Doing init or already dying? */ if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count and wake up - waiter --RR */ + /* FIXME: if (force), slam module count damn the torpedoes */ pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; @@ -873,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } - /* Set this up before setting mod->state */ - mod->waiter = current; - /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; - /* Never wait if forced. */ - if (!forced && module_refcount(mod) != 0) - wait_for_zero_refcount(mod); - mutex_unlock(&module_mutex); /* Final destruction now no one is using it. */ if (mod->exit != NULL) @@ -1002,9 +975,6 @@ void module_put(struct module *module) __this_cpu_inc(module->refptr->decs); trace_module_put(module, _RET_IP_); - /* Maybe they're waiting for us to drop reference? */ - if (unlikely(!module_is_live(module))) - wake_up_process(module->waiter); preempt_enable(); } } @@ -2728,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return 0; } -static void find_module_sections(struct module *mod, struct load_info *info) +static int find_module_sections(struct module *mod, struct load_info *info) { mod->kp = section_objs(info, "__param", sizeof(*mod->kp), &mod->num_kp); @@ -2758,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_CONSTRUCTORS mod->ctors = section_objs(info, ".ctors", sizeof(*mod->ctors), &mod->num_ctors); + if (!mod->ctors) + mod->ctors = section_objs(info, ".init_array", + sizeof(*mod->ctors), &mod->num_ctors); + else if (find_sec(info, ".init_array")) { + /* + * This shouldn't happen with same compiler and binutils + * building all parts of the module. + */ + printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + mod->name); + return -EINVAL; + } #endif #ifdef CONFIG_TRACEPOINTS @@ -2795,6 +2777,8 @@ static void find_module_sections(struct module *mod, struct load_info *info) info->debug = section_objs(info, "__verbose", sizeof(*info->debug), &info->num_debug); + + return 0; } static int move_module(struct module *mod, struct load_info *info) @@ -3248,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now we've got everything in the final locations, we can * find optional sections. */ - find_module_sections(mod, info); + err = find_module_sections(mod, info); + if (err) + goto free_unload; err = check_module_license_and_versions(mod); if (err) diff --git a/kernel/smp.c b/kernel/smp.c index 46116100f0e..bd9f9402883 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -15,7 +15,6 @@ #include "smpboot.h" -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, CSD_FLAG_WAIT = 0x02, @@ -140,8 +139,7 @@ static void csd_unlock(struct call_single_data *csd) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static -void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -464,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); -#endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; diff --git a/kernel/softirq.c b/kernel/softirq.c index b2498835345..11025ccc06d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,8 +6,6 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - * - * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/export.h> @@ -627,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, } EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -/* - * Remote softirq bits - */ - -DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); -EXPORT_PER_CPU_SYMBOL(softirq_work_list); - -static void __local_trigger(struct call_single_data *cp, int softirq) -{ - struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); - - list_add_tail(&cp->list, head); - - /* Trigger the softirq only if the list was previously empty. */ - if (head->next == &cp->list) - raise_softirq_irqoff(softirq); -} - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static void remote_softirq_receive(void *data) -{ - struct call_single_data *cp = data; - unsigned long flags; - int softirq; - - softirq = *(int *)cp->info; - local_irq_save(flags); - __local_trigger(cp, softirq); - local_irq_restore(flags); -} - -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - if (cpu_online(cpu)) { - cp->func = remote_softirq_receive; - cp->info = &softirq; - cp->flags = 0; - - __smp_call_function_single(cpu, cp, 0); - return 0; - } - return 1; -} -#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - return 1; -} -#endif - -/** - * __send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @this_cpu: the currently executing cpu - * @softirq: the softirq for the work - * - * Attempt to schedule softirq work on a remote cpu. If this cannot be - * done, the work is instead queued up on the local cpu. - * - * Interrupts must be disabled. - */ -void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) -{ - if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) - __local_trigger(cp, softirq); -} -EXPORT_SYMBOL(__send_remote_softirq); - -/** - * send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @softirq: the softirq for the work - * - * Like __send_remote_softirq except that disabling interrupts and - * computing the current cpu is done for the caller. - */ -void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - unsigned long flags; - int this_cpu; - - local_irq_save(flags); - this_cpu = smp_processor_id(); - __send_remote_softirq(cp, cpu, this_cpu, softirq); - local_irq_restore(flags); -} -EXPORT_SYMBOL(send_remote_softirq); - -static int remote_softirq_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - int i; - - local_irq_disable(); - for (i = 0; i < NR_SOFTIRQS; i++) { - struct list_head *head = &per_cpu(softirq_work_list[i], cpu); - struct list_head *local_head; - - if (list_empty(head)) - continue; - - local_head = &__get_cpu_var(softirq_work_list[i]); - list_splice_init(head, local_head); - raise_softirq_irqoff(i); - } - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block remote_softirq_cpu_notifier = { - .notifier_call = remote_softirq_cpu_notify, -}; - void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { - int i; - per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; - for (i = 0; i < NR_SOFTIRQS; i++) - INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } - register_hotcpu_notifier(&remote_softirq_cpu_notifier); - open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --git a/kernel/up.c b/kernel/up.c index 630d72bf7e4..509403e3fbc 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +void __smp_call_function_single(int cpu, struct call_single_data *csd, + int wait) +{ + unsigned long flags; + + local_irq_save(flags); + csd->func(csd->info); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__smp_call_function_single); + int on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/lib/Kconfig b/lib/Kconfig index 75485e163ca..06dc74200a5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,13 +51,6 @@ config PERCPU_RWSEM config ARCH_USE_CMPXCHG_LOCKREF bool -config CMPXCHG_LOCKREF - def_bool y if ARCH_USE_CMPXCHG_LOCKREF - depends on SMP - depends on !GENERIC_LOCKBREAK - depends on !DEBUG_SPINLOCK - depends on !DEBUG_LOCK_ALLOC - config CRC_CCITT tristate "CRC-CCITT functions" help diff --git a/lib/kfifo.c b/lib/kfifo.c index 7b7f83027b7..d79b9d22206 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -215,7 +215,7 @@ static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, * incrementing the fifo->in index counter */ smp_wmb(); - *copied = len - ret; + *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } @@ -275,7 +275,7 @@ static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, * incrementing the fifo->out index counter */ smp_wmb(); - *copied = len - ret; + *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } diff --git a/lib/llist.c b/lib/llist.c index 4a70d120138..f76196d0740 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -81,3 +81,25 @@ struct llist_node *llist_del_first(struct llist_head *head) return entry; } EXPORT_SYMBOL_GPL(llist_del_first); + +/** + * llist_reverse_order - reverse order of a llist chain + * @head: first item of the list to be reversed + * + * Reverse the order of a chain of llist entries and return the + * new first entry. + */ +struct llist_node *llist_reverse_order(struct llist_node *head) +{ + struct llist_node *new_head = NULL; + + while (head) { + struct llist_node *tmp = head; + head = head->next; + tmp->next = new_head; + new_head = tmp; + } + + return new_head; +} +EXPORT_SYMBOL_GPL(llist_reverse_order); diff --git a/lib/lockref.c b/lib/lockref.c index af6e95d0bed..d2b123f8456 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -1,7 +1,7 @@ #include <linux/export.h> #include <linux/lockref.h> -#ifdef CONFIG_CMPXCHG_LOCKREF +#if USE_CMPXCHG_LOCKREF /* * Allow weakly-ordered memory architectures to provide barrier-less diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 4e8686c7e5a..e4399fa65ad 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -38,6 +38,9 @@ #include <linux/bootmem.h> #include <linux/iommu-helper.h> +#define CREATE_TRACE_POINTS +#include <trace/events/swiotlb.h> + #define OFFSET(val,align) ((unsigned long) \ ( (val) & ( (align) - 1))) @@ -502,6 +505,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); + dev_warn(hwdev, "swiotlb buffer is full\n"); return SWIOTLB_MAP_ERROR; found: spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -726,6 +730,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size) && !swiotlb_force) return dev_addr; + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + /* Oh well, have to allocate and map a bounce buffer. */ map = map_single(dev, phys, size, dir); if (map == SWIOTLB_MAP_ERROR) { diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 48586ac3a62..10909c57149 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1712,18 +1712,16 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; case FORMAT_TYPE_NRCHARS: { - u8 qualifier = spec.qualifier; + /* + * Since %n poses a greater security risk than + * utility, ignore %n and skip its argument. + */ + void *skip_arg; - if (qualifier == 'l') { - long *ip = va_arg(args, long *); - *ip = (str - buf); - } else if (_tolower(qualifier) == 'z') { - size_t *ip = va_arg(args, size_t *); - *ip = (str - buf); - } else { - int *ip = va_arg(args, int *); - *ip = (str - buf); - } + WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", + old_fmt); + + skip_arg = va_arg(args, void *); break; } diff --git a/mm/Kconfig b/mm/Kconfig index 3f4ffda152b..de31af25620 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -218,9 +218,11 @@ config SPLIT_PTLOCK_CPUS int default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 - default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + boolean + # # support for memory balloon compaction config BALLOON_COMPACTION diff --git a/mm/filemap.c b/mm/filemap.c index ae4846ff484..b7749a92021 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1090,7 +1090,6 @@ static void shrink_readahead_size_eio(struct file *filp, * @filp: the file to read * @ppos: current file position * @desc: read_descriptor - * @actor: read method * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1099,7 +1098,7 @@ static void shrink_readahead_size_eio(struct file *filp, * of the logic when it comes to error handling etc. */ static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc, read_actor_t actor) + read_descriptor_t *desc) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1200,13 +1199,14 @@ page_ok: * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * - * The actor routine returns how many bytes were actually used.. + * The file_read_actor routine returns how many bytes were + * actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); + ret = file_read_actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; @@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (desc.count == 0) continue; desc.error = 0; - do_generic_file_read(filp, ppos, &desc, file_read_actor); + do_generic_file_read(filp, ppos, &desc); retval += desc.written; if (desc.error) { retval = retval ?: desc.error; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0556c6a4495..bccd5a628ea 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -710,6 +710,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct page *page) { pgtable_t pgtable; + spinlock_t *ptl; VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); @@ -724,9 +725,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); @@ -738,8 +739,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm->nr_ptes++; - spin_unlock(&mm->page_table_lock); + atomic_long_inc(&mm->nr_ptes); + spin_unlock(ptl); } return 0; @@ -759,6 +760,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, HPAGE_PMD_ORDER, vma, haddr, nd); } +/* Caller must hold page table lock. */ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -771,7 +773,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); return true; } @@ -790,6 +792,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; @@ -802,10 +805,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); @@ -838,6 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { + spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -848,8 +852,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - spin_lock(&dst_mm->page_table_lock); - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -858,7 +863,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } /* - * mm->page_table_lock is enough to be sure that huge zero pmd is not + * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table. */ @@ -879,8 +884,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, } if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ @@ -896,12 +901,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - dst_mm->nr_ptes++; + atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); out: return ret; } @@ -912,10 +917,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm, pmd_t *pmd, pmd_t orig_pmd, int dirty) { + spinlock_t *ptl; pmd_t entry; unsigned long haddr; - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto unlock; @@ -925,13 +931,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm, update_mmu_cache_pmd(vma, address, pmd); unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; struct page *page; @@ -958,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_page; @@ -985,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); @@ -995,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, out: return ret; out_free_page: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_page(page); put_page(page); @@ -1009,6 +1016,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1055,7 +1063,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); @@ -1081,7 +1089,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1092,7 +1100,7 @@ out: return ret; out_free_pages: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1107,17 +1115,19 @@ out_free_pages: int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { + spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + ptl = pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; @@ -1133,7 +1143,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) @@ -1180,11 +1190,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (page) put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); put_page(new_page); goto out_mn; @@ -1206,13 +1216,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return ret; } @@ -1224,7 +1234,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; @@ -1271,6 +1281,7 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + spinlock_t *ptl; struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; @@ -1280,7 +1291,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool migrated = false; int flags = 0; - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; @@ -1318,7 +1329,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * relock and check_same as the page may no longer be mapped. * As the fault is being retried, do not account for it. */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1326,13 +1337,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Page is misplaced, serialise migrations and parallel THP splits */ get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!page_locked) lock_page(page); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); @@ -1344,7 +1355,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (migrated) { @@ -1361,7 +1372,7 @@ clear_pmdnuma: update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out: if (anon_vma) @@ -1376,9 +1387,10 @@ out: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { struct page *page; pgtable_t pgtable; pmd_t orig_pmd; @@ -1392,8 +1404,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); put_huge_zero_page(); } else { page = pmd_page(orig_pmd); @@ -1401,8 +1413,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); tlb_remove_page(tlb, page); } pte_free(tlb->mm, pgtable); @@ -1415,14 +1427,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1435,6 +1448,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { + spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1455,12 +1469,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - ret = __pmd_trans_huge_lock(old_pmd, vma); + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); if (ret == 1) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - spin_unlock(&mm->page_table_lock); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); } out: return ret; @@ -1476,9 +1499,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; ret = 1; if (!prot_numa) { @@ -1507,7 +1531,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (ret == HPAGE_PMD_NR) set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); } return ret; @@ -1520,12 +1544,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { - spin_lock(&vma->vm_mm->page_table_lock); + *ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1534,27 +1559,37 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return 1; } } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); return 0; } +/* + * This function returns whether a given @page is mapped onto the @address + * in the virtual space of @mm. + * + * When it's true, this function returns *pmd with holding the page table lock + * and passing it back to the caller via @ptl. + * If it's false, returns NULL without holding the page table lock. + */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag) + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) { - pmd_t *pmd, *ret = NULL; + pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) - goto out; + return NULL; pmd = mm_find_pmd(mm, address); if (!pmd) - goto out; + return NULL; + *ptl = pmd_lock(mm, pmd); if (pmd_none(*pmd)) - goto out; + goto unlock; if (pmd_page(*pmd) != page) - goto out; + goto unlock; /* * split_vma() may create temporary aliased mappings. There is * no risk as long as all huge pmd are found and have their @@ -1564,14 +1599,15 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto out; + goto unlock; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - ret = pmd; + return pmd; } -out: - return ret; +unlock: + spin_unlock(*ptl); + return NULL; } static int __split_huge_page_splitting(struct page *page, @@ -1579,6 +1615,7 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; /* For mmu_notifiers */ @@ -1586,9 +1623,8 @@ static int __split_huge_page_splitting(struct page *page, const unsigned long mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order @@ -1599,8 +1635,8 @@ static int __split_huge_page_splitting(struct page *page, */ pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; @@ -1731,14 +1767,14 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -1793,8 +1829,8 @@ static int __split_huge_page_map(struct page *page, pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } @@ -2346,7 +2382,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte_t *pte; pgtable_t pgtable; struct page *new_page; - spinlock_t *ptl; + spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2389,12 +2425,12 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - ptl = pte_lockptr(mm, pmd); + pte_ptl = pte_lockptr(mm, pmd); mmun_start = address; mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); /* probably unnecessary */ + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow @@ -2402,16 +2438,16 @@ static void collapse_huge_page(struct mm_struct *mm, * to avoid the risk of CPU bugs in that area. */ _pmd = pmdp_clear_flush(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(ptl); + spin_unlock(pte_ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing @@ -2419,7 +2455,7 @@ static void collapse_huge_page(struct mm_struct *mm, * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2430,7 +2466,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -2445,13 +2481,13 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); *hpage = NULL; @@ -2780,6 +2816,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { + spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -2792,22 +2829,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_end = haddr + HPAGE_PMD_SIZE; again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0b7656e804d..7d57af21f49 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2376,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; @@ -2387,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (dst_pte == src_pte) continue; - spin_lock(&dst->page_table_lock); - spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); @@ -2398,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); } return 0; @@ -2442,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address; pte_t *ptep; pte_t pte; + spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -2455,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: - spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) - continue; + goto unlock; pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) - continue; + goto unlock; /* * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { huge_pte_clear(mm, address, ptep); - continue; + goto unlock; } page = pte_page(pte); @@ -2484,7 +2487,7 @@ again: */ if (ref_page) { if (page != ref_page) - continue; + goto unlock; /* * Mark the VMA as having unmapped its page so that @@ -2501,13 +2504,18 @@ again: page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (force_flush) { + spin_unlock(ptl); break; + } /* Bail out after unmapping reference page if supplied */ - if (ref_page) + if (ref_page) { + spin_unlock(ptl); break; + } +unlock: + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * mmu_gather ran out of room to batch pages, we break out of * the PTE lock to avoid doing the potential expensive TLB invalidate @@ -2613,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page) + struct page *pagecache_page, spinlock_t *ptl) { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; @@ -2647,8 +2655,8 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page_table_lock as buddy allocator may be called */ - spin_unlock(&mm->page_table_lock); + /* Drop page table lock as buddy allocator may be called */ + spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -2666,13 +2674,13 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(huge_pte_none(pte)); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* - * race occurs while re-acquiring page_table_lock, and - * our job is done. + * race occurs while re-acquiring page table + * lock, and our job is done. */ return 0; } @@ -2680,7 +2688,7 @@ retry_avoidcopy: } /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (err == -ENOMEM) return VM_FAULT_OOM; else @@ -2695,7 +2703,7 @@ retry_avoidcopy: page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return VM_FAULT_OOM; } @@ -2707,10 +2715,10 @@ retry_avoidcopy: mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* - * Retake the page_table_lock to check for racing updates + * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -2724,13 +2732,13 @@ retry_avoidcopy: /* Make the old page be freed below */ new_page = old_page; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return 0; } @@ -2778,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; struct address_space *mapping; pte_t new_pte; + spinlock_t *ptl; /* * Currently, we are forced to kill the process in the event the @@ -2864,7 +2873,8 @@ retry: goto backout_unlocked; } - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -2885,16 +2895,16 @@ retry: if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(page); out: return ret; backout: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); backout_unlocked: unlock_page(page); put_page(page); @@ -2906,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + spinlock_t *ptl; int ret; struct page *page = NULL; struct page *pagecache_page = NULL; @@ -2918,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(mm, ptep); + migration_entry_wait_huge(vma, mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -2974,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (page != pagecache_page) lock_page(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_page_table_lock; + goto out_ptl; if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page); - goto out_page_table_lock; + pagecache_page, ptl); + goto out_ptl; } entry = huge_pte_mkdirty(entry); } @@ -2993,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); -out_page_table_lock: - spin_unlock(&mm->page_table_lock); +out_ptl: + spin_unlock(ptl); if (pagecache_page) { unlock_page(pagecache_page); @@ -3020,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + spinlock_t *ptl = NULL; int absent; struct page *page; @@ -3030,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (pte) + ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); /* @@ -3043,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (absent && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); remainder = 0; break; } @@ -3062,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !huge_pte_write(huge_ptep_get(pte)))) { int ret; - spin_unlock(&mm->page_table_lock); + if (pte) + spin_unlock(ptl); ret = hugetlb_fault(mm, vma, vaddr, (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; @@ -3096,8 +3114,8 @@ same_page: */ goto same_page; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); *nr_pages = remainder; *position = vaddr; @@ -3118,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { + spinlock_t *ptl; ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { pages++; + spin_unlock(ptl); continue; } if (!huge_pte_none(huge_ptep_get(ptep))) { @@ -3134,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, set_huge_pte_at(mm, address, ptep, pte); pages++; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: @@ -3298,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) unsigned long saddr; pte_t *spte = NULL; pte_t *pte; + spinlock_t *ptl; if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); @@ -3320,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); + spin_lock(ptl); if (pud_none(*pud)) pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); @@ -3340,7 +3362,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with vma->vm_mm->page_table_lock held. + * called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3cd40b2d5d..f1a0ae6e11b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6605,10 +6605,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -6797,9 +6797,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -6816,7 +6816,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f9d78ec7831..b7c171602ba 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1269,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags) mf_cpu = &get_cpu_var(memory_failure_cpu); spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, &entry)) + if (kfifo_put(&mf_cpu->fifo, entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", diff --git a/mm/memory.c b/mm/memory.c index bf8665849a5..0409e8f43fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - tlb->mm->nr_ptes--; + atomic_long_dec(&tlb->mm->nr_ptes); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -550,6 +550,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address) { + spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); int wait_split_huge_page; if (!new) @@ -570,15 +571,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) wait_split_huge_page = 1; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (new) pte_free(mm, new); if (wait_split_huge_page) @@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, split_huge_page_pmd(vma, address, pmd); goto split_fallthrough; } - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(vma, address, pmd, flags); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; goto out; } } else - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* fall through */ } split_fallthrough: @@ -4269,3 +4270,28 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS +static struct kmem_cache *page_ptl_cachep; +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + +bool ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (!ptl) + return false; + page->ptl = ptl; + return true; +} + +void ptlock_free(struct page *page) +{ + kfree(page->ptl); +} +#endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4cc19f6ab6c..c4403cdf343 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, #ifdef CONFIG_HUGETLB_PAGE int nid; struct page *page; + spinlock_t *ptl; - spin_lock(&vma->vm_mm->page_table_lock); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); page = pte_page(huge_ptep_get((pte_t *)pmd)); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) isolate_huge_page(page, private); unlock: - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); #else BUG(); #endif diff --git a/mm/migrate.c b/mm/migrate.c index dfc8300ecbb..316e720a202 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, addr); if (!ptep) goto out; - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); } else { pmd = mm_find_pmd(mm, addr); if (!pmd) @@ -249,9 +249,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte) { - spinlock_t *ptl = &(mm)->page_table_lock; + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); __migration_entry_wait(mm, pte, ptl); } @@ -1666,6 +1667,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long address, struct page *page, int node) { + spinlock_t *ptl; unsigned long haddr = address & HPAGE_PMD_MASK; pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; @@ -1705,9 +1707,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -1752,7 +1754,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * before it's fully transferred to the new page. */ mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(new_page); unlock_page(page); diff --git a/mm/mmap.c b/mm/mmap.c index 5a6baddde15..834b2d785f1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2724,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(atomic_long_read(&mm->nr_ptes) > + (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6738c47f1f7..1e4a600a616 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes + + points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task->mm->nr_ptes, + atomic_long_read(&task->mm->nr_ptes), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3929a40bd6c..cbb38545d9d 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -151,14 +151,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -170,14 +170,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, struct page, lru); list_del(&pgtable->lru); } diff --git a/mm/rmap.c b/mm/rmap.c index fd3ee7a54a1..55c8b8dc9ff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (unlikely(PageHuge(page))) { pte = huge_pte_offset(mm, address); - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); goto check; } @@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int referenced = 0; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; - spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG); - if (!pmd) { - spin_unlock(&mm->page_table_lock); + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (!pmd) goto out; - } if (vma->vm_flags & VM_LOCKED) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; goto out; @@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } else { pte_t *pte; - spinlock_t *ptl; /* * rmap might return false positives; we must filter diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 990afab2be1..9c5a1aa34d1 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -544,9 +544,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->inuse = false; if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) { - vdev->config->get(vdev, - offsetof(struct virtio_9p_config, tag_len), - &tag_len, sizeof(tag_len)); + virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len); } else { err = -EINVAL; goto out_free_vq; @@ -556,8 +554,9 @@ static int p9_virtio_probe(struct virtio_device *vdev) err = -ENOMEM; goto out_free_vq; } - vdev->config->get(vdev, offsetof(struct virtio_9p_config, tag), - tag, tag_len); + + virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), + tag, tag_len); chan->tag = tag; chan->tag_len = tag_len; err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ec9a9ef4ce5..5afeb5aa4c7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2523,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); - int len; if (fa->fa_type == RTN_BROADCAST || fa->fa_type == RTN_MULTICAST) continue; + seq_setwidth(seq, 127); + if (fi) seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u%n", + "%d\t%08X\t%d\t%u\t%u", fi->fib_dev ? fi->fib_dev->name : "*", prefix, fi->fib_nh->nh_gw, flags, 0, 0, @@ -2541,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, - fi->fib_rtt >> 3, &len); + fi->fib_rtt >> 3); else seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u%n", + "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, - mask, 0, 0, 0, &len); + mask, 0, 0, 0); - seq_printf(seq, "%*s\n", 127 - len, ""); + seq_pad(seq, '\n'); } } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 9afbdb19f4a..cbc85f660d5 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1076,7 +1076,7 @@ void ping_seq_stop(struct seq_file *seq, void *v) EXPORT_SYMBOL_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, - int bucket, int *len) + int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; @@ -1085,7 +1085,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), @@ -1093,23 +1093,22 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops), len); + atomic_read(&sp->sk_drops)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) { + seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-127s\n", - " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; - int len; - ping_v4_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len, ""); + ping_v4_format_sock(v, seq, state->bucket); } + seq_pad(seq, '\n'); return 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 14bba8a1c5a..59a6f8b90cd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2541,13 +2541,13 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct sock *sk, const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid, int *len) + struct seq_file *f, int i, kuid_t uid) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", i, ireq->ir_loc_addr, ntohs(inet_sk(sk)->inet_sport), @@ -2562,11 +2562,10 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, 0, /* non standard timer */ 0, /* open_requests have no inode */ atomic_read(&sk->sk_refcnt), - req, - len); + req); } -static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) +static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) { int timer_active; unsigned long timer_expires; @@ -2605,7 +2604,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n", + "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, rx_queue, @@ -2622,12 +2621,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) tp->snd_cwnd, sk->sk_state == TCP_LISTEN ? (fastopenq ? fastopenq->max_qlen : 0) : - (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh), - len); + (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); } static void get_timewait4_sock(const struct inet_timewait_sock *tw, - struct seq_file *f, int i, int *len) + struct seq_file *f, int i) { __be32 dest, src; __u16 destp, srcp; @@ -2639,10 +2637,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, - atomic_read(&tw->tw_refcnt), tw, len); + atomic_read(&tw->tw_refcnt), tw); } #define TMPSZ 150 @@ -2651,11 +2649,10 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; - int len; + seq_setwidth(seq, TMPSZ - 1); if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-*s\n", TMPSZ - 1, - " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode"); goto out; @@ -2666,16 +2663,16 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_ESTABLISHED: if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num, &len); + get_timewait4_sock(v, seq, st->num); else - get_tcp4_sock(v, seq, st->num, &len); + get_tcp4_sock(v, seq, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); + get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid); break; } - seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); out: + seq_pad(seq, '\n'); return 0; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 89909dd730d..de86e5bc446 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2331,7 +2331,7 @@ EXPORT_SYMBOL(udp_proc_unregister); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, - int bucket, int *len) + int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; @@ -2340,7 +2340,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), @@ -2348,23 +2348,22 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops), len); + atomic_read(&sp->sk_drops)); } int udp4_seq_show(struct seq_file *seq, void *v) { + seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-127s\n", - " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; - int len; - udp4_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len, ""); + udp4_format_sock(v, seq, state->bucket); } + seq_pad(seq, '\n'); return 0; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 77e38f73349..008214a3d5e 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -595,26 +595,25 @@ static void pn_sock_seq_stop(struct seq_file *seq, void *v) static int pn_sock_seq_show(struct seq_file *seq, void *v) { - int len; - + seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_printf(seq, "%s%n", "pt loc rem rs st tx_queue rx_queue " - " uid inode ref pointer drops", &len); + seq_puts(seq, "pt loc rem rs st tx_queue rx_queue " + " uid inode ref pointer drops"); else { struct sock *sk = v; struct pn_sock *pn = pn_sk(sk); seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu " - "%d %pK %d%n", + "%d %pK %d", sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), atomic_read(&sk->sk_refcnt), sk, - atomic_read(&sk->sk_drops), &len); + atomic_read(&sk->sk_drops)); } - seq_printf(seq, "%*s\n", 127 - len, ""); + seq_pad(seq, '\n'); return 0; } @@ -785,20 +784,19 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v) static int pn_res_seq_show(struct seq_file *seq, void *v) { - int len; - + seq_setwidth(seq, 63); if (v == SEQ_START_TOKEN) - seq_printf(seq, "%s%n", "rs uid inode", &len); + seq_puts(seq, "rs uid inode"); else { struct sock **psk = v; struct sock *sk = *psk; - seq_printf(seq, "%02X %5u %lu%n", + seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), - sock_i_ino(sk), &len); + sock_i_ino(sk)); } - seq_printf(seq, "%*s\n", 63 - len, ""); + seq_pad(seq, '\n'); return 0; } diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index 5ea573b3764..647396baa56 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -79,12 +79,13 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = { */ static int sctp_objcnt_seq_show(struct seq_file *seq, void *v) { - int i, len; + int i; i = (int)*(loff_t *)v; - seq_printf(seq, "%s: %d%n", sctp_dbg_objcnt[i].label, - atomic_read(sctp_dbg_objcnt[i].counter), &len); - seq_printf(seq, "%*s\n", 127 - len, ""); + seq_setwidth(seq, 127); + seq_printf(seq, "%s: %d", sctp_dbg_objcnt[i].label, + atomic_read(sctp_dbg_objcnt[i].counter)); + seq_pad(seq, '\n'); return 0; } diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c index cfe40addda7..2fca916d9ed 100644 --- a/samples/kfifo/bytestream-example.c +++ b/samples/kfifo/bytestream-example.c @@ -64,7 +64,7 @@ static int __init testfunc(void) /* put values into the fifo */ for (i = 0; i != 10; i++) - kfifo_put(&test, &i); + kfifo_put(&test, i); /* show the number of used elements */ printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); @@ -85,7 +85,7 @@ static int __init testfunc(void) kfifo_skip(&test); /* put values into the fifo until is full */ - for (i = 20; kfifo_put(&test, &i); i++) + for (i = 20; kfifo_put(&test, i); i++) ; printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c index 06473791c08..aa243db93f0 100644 --- a/samples/kfifo/dma-example.c +++ b/samples/kfifo/dma-example.c @@ -39,7 +39,7 @@ static int __init example_init(void) kfifo_in(&fifo, "test", 4); for (i = 0; i != 9; i++) - kfifo_put(&fifo, &i); + kfifo_put(&fifo, i); /* kick away first byte */ kfifo_skip(&fifo); diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c index 6f8e79e76c9..8dc3c2e7105 100644 --- a/samples/kfifo/inttype-example.c +++ b/samples/kfifo/inttype-example.c @@ -61,7 +61,7 @@ static int __init testfunc(void) /* put values into the fifo */ for (i = 0; i != 10; i++) - kfifo_put(&test, &i); + kfifo_put(&test, i); /* show the number of used elements */ printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); @@ -78,7 +78,7 @@ static int __init testfunc(void) kfifo_skip(&test); /* put values into the fifo until is full */ - for (i = 20; kfifo_put(&test, &i); i++) + for (i = 20; kfifo_put(&test, i); i++) ; printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 8dcdca27d83..69f0a1417e9 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -79,9 +79,11 @@ modpost = scripts/mod/modpost \ $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S) \ $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w) +MODPOST_OPT=$(subst -i,-n,$(filter -i,$(MAKEFLAGS))) + # We can go over command line length here, so be careful. quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules - cmd_modpost = $(MODLISTCMD) | sed 's/\.ko$$/.o/' | $(modpost) -s -T - + cmd_modpost = $(MODLISTCMD) | sed 's/\.ko$$/.o/' | $(modpost) $(MODPOST_OPT) -s -T - PHONY += __modpost __modpost: $(modules:.ko=.o) FORCE diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index bfcea5d3b27..17855761e6b 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -17,6 +17,7 @@ #include <string.h> #include <limits.h> #include <stdbool.h> +#include <errno.h> #include "modpost.h" #include "../../include/generated/autoconf.h" #include "../../include/linux/license.h" @@ -37,6 +38,8 @@ static int warn_unresolved = 0; /* How a symbol is exported */ static int sec_mismatch_count = 0; static int sec_mismatch_verbose = 1; +/* ignore missing files */ +static int ignore_missing_files; enum export { export_plain, export_unused, export_gpl, @@ -161,7 +164,7 @@ struct symbol { unsigned int vmlinux:1; /* 1 if symbol is defined in vmlinux */ unsigned int kernel:1; /* 1 if symbol is from kernel * (only for external modules) **/ - unsigned int preloaded:1; /* 1 if symbol from Module.symvers */ + unsigned int preloaded:1; /* 1 if symbol from Module.symvers, or crc */ enum export export; /* Type of export */ char name[0]; }; @@ -329,8 +332,11 @@ static void sym_update_crc(const char *name, struct module *mod, { struct symbol *s = find_symbol(name); - if (!s) + if (!s) { s = new_symbol(name, mod, export); + /* Don't complain when we find it later. */ + s->preloaded = 1; + } s->crc = crc; s->crc_valid = 1; } @@ -407,6 +413,11 @@ static int parse_elf(struct elf_info *info, const char *filename) hdr = grab_file(filename, &info->size); if (!hdr) { + if (ignore_missing_files) { + fprintf(stderr, "%s: %s (ignored)\n", filename, + strerror(errno)); + return 0; + } perror(filename); exit(1); } @@ -1852,7 +1863,7 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "\n"); buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n"); buf_printf(b, "\n"); - buf_printf(b, "struct module __this_module\n"); + buf_printf(b, "__visible struct module __this_module\n"); buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n"); buf_printf(b, "\t.name = KBUILD_MODNAME,\n"); if (mod->has_init) @@ -2118,7 +2129,7 @@ int main(int argc, char **argv) struct ext_sym_list *extsym_iter; struct ext_sym_list *extsym_start = NULL; - while ((opt = getopt(argc, argv, "i:I:e:msST:o:awM:K:")) != -1) { + while ((opt = getopt(argc, argv, "i:I:e:mnsST:o:awM:K:")) != -1) { switch (opt) { case 'i': kernel_read = optarg; @@ -2138,6 +2149,9 @@ int main(int argc, char **argv) case 'm': modversions = 1; break; + case 'n': + ignore_missing_files = 1; + break; case 'o': dump_write = optarg; break; diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 9d93f02c628..5e1c7bc73b2 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -184,11 +184,7 @@ static void snd_malloc_dev_iram(struct snd_dma_buffer *dmab, size_t size) /* Assign the pool into private_data field */ dmab->private_data = pool; - dmab->area = (void *)gen_pool_alloc(pool, size); - if (!dmab->area) - return; - - dmab->addr = gen_pool_virt_to_phys(pool, (unsigned long)dmab->area); + dmab->area = gen_pool_dma_alloc(pool, size, &dmab->addr); } /** diff --git a/sound/firewire/dice.c b/sound/firewire/dice.c index 6feee661419..57bcd31fcc1 100644 --- a/sound/firewire/dice.c +++ b/sound/firewire/dice.c @@ -543,7 +543,7 @@ static int dice_change_rate(struct dice *dice, unsigned int clock_rate) __be32 value; int err; - INIT_COMPLETION(dice->clock_accepted); + reinit_completion(&dice->clock_accepted); value = cpu_to_be32(clock_rate | CLOCK_SOURCE_ARX1); err = snd_fw_transaction(dice->unit, TCODE_WRITE_QUADLET_REQUEST, diff --git a/sound/soc/samsung/ac97.c b/sound/soc/samsung/ac97.c index 2acf987844e..350ba23a989 100644 --- a/sound/soc/samsung/ac97.c +++ b/sound/soc/samsung/ac97.c @@ -74,7 +74,7 @@ static void s3c_ac97_activate(struct snd_ac97 *ac97) if (stat == S3C_AC97_GLBSTAT_MAINSTATE_ACTIVE) return; /* Return if already active */ - INIT_COMPLETION(s3c_ac97.done); + reinit_completion(&s3c_ac97.done); ac_glbctrl = readl(s3c_ac97.regs + S3C_AC97_GLBCTRL); ac_glbctrl = S3C_AC97_GLBCTRL_ACLINKON; @@ -103,7 +103,7 @@ static unsigned short s3c_ac97_read(struct snd_ac97 *ac97, s3c_ac97_activate(ac97); - INIT_COMPLETION(s3c_ac97.done); + reinit_completion(&s3c_ac97.done); ac_codec_cmd = readl(s3c_ac97.regs + S3C_AC97_CODEC_CMD); ac_codec_cmd = S3C_AC97_CODEC_CMD_READ | AC_CMD_ADDR(reg); @@ -140,7 +140,7 @@ static void s3c_ac97_write(struct snd_ac97 *ac97, unsigned short reg, s3c_ac97_activate(ac97); - INIT_COMPLETION(s3c_ac97.done); + reinit_completion(&s3c_ac97.done); ac_codec_cmd = readl(s3c_ac97.regs + S3C_AC97_CODEC_CMD); ac_codec_cmd = AC_CMD_ADDR(reg) | AC_CMD_DATA(val); diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index da7a1955828..bdb71a26ae3 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -41,13 +41,14 @@ struct vdev_info { struct vhost_memory *mem; }; -void vq_notify(struct virtqueue *vq) +bool vq_notify(struct virtqueue *vq) { struct vq_info *info = vq->priv; unsigned long long v = 1; int r; r = write(info->kick, &v, sizeof v); assert(r == sizeof v); + return true; } void vq_callback(struct virtqueue *vq) @@ -171,7 +172,8 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, GFP_ATOMIC); if (likely(r == 0)) { ++started; - virtqueue_kick(vq->vq); + if (unlikely(!virtqueue_kick(vq->vq)) + r = -1; } } else r = -1; diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c index d053ea40c00..14a4f4cab5b 100644 --- a/tools/virtio/vringh_test.c +++ b/tools/virtio/vringh_test.c @@ -22,7 +22,7 @@ static u64 user_addr_offset; #define RINGSIZE 256 #define ALIGN 4096 -static void never_notify_host(struct virtqueue *vq) +static bool never_notify_host(struct virtqueue *vq) { abort(); } @@ -65,17 +65,22 @@ struct guest_virtio_device { unsigned long notifies; }; -static void parallel_notify_host(struct virtqueue *vq) +static bool parallel_notify_host(struct virtqueue *vq) { + int rc; struct guest_virtio_device *gvdev; gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev); - write(gvdev->to_host_fd, "", 1); + rc = write(gvdev->to_host_fd, "", 1); + if (rc < 0) + return false; gvdev->notifies++; + return true; } -static void no_notify_host(struct virtqueue *vq) +static bool no_notify_host(struct virtqueue *vq) { + return true; } #define NUM_XFERS (10000000) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 779262f59e2..fbe1a48bd62 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -27,3 +27,6 @@ config HAVE_KVM_MSI config HAVE_KVM_CPU_RELAX_INTERCEPT bool + +config KVM_VFIO + bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 8a39dda7a32..8631d9c1432 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -56,7 +56,6 @@ void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) static void async_pf_execute(struct work_struct *work) { - struct page *page = NULL; struct kvm_async_pf *apf = container_of(work, struct kvm_async_pf, work); struct mm_struct *mm = apf->mm; @@ -68,14 +67,12 @@ static void async_pf_execute(struct work_struct *work) use_mm(mm); down_read(&mm->mmap_sem); - get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); + get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL); up_read(&mm->mmap_sem); unuse_mm(mm); spin_lock(&vcpu->async_pf.lock); list_add_tail(&apf->link, &vcpu->async_pf.done); - apf->page = page; - apf->done = true; spin_unlock(&vcpu->async_pf.lock); /* @@ -83,7 +80,7 @@ static void async_pf_execute(struct work_struct *work) * this point */ - trace_kvm_async_pf_completed(addr, page, gva); + trace_kvm_async_pf_completed(addr, gva); if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); @@ -99,9 +96,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) struct kvm_async_pf *work = list_entry(vcpu->async_pf.queue.next, typeof(*work), queue); - cancel_work_sync(&work->work); list_del(&work->queue); - if (!work->done) { /* work was canceled */ + if (cancel_work_sync(&work->work)) { mmdrop(work->mm); kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ kmem_cache_free(async_pf_cache, work); @@ -114,8 +110,6 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) list_entry(vcpu->async_pf.done.next, typeof(*work), link); list_del(&work->link); - if (!is_error_page(work->page)) - kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } spin_unlock(&vcpu->async_pf.lock); @@ -135,14 +129,11 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) list_del(&work->link); spin_unlock(&vcpu->async_pf.lock); - if (work->page) - kvm_arch_async_page_ready(vcpu, work); + kvm_arch_async_page_ready(vcpu, work); kvm_arch_async_page_present(vcpu, work); list_del(&work->queue); vcpu->async_pf.queued--; - if (!is_error_page(work->page)) - kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } } @@ -165,8 +156,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, if (!work) return 0; - work->page = NULL; - work->done = false; + work->wakeup_all = false; work->vcpu = vcpu; work->gva = gva; work->addr = gfn_to_hva(vcpu->kvm, gfn); @@ -206,7 +196,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) if (!work) return -ENOMEM; - work->page = KVM_ERR_PTR_BAD_PAGE; + work->wakeup_all = true; INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 72a130bc448..0df7d4b34df 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -79,7 +79,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) flags = IOMMU_READ; if (!(slot->flags & KVM_MEM_READONLY)) flags |= IOMMU_WRITE; - if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) + if (!kvm->arch.iommu_noncoherent) flags |= IOMMU_CACHE; @@ -103,6 +103,10 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) while ((gfn << PAGE_SHIFT) & (page_size - 1)) page_size >>= 1; + /* Make sure hva is aligned to the page size we want to map */ + while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) + page_size >>= 1; + /* * Pin all pages we are about to map in memory. This is * important because we unmap and unpin in 4kb steps later. @@ -140,6 +144,9 @@ static int kvm_iommu_map_memslots(struct kvm *kvm) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + if (kvm->arch.iommu_noncoherent) + kvm_arch_register_noncoherent_dma(kvm); + idx = srcu_read_lock(&kvm->srcu); slots = kvm_memslots(kvm); @@ -158,7 +165,8 @@ int kvm_assign_device(struct kvm *kvm, { struct pci_dev *pdev = NULL; struct iommu_domain *domain = kvm->arch.iommu_domain; - int r, last_flags; + int r; + bool noncoherent; /* check if iommu exists and in use */ if (!domain) @@ -174,15 +182,13 @@ int kvm_assign_device(struct kvm *kvm, return r; } - last_flags = kvm->arch.iommu_flags; - if (iommu_domain_has_cap(kvm->arch.iommu_domain, - IOMMU_CAP_CACHE_COHERENCY)) - kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; + noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain, + IOMMU_CAP_CACHE_COHERENCY); /* Check if need to update IOMMU page table for guest memory */ - if ((last_flags ^ kvm->arch.iommu_flags) == - KVM_IOMMU_CACHE_COHERENCY) { + if (noncoherent != kvm->arch.iommu_noncoherent) { kvm_iommu_unmap_memslots(kvm); + kvm->arch.iommu_noncoherent = noncoherent; r = kvm_iommu_map_memslots(kvm); if (r) goto out_unmap; @@ -190,11 +196,7 @@ int kvm_assign_device(struct kvm *kvm, pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; - printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", - assigned_dev->host_segnr, - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); + dev_info(&pdev->dev, "kvm assign device\n"); return 0; out_unmap: @@ -220,11 +222,7 @@ int kvm_deassign_device(struct kvm *kvm, pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; - printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", - assigned_dev->host_segnr, - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); + dev_info(&pdev->dev, "kvm deassign device\n"); return 0; } @@ -336,6 +334,9 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); + if (kvm->arch.iommu_noncoherent) + kvm_arch_unregister_noncoherent_dma(kvm); + return 0; } @@ -350,6 +351,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm) mutex_lock(&kvm->slots_lock); kvm_iommu_unmap_memslots(kvm); kvm->arch.iommu_domain = NULL; + kvm->arch.iommu_noncoherent = false; mutex_unlock(&kvm->slots_lock); iommu_domain_free(domain); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cf9ccb0101..662f34c3287 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -70,7 +70,8 @@ MODULE_LICENSE("GPL"); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_RAW_SPINLOCK(kvm_lock); +DEFINE_SPINLOCK(kvm_lock); +static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; @@ -186,6 +187,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } +EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_reload_remote_mmus(struct kvm *kvm) { @@ -490,9 +492,9 @@ static struct kvm *kvm_create_vm(unsigned long type) if (r) goto out_err; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return kvm; @@ -540,13 +542,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) /* * Free any memory in @free but not in @dont. */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, +static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); - kvm_arch_free_memslot(free, dont); + kvm_arch_free_memslot(kvm, free, dont); free->npages = 0; } @@ -557,7 +559,7 @@ void kvm_free_physmem(struct kvm *kvm) struct kvm_memory_slot *memslot; kvm_for_each_memslot(memslot, slots) - kvm_free_physmem_slot(memslot, NULL); + kvm_free_physmem_slot(kvm, memslot, NULL); kfree(kvm->memslots); } @@ -581,9 +583,9 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_del(&kvm->vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) kvm_io_bus_destroy(kvm->buses[i]); @@ -821,7 +823,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (change == KVM_MR_CREATE) { new.userspace_addr = mem->userspace_addr; - if (kvm_arch_create_memslot(&new, npages)) + if (kvm_arch_create_memslot(kvm, &new, npages)) goto out_free; } @@ -872,6 +874,19 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } + /* actual memory is freed via old in kvm_free_physmem_slot below */ + if (change == KVM_MR_DELETE) { + new.dirty_bitmap = NULL; + memset(&new.arch, 0, sizeof(new.arch)); + } + + old_memslots = install_new_memslots(kvm, slots, &new); + + kvm_arch_commit_memory_region(kvm, mem, &old, change); + + kvm_free_physmem_slot(kvm, &old, &new); + kfree(old_memslots); + /* * IOMMU mapping: New slots need to be mapped. Old slots need to be * un-mapped and re-mapped if their base changes. Since base change @@ -883,29 +898,15 @@ int __kvm_set_memory_region(struct kvm *kvm, */ if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { r = kvm_iommu_map_pages(kvm, &new); - if (r) - goto out_slots; - } - - /* actual memory is freed via old in kvm_free_physmem_slot below */ - if (change == KVM_MR_DELETE) { - new.dirty_bitmap = NULL; - memset(&new.arch, 0, sizeof(new.arch)); + return r; } - old_memslots = install_new_memslots(kvm, slots, &new); - - kvm_arch_commit_memory_region(kvm, mem, &old, change); - - kvm_free_physmem_slot(&old, &new); - kfree(old_memslots); - return 0; out_slots: kfree(slots); out_free: - kvm_free_physmem_slot(&new, &old); + kvm_free_physmem_slot(kvm, &new, &old); out: return r; } @@ -964,6 +965,7 @@ int kvm_get_dirty_log(struct kvm *kvm, out: return r; } +EXPORT_SYMBOL_GPL(kvm_get_dirty_log); bool kvm_largepages_enabled(void) { @@ -1654,6 +1656,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } +EXPORT_SYMBOL_GPL(mark_page_dirty); /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. @@ -1679,6 +1682,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); } +EXPORT_SYMBOL_GPL(kvm_vcpu_block); #ifndef CONFIG_S390 /* @@ -2271,6 +2275,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, ops = &kvm_xics_ops; break; #endif +#ifdef CONFIG_KVM_VFIO + case KVM_DEV_TYPE_VFIO: + ops = &kvm_vfio_ops; + break; +#endif default: return -ENODEV; } @@ -2519,44 +2528,12 @@ out: } #endif -static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page[1]; - unsigned long addr; - int npages; - gfn_t gfn = vmf->pgoff; - struct kvm *kvm = vma->vm_file->private_data; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return VM_FAULT_SIGBUS; - - npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, - NULL); - if (unlikely(npages != 1)) - return VM_FAULT_SIGBUS; - - vmf->page = page[0]; - return 0; -} - -static const struct vm_operations_struct kvm_vm_vm_ops = { - .fault = kvm_vm_fault, -}; - -static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_vm_vm_ops; - return 0; -} - static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = kvm_vm_compat_ioctl, #endif - .mmap = kvm_vm_mmap, .llseek = noop_llseek, }; @@ -2683,11 +2660,12 @@ static void hardware_enable_nolock(void *junk) } } -static void hardware_enable(void *junk) +static void hardware_enable(void) { - raw_spin_lock(&kvm_lock); - hardware_enable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_enable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_nolock(void *junk) @@ -2700,11 +2678,12 @@ static void hardware_disable_nolock(void *junk) kvm_arch_hardware_disable(NULL); } -static void hardware_disable(void *junk) +static void hardware_disable(void) { - raw_spin_lock(&kvm_lock); - hardware_disable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_disable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_all_nolock(void) @@ -2718,16 +2697,16 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); } static int hardware_enable_all(void) { int r = 0; - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); kvm_usage_count++; if (kvm_usage_count == 1) { @@ -2740,7 +2719,7 @@ static int hardware_enable_all(void) } } - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); return r; } @@ -2750,20 +2729,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, { int cpu = (long)v; - if (!kvm_usage_count) - return NOTIFY_OK; - val &= ~CPU_TASKS_FROZEN; switch (val) { case CPU_DYING: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); - hardware_disable(NULL); + hardware_disable(); break; case CPU_STARTING: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - hardware_enable(NULL); + hardware_enable(); break; } return NOTIFY_OK; @@ -3056,10 +3032,10 @@ static int vm_stat_get(void *_offset, u64 *val) struct kvm *kvm; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) *val += *(u32 *)((void *)kvm + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3073,12 +3049,12 @@ static int vcpu_stat_get(void *_offset, u64 *val) int i; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u32 *)((void *)vcpu + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3133,7 +3109,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_lock)); + WARN_ON(raw_spin_is_locked(&kvm_count_lock)); hardware_enable_nolock(NULL); } } diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c new file mode 100644 index 00000000000..ca4260e3503 --- /dev/null +++ b/virt/kvm/vfio.c @@ -0,0 +1,264 @@ +/* + * VFIO-KVM bridge pseudo device + * + * Copyright (C) 2013 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> + +struct kvm_vfio_group { + struct list_head node; + struct vfio_group *vfio_group; +}; + +struct kvm_vfio { + struct list_head group_list; + struct mutex lock; + bool noncoherent; +}; + +static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) +{ + struct vfio_group *vfio_group; + struct vfio_group *(*fn)(struct file *); + + fn = symbol_get(vfio_group_get_external_user); + if (!fn) + return ERR_PTR(-EINVAL); + + vfio_group = fn(filep); + + symbol_put(vfio_group_get_external_user); + + return vfio_group; +} + +static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) +{ + void (*fn)(struct vfio_group *); + + fn = symbol_get(vfio_group_put_external_user); + if (!fn) + return; + + fn(vfio_group); + + symbol_put(vfio_group_put_external_user); +} + +/* + * Groups can use the same or different IOMMU domains. If the same then + * adding a new group may change the coherency of groups we've previously + * been told about. We don't want to care about any of that so we retest + * each group and bail as soon as we find one that's noncoherent. This + * means we only ever [un]register_noncoherent_dma once for the whole device. + */ +static void kvm_vfio_update_coherency(struct kvm_device *dev) +{ + struct kvm_vfio *kv = dev->private; + bool noncoherent = false; + struct kvm_vfio_group *kvg; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + /* + * TODO: We need an interface to check the coherency of + * the IOMMU domain this group is using. For now, assume + * it's always noncoherent. + */ + noncoherent = true; + break; + } + + if (noncoherent != kv->noncoherent) { + kv->noncoherent = noncoherent; + + if (kv->noncoherent) + kvm_arch_register_noncoherent_dma(dev->kvm); + else + kvm_arch_unregister_noncoherent_dma(dev->kvm); + } + + mutex_unlock(&kv->lock); +} + +static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) +{ + struct kvm_vfio *kv = dev->private; + struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + void __user *argp = (void __user *)arg; + struct fd f; + int32_t fd; + int ret; + + switch (attr) { + case KVM_DEV_VFIO_GROUP_ADD: + if (get_user(fd, (int32_t __user *)argp)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group == vfio_group) { + mutex_unlock(&kv->lock); + kvm_vfio_group_put_external_user(vfio_group); + return -EEXIST; + } + } + + kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); + if (!kvg) { + mutex_unlock(&kv->lock); + kvm_vfio_group_put_external_user(vfio_group); + return -ENOMEM; + } + + list_add_tail(&kvg->node, &kv->group_list); + kvg->vfio_group = vfio_group; + + mutex_unlock(&kv->lock); + + kvm_vfio_update_coherency(dev); + + return 0; + + case KVM_DEV_VFIO_GROUP_DEL: + if (get_user(fd, (int32_t __user *)argp)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + list_del(&kvg->node); + kvm_vfio_group_put_external_user(kvg->vfio_group); + kfree(kvg); + ret = 0; + break; + } + + mutex_unlock(&kv->lock); + + kvm_vfio_group_put_external_user(vfio_group); + + kvm_vfio_update_coherency(dev); + + return ret; + } + + return -ENXIO; +} + +static int kvm_vfio_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_VFIO_GROUP: + return kvm_vfio_set_group(dev, attr->attr, attr->addr); + } + + return -ENXIO; +} + +static int kvm_vfio_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_VFIO_GROUP: + switch (attr->attr) { + case KVM_DEV_VFIO_GROUP_ADD: + case KVM_DEV_VFIO_GROUP_DEL: + return 0; + } + + break; + } + + return -ENXIO; +} + +static void kvm_vfio_destroy(struct kvm_device *dev) +{ + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_group *kvg, *tmp; + + list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { + kvm_vfio_group_put_external_user(kvg->vfio_group); + list_del(&kvg->node); + kfree(kvg); + } + + kvm_vfio_update_coherency(dev); + + kfree(kv); + kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ +} + +static int kvm_vfio_create(struct kvm_device *dev, u32 type) +{ + struct kvm_device *tmp; + struct kvm_vfio *kv; + + /* Only one VFIO "device" per VM */ + list_for_each_entry(tmp, &dev->kvm->devices, vm_node) + if (tmp->ops == &kvm_vfio_ops) + return -EBUSY; + + kv = kzalloc(sizeof(*kv), GFP_KERNEL); + if (!kv) + return -ENOMEM; + + INIT_LIST_HEAD(&kv->group_list); + mutex_init(&kv->lock); + + dev->private = kv; + + return 0; +} + +struct kvm_device_ops kvm_vfio_ops = { + .name = "kvm-vfio", + .create = kvm_vfio_create, + .destroy = kvm_vfio_destroy, + .set_attr = kvm_vfio_set_attr, + .has_attr = kvm_vfio_has_attr, +}; |