From 151766fce8bee0e3e6076c8b829f9fcc0a2412ae Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Sat, 28 Apr 2012 14:30:40 +0800 Subject: Revert "x86/platform: Add a wallclock_init func to x86_platforms ops" This reverts commit cf8ff6b6ab0e99dd3058852f4ec76a6140abadec. Just found this commit is a functional duplicate of commit 6b617e22 "x86/platform: Add a wallclock_init func to x86_init.timers ops". Let's revert it and sorry for the noise. Signed-off-by: Feng Tang Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Jacob Pan Cc: Alan Cox Cc: Dirk Brandewie Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 2 -- arch/x86/kernel/setup.c | 2 -- arch/x86/kernel/x86_init.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c090af10ac7..42d2ae18dab 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -156,7 +156,6 @@ struct x86_cpuinit_ops { /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC - * @wallclock_init: init the wallclock device * @get_wallclock: get time from HW clock like RTC etc. 
* @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic @@ -167,7 +166,6 @@ struct x86_cpuinit_ops { */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); - void (*wallclock_init)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 366c688d619..58a07b10812 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1027,8 +1027,6 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); - x86_platform.wallclock_init(); - mcheck_init(); arch_init_ideal_nops(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 35c5e543f55..9f3167e891e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -29,7 +29,6 @@ void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } -void wallclock_init_noop(void) { } /* * The platform setup functions are preset with the default functions @@ -101,7 +100,6 @@ static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, - .wallclock_init = wallclock_init_noop, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, -- cgit v1.2.3-70-g09d2 From f841d792e38f75f5e25b0b66f7b5d235d180a735 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 30 Mar 2012 23:11:35 +0800 Subject: x86: Return IRQ_SET_MASK_OK_NOCOPY from irq affinity functions The interrupt chip irq_set_affinity() functions copy the affinity mask to irq_data->affinity but return 0, i.e. IRQ_SET_MASK_OK. IRQ_SET_MASK_OK causes the core code to do another redundant copy. Return IRQ_SET_MASK_OK_NOCOPY to avoid this. 
Signed-off-by: Jiang Liu Cc: Suresh Siddha Cc: Yinghai Lu Cc: Naga Chumbalkar Cc: Jacob Pan Cc: Cliff Wickman Cc: Jiang Liu Cc: Keping Chen Link: http://lkml.kernel.org/r/1333120296-13563-4-git-send-email-jiang.liu@huawei.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 9 +++++---- arch/x86/platform/uv/uv_irq.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a9..bce2001b264 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2270,6 +2270,7 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, data->chip_data); + ret = IRQ_SET_MASK_OK_NOCOPY; } raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; @@ -3092,7 +3093,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) __write_msi_msg(data->msi_desc, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } #endif /* CONFIG_SMP */ @@ -3214,7 +3215,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, dmar_msi_write(irq, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } #endif /* CONFIG_SMP */ @@ -3267,7 +3268,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, hpet_msi_write(data->handler_data, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } #endif /* CONFIG_SMP */ @@ -3340,7 +3341,7 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return -1; target_ht_irq(data->irq, dest, cfg->vector); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } #endif diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index f25c2765a5c..a22c41656b5 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -222,7 +222,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, 
if (cfg->move_in_progress) send_cleanup_vector(cfg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } /* -- cgit v1.2.3-70-g09d2 From c1a7b32a14138f908df52d7c53b5ce3415ec6b50 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 20 May 2012 13:15:07 +0900 Subject: KVM: Avoid wasting pages for small lpage_info arrays lpage_info is created for each large level even when the memory slot is not for RAM. This means that when we add one slot for a PCI device, we end up allocating at least KVM_NR_PAGE_SIZES - 1 pages by vmalloc(). To make things worse, there is an increasing number of devices which would result in more pages being wasted this way. This patch mitigates this problem by using kvm_kvzalloc(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 3 +++ virt/kvm/kvm_main.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa..f12a52408cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6304,7 +6304,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - vfree(free->arch.lpage_info[i]); + kvm_kvfree(free->arch.lpage_info[i]); free->arch.lpage_info[i] = NULL; } } @@ -6323,7 +6323,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) slot->base_gfn, level) + 1; slot->arch.lpage_info[i] = - vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); if (!slot->arch.lpage_info[i]) goto out_free; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c4464356b35..19b83f6efa4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -535,6 +535,9 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); +void 
*kvm_kvzalloc(unsigned long size); +void kvm_kvfree(const void *addr); + #ifndef __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1148c96a481..02cb440f802 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -520,7 +520,7 @@ out_err_nodisable: * Avoid using vmalloc for a small buffer. * Should not be used when the size is statically known. */ -static void *kvm_kvzalloc(unsigned long size) +void *kvm_kvzalloc(unsigned long size) { if (size > PAGE_SIZE) return vzalloc(size); @@ -528,7 +528,7 @@ static void *kvm_kvzalloc(unsigned long size) return kzalloc(size, GFP_KERNEL); } -static void kvm_kvfree(const void *addr) +void kvm_kvfree(const void *addr) { if (is_vmalloc_addr(addr)) vfree(addr); -- cgit v1.2.3-70-g09d2 From aaf07bc291c828189ae5221b370905a89bbb859d Mon Sep 17 00:00:00 2001 From: Xudong Hao Date: Mon, 28 May 2012 19:33:34 +0800 Subject: KVM: VMX: Add EPT A/D bits definitions Signed-off-by: Haitao Shan Signed-off-by: Xudong Hao Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 31f180c21ce..de007c27273 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -404,6 +404,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -415,11 +416,14 @@ enum vmcs_field { #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 #define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) #define VMX_EPT_DEFAULT_MT 0x6ull #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define 
VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul -- cgit v1.2.3-70-g09d2 From 83c3a3312235220476d3c207f67bd17be6e17ff9 Mon Sep 17 00:00:00 2001 From: Xudong Hao Date: Mon, 28 May 2012 19:33:35 +0800 Subject: KVM: VMX: Add parameter to control A/D bits support, default is on Add kernel parameter to control A/D bits support, it's on by default. Signed-off-by: Haitao Shan Signed-off-by: Xudong Hao Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32eb5886629..18590e003bd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,6 +71,9 @@ static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + static bool __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); @@ -789,6 +792,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; } +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -2645,8 +2653,12 @@ static __init int hardware_setup(void) !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; } + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; -- cgit v1.2.3-70-g09d2 From b38f99347871d7fc49e6367395dce0d757f6ab8d Mon Sep 17 00:00:00 2001 From: Xudong Hao Date: Mon, 28 May 2012 19:33:36 
+0800 Subject: KVM: VMX: Enable EPT A/D bits if supported by turning on relevant bit in EPTP In EPT page structure entry, Enable EPT A/D bits if processor supported. Signed-off-by: Haitao Shan Signed-off-by: Xudong Hao Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 18590e003bd..d392e5427ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3039,6 +3039,8 @@ static u64 construct_eptp(unsigned long root_hpa) /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + if (enable_ept_ad_bits) + eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); return eptp; -- cgit v1.2.3-70-g09d2 From 3f6d8c8a478dd1ab2a4944b0d65474df06ecd882 Mon Sep 17 00:00:00 2001 From: Xudong Hao Date: Tue, 22 May 2012 11:23:15 +0800 Subject: KVM: VMX: Use EPT Access bit in response to memory notifiers Signed-off-by: Haitao Shan Signed-off-by: Xudong Hao Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 14 ++++++++------ arch/x86/kvm/vmx.c | 6 ++++-- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be3cea4407f..d07e436b7a4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1242,7 +1242,8 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int young = 0; /* - * Emulate the accessed bit for EPT, by checking if this page has + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has * an EPT mapping, and clearing it if it does. On the next access, * a new EPT mapping will be established. 
* This has some overhead, but not as much as the cost of swapping @@ -1253,11 +1254,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); } } @@ -1281,9 +1283,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; break; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d392e5427ca..396148ab089 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7289,8 +7289,10 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else -- cgit v1.2.3-70-g09d2 From 1952639665e92481c34c34c3e2a71bf3e66ba362 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 4 Jun 2012 14:53:23 +0300 Subject: KVM: MMU: do not iterate over all VMs in mmu_shrink() mmu_shrink() needlessly iterates over all VMs even though it will not attempt to free mmu pages from more than one of them. Fix that and also check used mmu pages count outside of VM lock to skip inactive VMs faster. 
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d07e436b7a4..1ca7164a74f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3944,7 +3944,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; - struct kvm *kvm_freed = NULL; int nr_to_scan = sc->nr_to_scan; if (nr_to_scan == 0) @@ -3956,22 +3955,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) int idx; LIST_HEAD(invalid_list); + /* + * n_used_mmu_pages is accessed without holding kvm->mmu_lock + * here. We may skip a VM instance errorneosly, but we do not + * want to shrink a VM that only started to populate its MMU + * anyway. + */ + if (kvm->arch.n_used_mmu_pages > 0) { + if (!nr_to_scan--) + break; + continue; + } + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); - kvm_freed = kvm; - } - nr_to_scan--; + kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + + list_move_tail(&kvm->vm_list, &vm_list); + break; } - if (kvm_freed) - list_move_tail(&kvm_freed->vm_list, &vm_list); raw_spin_unlock(&kvm_lock); -- cgit v1.2.3-70-g09d2 From b246dd5df139501b974bd6b28f7815e53b3a792f Mon Sep 17 00:00:00 2001 From: Orit Wasserman Date: Thu, 31 May 2012 14:49:22 +0300 Subject: KVM: VMX: Fix KVM_SET_SREGS with big real mode segments For example, migration between Westmere and Nehalem hosts, caught in big real mode. The code that fixes the segments for real mode guest was moved from enter_rmode to vmx_set_segments. 
enter_rmode calls vmx_set_segments for each segment. Signed-off-by: Orit Wasserman Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 70 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 396148ab089..f78662ec867 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -618,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2782,6 +2786,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_segment var; if (enable_unrestricted_guest) return; @@ -2825,20 +2830,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) goto continue_rmode; - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + vmx_get_segment(vcpu, &var, VCPU_SREG_SS); + vmx_set_segment(vcpu, &var, VCPU_SREG_SS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_CS); + vmx_set_segment(vcpu, &var, VCPU_SREG_CS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_ES); + vmx_set_segment(vcpu, &var, VCPU_SREG_ES); - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + vmx_get_segment(vcpu, &var, VCPU_SREG_DS); + vmx_set_segment(vcpu, &var, VCPU_SREG_DS); - 
fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_get_segment(vcpu, &var, VCPU_SREG_GS); + vmx_set_segment(vcpu, &var, VCPU_SREG_GS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_FS); + vmx_set_segment(vcpu, &var, VCPU_SREG_FS); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -3243,6 +3251,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, ar); __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + + /* + * Fix segments for real mode guest in hosts that don't have + * "unrestricted_mode" or it was disabled. + * This is done to allow migration of the guests from hosts with + * unrestricted guest like Westmere to older host that don't have + * unrestricted guest like Nehelem. + */ + if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { + switch (seg) { + case VCPU_SREG_CS: + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_readl(GUEST_CS_BASE) >> 4); + break; + case VCPU_SREG_ES: + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + break; + case VCPU_SREG_DS: + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + break; + case VCPU_SREG_GS: + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + break; + case VCPU_SREG_FS: + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + break; + case VCPU_SREG_SS: + vmcs_write16(GUEST_SS_SELECTOR, + vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + break; + } + } } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -- cgit v1.2.3-70-g09d2 From 3b6f70fd7dd4e19fc674ec99e389bf0da5589525 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 29 May 2012 15:16:01 +0800 Subject: x86-smp-remove-call-to-ipi_call_lock-ipi_call_unlock 
ipi_call_lock/unlock() lock resp. unlock call_function.lock. This lock protects only the call_function data structure itself, but it's completely unrelated to cpu_online_mask. The mask to which the IPIs are sent is calculated before call_function.lock is taken in smp_call_function_many(), so the locking around set_cpu_online() is pointless and can be removed. [ tglx: Massaged changelog ] Signed-off-by: Yong Zhang Cc: ralf@linux-mips.org Cc: sshtylyov@mvista.com Cc: david.daney@cavium.com Cc: nikunj@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: axboe@kernel.dk Cc: peterz@infradead.org Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1338275765-3217-7-git-send-email-yong.zhang0@gmail.com Acked-by: Srivatsa S. Bhat Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 9 --------- arch/x86/xen/smp.c | 2 -- 2 files changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f56f96da77f..b2fd28ff84b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -255,22 +255,13 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); /* - * We need to hold call_lock, so there is no inconsistency - * between the time smp_call_function() determines number of - * IPI recipients, and the time when the determination is made - * for which cpus receive the IPI. Holding this - * lock helps us to not include this cpu in a currently in progress - * smp_call_function(). - * * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. 
*/ - ipi_call_lock(); lock_vector_lock(); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); - ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index afb250d22a6..f58dca7a6e5 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void) notify_cpu_starting(cpu); - ipi_call_lock(); set_cpu_online(cpu, true); - ipi_call_unlock(); this_cpu_write(cpu_state, CPU_ONLINE); -- cgit v1.2.3-70-g09d2 From 43cc7e86f3200b094e2960b732623aeec00b482d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 May 2012 17:26:16 +0200 Subject: smp: Remove num_booting_cpus() No users. Signed-off-by: Thomas Gleixner Cc: Srivatsa S. Bhat Cc: Rusty Russell --- arch/m32r/include/asm/smp.h | 5 ----- arch/x86/include/asm/smp.h | 5 ----- include/linux/smp.h | 1 - 3 files changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/m32r/include/asm/smp.h b/arch/m32r/include/asm/smp.h index cf7829a6155..c689b828dfe 100644 --- a/arch/m32r/include/asm/smp.h +++ b/arch/m32r/include/asm/smp.h @@ -79,11 +79,6 @@ static __inline__ int cpu_number_map(int cpu) return cpu; } -static __inline__ unsigned int num_booting_cpus(void) -{ - return cpumask_weight(&cpu_callout_map); -} - extern void smp_send_timer(void); extern unsigned long send_IPI_mask_phys(const cpumask_t*, int, int); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index f48394513c3..2ffa95dc233 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) -/* We don't mark CPUs online until __cpu_up(), so we need another measure */ -static inline int num_booting_cpus(void) -{ - return cpumask_weight(cpu_callout_mask); -} #else /* !CONFIG_SMP */ #define 
wbinvd_on_cpu(cpu) wbinvd() static inline int wbinvd_on_all_cpus(void) diff --git a/include/linux/smp.h b/include/linux/smp.h index a34d4f15430..dd6f06be3c9 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -177,7 +177,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) } while (0) static inline void smp_send_reschedule(int cpu) { } -#define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) -- cgit v1.2.3-70-g09d2 From 7db971b235480849aa5b9209b67b62e987b3181b Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Sun, 3 Jun 2012 01:11:34 +0300 Subject: x86/platform: Introduce APIC post-initialization callback Some subarchitectures (such as vSMP) need to slightly adjust the underlying APIC structure. Add an APIC post-initialization callback to 'struct x86_platform_ops' for this purpose and use it for adjusting the APIC structure on vSMP systems. Signed-off-by: Ido Yariv Acked-by: Shai Fultheim Link: http://lkml.kernel.org/r/1338675095-27260-1-git-send-email-ido@wizery.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/probe_32.c | 3 +++ arch/x86/kernel/apic/probe_64.c | 11 ++--------- arch/x86/kernel/vsmp_64.c | 13 +++++++++++++ 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c090af10ac7..c377d9ccb69 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -164,6 +164,7 @@ struct x86_cpuinit_ops { * @i8042_detect pre-detect if i8042 controller exists * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume + * @apic_post_init: adjust apic if neeeded */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -177,6 +178,7 @@ struct x86_platform_ops { int 
(*i8042_detect)(void); void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); + void (*apic_post_init)(void); }; struct pci_dev; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1b291da09e6..8616d5198e1 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -208,6 +208,9 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 3fe98669892..1793dba7a74 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -23,11 +23,6 @@ #include #include -static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ @@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void) } } - if (is_vsmp_box()) { - /* need to update phys_pkg_id */ - apic->phys_pkg_id = apicid_phys_pkg_id; - } + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); } /* Same for both flat and physical. 
*/ diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 8eeb55a551b..59eea855f45 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -187,12 +187,25 @@ static void __init vsmp_cap_cpus(void) #endif } +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +static void vsmp_apic_post_init(void) +{ + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; +} + void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; + x86_platform.apic_post_init = vsmp_apic_post_init; + vsmp_cap_cpus(); set_vsmp_pv_ops(); -- cgit v1.2.3-70-g09d2 From c767a54ba0657e52e6edaa97cbe0b0a8bf1c1655 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 21 May 2012 19:50:07 -0700 Subject: x86/debug: Add KERN_ to bare printks, convert printks to pr_ Use a more current logging style: - Bare printks should have a KERN_ for consistency's sake - Add pr_fmt where appropriate - Neaten some macro definitions - Convert some Ok output to OK - Use "%s: ", __func__ in pr_fmt for summit - Convert some printks to pr_ Message output is not identical in all cases. 
Signed-off-by: Joe Perches Cc: levinsasha928@gmail.com Link: http://lkml.kernel.org/r/1337655007.24226.10.camel@joe2Laptop [ merged two similar patches, tidied up the changelog ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/floppy.h | 2 +- arch/x86/include/asm/pci_x86.h | 8 ++- arch/x86/include/asm/pgtable-2level.h | 4 +- arch/x86/include/asm/pgtable-3level.h | 6 +-- arch/x86/include/asm/pgtable_64.h | 8 +-- arch/x86/kernel/alternative.c | 17 +++--- arch/x86/kernel/amd_nb.c | 10 ++-- arch/x86/kernel/apic/io_apic.c | 38 ++++++------- arch/x86/kernel/apic/summit_32.c | 22 ++++---- arch/x86/kernel/apm_32.c | 29 +++++----- arch/x86/kernel/cpu/bugs.c | 20 +++---- arch/x86/kernel/cpu/mcheck/mce.c | 22 ++++---- arch/x86/kernel/cpu/perf_event_intel.c | 14 ++--- arch/x86/kernel/dumpstack.c | 4 +- arch/x86/kernel/dumpstack_32.c | 24 ++++----- arch/x86/kernel/dumpstack_64.c | 20 +++---- arch/x86/kernel/irq.c | 4 +- arch/x86/kernel/module.c | 32 ++++++----- arch/x86/kernel/pci-calgary_64.c | 34 ++++++------ arch/x86/kernel/process.c | 34 ++++++------ arch/x86/kernel/process_64.c | 8 +-- arch/x86/kernel/reboot.c | 14 +++-- arch/x86/kernel/signal.c | 5 +- arch/x86/kernel/smpboot.c | 97 ++++++++++++++++------------------ arch/x86/kernel/traps.c | 19 +++---- arch/x86/kernel/tsc.c | 50 +++++++++--------- arch/x86/kernel/vm86_32.c | 6 ++- arch/x86/kernel/vsyscall_64.c | 17 +++--- arch/x86/kernel/xsave.c | 12 +++-- 29 files changed, 301 insertions(+), 279 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index dbe82a5c5ea..d3d74698dce 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id) virtual_dma_residue += virtual_dma_count; virtual_dma_count = 0; #ifdef TRACE_FLPY_INT - printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", + printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", 
virtual_dma_count, virtual_dma_residue, calls, bytes, dma_wait); calls = 0; diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b3a53174602..5ad24a89b19 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -7,9 +7,13 @@ #undef DEBUG #ifdef DEBUG -#define DBG(x...) printk(x) +#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define DBG(x...) +#define DBG(fmt, ...) \ +do { \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ +} while (0) #endif #define PCI_PROBE_BIOS 0x0001 diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 98391db840c..f2b489cf160 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -2,9 +2,9 @@ #define _ASM_X86_PGTABLE_2LEVEL_H #define pte_ERROR(e) \ - printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) + pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e)) /* * Certain architectures need to do special things when PTEs diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 43876f16caf..f824cfbaa9d 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -9,13 +9,13 @@ */ #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%08lx%08lx).\n", \ + pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \ __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016Lx).\n", \ + pr_err("%s:%d: bad pmd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016Lx).\n", \ + pr_err("%s:%d: bad pgd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) /* Rules for using set_pte: the pte being assigned *must* be diff --git a/arch/x86/include/asm/pgtable_64.h 
b/arch/x86/include/asm/pgtable_64.h index 975f709e09a..8251be02301 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%016lx).\n", \ + pr_err("%s:%d: bad pte %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pte_val(e)) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx).\n", \ + pr_err("%s:%d: bad pmd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pud %p(%016lx).\n", \ + pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx).\n", \ + pr_err("%s:%d: bad pgd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) struct mm_struct; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f075..1729d720299 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "SMP alternatives: " fmt + #include #include #include @@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, args...) if (debug_alternative) \ - printk(KERN_DEBUG fmt, args) +#define DPRINTK(fmt, ...) 
\ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) /* * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes @@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp) * If this still occurs then you should see a hang * or crash shortly after this line: */ - printk("lockdep: fixing up alternatives.\n"); + pr_info("lockdep: fixing up alternatives\n"); #endif if (noreplace_smp || smp_alt_once || skip_smp_alternatives) @@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp) if (smp == smp_mode) { /* nothing */ } else if (smp) { - printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); + pr_info("switching to SMP code\n"); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) @@ -546,7 +551,7 @@ void __init alternative_instructions(void) #ifdef CONFIG_SMP if (smp_alt_once) { if (1 == num_possible_cpus()) { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854591c..f29f6dd6bc0 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -2,6 +2,9 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -258,7 +261,7 @@ void amd_flush_garts(void) } spin_unlock_irqrestore(&gart_lock, flags); if (!flushed) - printk("nothing to flush?\n"); + pr_notice("nothing to flush?\n"); } EXPORT_SYMBOL_GPL(amd_flush_garts); @@ -269,11 +272,10 @@ static __init int init_amd_nbs(void) err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + pr_notice("Cannot enumerate AMD northbridges\n"); if (amd_cache_gart() < 0) - printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " - "GART support disabled.\n"); + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return err; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a9..5155d6f806f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi entry = alloc_irq_pin_list(node); if (!entry) { - printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", - node, apic, pin); + pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); return -ENOMEM; } entry->apic = apic; @@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_mask_entry(apic, pin); entry = ioapic_read_entry(apic, pin); if (entry.irr) - printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + pr_err("Unable to reset IRR for apic: %d, pin :%d\n", mpc_ioapic_id(apic), pin); } @@ -895,7 +895,7 @@ static int irq_polarity(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -906,7 +906,7 @@ static int irq_polarity(int idx) } default: /* invalid */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -948,7 +948,7 @@ static int irq_trigger(int idx) } default: { - 
printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -962,7 +962,7 @@ static int irq_trigger(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -973,7 +973,7 @@ static int irq_trigger(int idx) } default: /* invalid */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 0; break; } @@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin) * Debugging check, we are in big trouble if this message pops up! */ if (mp_irqs[idx].dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; @@ -1521,7 +1521,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) reg_03.raw = io_apic_read(ioapic_idx, 3); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); @@ -1578,7 +1577,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, ir_entry->index ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %X %02X\n", ir_entry->format, ir_entry->mask, @@ -1598,7 +1597,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, entry.dest ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %02X\n", entry.mask, entry.trigger, @@ -1651,8 +1650,8 @@ __apicdebuginit(void) print_IO_APICs(void) continue; printk(KERN_DEBUG "IRQ%d ", irq); for_each_irq_pin(entry, cfg->irq_2_pin) - printk("-> %d:%d", entry->apic, entry->pin); - printk("\n"); + pr_cont("-> %d:%d", entry->apic, entry->pin); + pr_cont("\n"); } printk(KERN_INFO ".................................... 
done.\n"); @@ -1665,9 +1664,9 @@ __apicdebuginit(void) print_APIC_field(int base) printk(KERN_DEBUG); for (i = 0; i < 8; i++) - printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + pr_cont("%08x", apic_read(base + i*0x10)); - printk(KERN_CONT "\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APIC(void *dummy) @@ -1769,7 +1768,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); } } - printk("\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APICs(int maxcpu) @@ -2065,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) reg_00.raw = io_apic_read(ioapic_idx, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) - printk("could not set ID!\n"); + pr_cont("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); } @@ -3563,7 +3562,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id) /* Sanity check */ if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", + ioapic); return -1; } } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 659897c0075..e97d542ccdd 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "summit: %s: " fmt, __func__ + #include #include #include @@ -235,8 +237,8 @@ static int summit_apic_id_registered(void) static void summit_setup_apic_routing(void) { - printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", - nr_ioapics); + pr_info("Enabling APIC mode: Summit. 
Using %d I/O APICs\n", + nr_ioapics); } static int summit_cpu_present_to_apicid(int mps_cpu) @@ -275,7 +277,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - printk("%s: Not a valid mask!\n", __func__); + pr_err("Not a valid mask!\n"); return BAD_APICID; } apicid |= new_apicid; @@ -355,7 +357,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_rio_dev) { - printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); + pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); return last_bus; } @@ -366,7 +368,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_scal_dev) { - printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); + pr_err("Couldn't find owner Twister for Cyclone!\n"); return last_bus; } @@ -396,7 +398,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) num_buses = 9; break; default: - printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); + pr_info("Unsupported Winnipeg type!\n"); return last_bus; } @@ -411,13 +413,15 @@ static int build_detail_arrays(void) int i, scal_detail_size, rio_detail_size; if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { - printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); + pr_warn("MAX_NUMNODES too low! 
Defined as %d, but system has %d nodes\n", + MAX_NUMNODES, rio_table_hdr->num_scal_dev); return 0; } switch (rio_table_hdr->version) { default: - printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); + pr_warn("Invalid Rio Grande Table Version: %d\n", + rio_table_hdr->version); return 0; case 2: scal_detail_size = 11; @@ -462,7 +466,7 @@ void setup_summit(void) offset = *((unsigned short *)(ptr + offset)); } if (!rio_table_hdr) { - printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); + pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); return; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 07b0c0db466..d65464e4350 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -201,6 +201,8 @@ * http://www.microsoft.com/whdc/archive/amp_12.mspx] */ +#define pr_fmt(fmt) "apm: " fmt + #include #include @@ -485,11 +487,11 @@ static void apm_error(char *str, int err) if (error_table[i].key == err) break; if (i < ERROR_COUNT) - printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); + pr_notice("%s: %s\n", str, error_table[i].msg); else if (err < 0) - printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); + pr_notice("%s: linux error code %i\n", str, err); else - printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", + pr_notice("%s: unknown error code %#2.2x\n", str, err); } @@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender) static int notified; if (notified++ == 0) - printk(KERN_ERR "apm: an event queue overflowed\n"); + pr_err("an event queue overflowed\n"); if (++as->event_tail >= APM_MAX_EVENTS) as->event_tail = 0; } @@ -1447,7 +1449,7 @@ static void apm_mainloop(void) static int check_apm_user(struct apm_user *as, const char *func) { if (as == NULL || as->magic != APM_BIOS_MAGIC) { - printk(KERN_ERR "apm: %s passed bad filp\n", func); + pr_err("%s passed bad filp\n", func); 
return 1; } return 0; @@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp) as1 = as1->next) ; if (as1 == NULL) - printk(KERN_ERR "apm: filp not in user list\n"); + pr_err("filp not in user list\n"); else as1->next = as->next; } @@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp) struct apm_user *as; as = kmalloc(sizeof(*as), GFP_KERNEL); - if (as == NULL) { - printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", - sizeof(*as)); + if (as == NULL) return -ENOMEM; - } + as->magic = APM_BIOS_MAGIC; as->event_tail = as->event_head = 0; as->suspends_pending = as->standbys_pending = 0; @@ -2313,16 +2313,16 @@ static int __init apm_init(void) } if (apm_info.disabled) { - printk(KERN_NOTICE "apm: disabled on user request.\n"); + pr_notice("disabled on user request.\n"); return -ENODEV; } if ((num_online_cpus() > 1) && !power_off && !smp) { - printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); + pr_notice("disabled - APM is not SMP safe.\n"); apm_info.disabled = 1; return -ENODEV; } if (!acpi_disabled) { - printk(KERN_NOTICE "apm: overridden by ACPI.\n"); + pr_notice("overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } @@ -2356,8 +2356,7 @@ static int __init apm_init(void) kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { - printk(KERN_ERR "apm: disabled - Unable to start kernel " - "thread.\n"); + pr_err("disabled - Unable to start kernel thread\n"); err = PTR_ERR(kapmd_task); kapmd_task = NULL; remove_proc_entry("apm", NULL); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fbb62b..c97bb7b5a9f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -55,8 +55,8 @@ static void __init check_fpu(void) if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION - printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); - printk(KERN_EMERG "Giving up.\n"); + pr_emerg("No coprocessor found 
and no math emulation present\n"); + pr_emerg("Giving up\n"); for (;;) ; #endif return; @@ -86,7 +86,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); + pr_warn("Hmm, FPU with FDIV bug\n"); } static void __init check_hlt(void) @@ -94,16 +94,16 @@ static void __init check_hlt(void) if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; - printk(KERN_INFO "Checking 'hlt' instruction... "); + pr_info("Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { - printk("disabled\n"); + pr_cont("disabled\n"); return; } halt(); halt(); halt(); halt(); - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); } /* @@ -116,7 +116,7 @@ static void __init check_popad(void) #ifndef CONFIG_X86_POPAD_OK int res, inp = (int) &res; - printk(KERN_INFO "Checking for popad bug... "); + pr_info("Checking for popad bug... "); __asm__ __volatile__( "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " : "=&a" (res) @@ -127,9 +127,9 @@ static void __init check_popad(void) * CPU hard. Too bad. 
*/ if (res != 12345678) - printk(KERN_CONT "Buggy.\n"); + pr_cont("Buggy\n"); else - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); #endif } @@ -161,7 +161,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0a687fd185e..5623b4b5d51 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -7,6 +7,9 @@ * Copyright 2008 Intel Corporation * Author: Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -210,7 +213,7 @@ static void drain_mcelog_buffer(void) cpu_relax(); if (!m->finished && retries >= 4) { - pr_err("MCE: skipping error being logged currently!\n"); + pr_err("skipping error being logged currently!\n"); break; } } @@ -1167,8 +1170,9 @@ int memory_failure(unsigned long pfn, int vector, int flags) { /* mce_severity() should not hand us an ACTION_REQUIRED error */ BUG_ON(flags & MF_ACTION_REQUIRED); - printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" - "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); return 0; } @@ -1358,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void) b = cap & MCG_BANKCNT_MASK; if (!banks) - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", + pr_warn("Using only %u machine check banks out of %u\n", MAX_NR_BANKS, b); b = MAX_NR_BANKS; } @@ -1419,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void) static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - 
pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; } @@ -1574,7 +1577,7 @@ static void __mcheck_cpu_init_timer(void) /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } @@ -1893,8 +1896,7 @@ static int __init mcheck_enable(char *str) get_option(&str, &monarch_timeout); } } else { - printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", - str); + pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6ae..9e3f5d6e3d2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -5,6 +5,8 @@ * among events on a single PMU. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1000,7 +1002,7 @@ static void intel_pmu_reset(void) local_irq_save(flags); - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); @@ -1638,14 +1640,14 @@ static __init void intel_clovertown_quirk(void) * But taken together it might just make sense to not enable PEBS on * these chips. 
*/ - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + pr_warn("PEBS disabled due to CPU errata\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } static __init void intel_sandybridge_quirk(void) { - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + pr_warn("PEBS disabled due to CPU errata\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } @@ -1667,8 +1669,8 @@ static __init void intel_arch_events_quirk(void) /* disable event that reported as not presend by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", - intel_arch_events_map[bit].name); + pr_warn("CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); } } @@ -1687,7 +1689,7 @@ static __init void intel_nehalem_quirk(void) intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; ebx.split.no_branch_misses_retired = 0; x86_pmu.events_maskl = ebx.full; - printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + pr_info("CPU erratum AAJ80 worked around\n"); } } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 571246d81ed..87d3b5d663c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,8 +27,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pB\n", (void *) address, - reliable ? "" : "? ", (void *) address); + pr_cont(" [<%p>] %s%pB\n", + (void *)address, reliable ? "" : "? 
", (void *)address); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0b1d783daa..3a8aced11ae 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %08lx", *stack++); + pr_cont("\n"); + pr_cont(" %08lx", *stack++); touch_nmi_watchdog(); } - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -89,9 +89,9 @@ void show_regs(struct pt_regs *regs) print_modules(); __show_regs(regs, !user_mode_vm(regs)); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); + pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
@@ -102,10 +102,10 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + pr_emerg("Stack:\n"); show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); - printk(KERN_EMERG "Code: "); + pr_emerg("Code:"); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { @@ -116,16 +116,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad EIP value."); + pr_cont(" Bad EIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont(" <%02x>", c); else - printk(KERN_CONT "%02x ", c); + pr_cont(" %02x", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 791b76122aa..c582e9c5bd1 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(KERN_CONT " "); + pr_cont(" "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %016lx", *stack++); + pr_cont("\n"); + pr_cont(" %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -256,8 +256,8 @@ void show_regs(struct pt_regs *regs) printk("CPU %d ", cpu); print_modules(); __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); /* * When in-kernel, 
we also print out the stack and code at the @@ -284,16 +284,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad RIP value."); + pr_cont(" Bad RIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont("<%02x> ", c); else - printk(KERN_CONT "%02x ", c); + pr_cont("%02x ", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3dafc6003b7..1f5f1d5d2a0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -294,9 +294,9 @@ void fixup_irqs(void) raw_spin_unlock(&desc->lock); if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); + pr_notice("Broke affinity for irq %i\n", irq); else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); + pr_notice("Cannot set affinity for irq %i\n", irq); } /* diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f21fd94ac89..202494d2ec6 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -15,6 +15,9 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -30,9 +33,14 @@ #include #if 0 -#define DEBUGP printk +#define DEBUGP(fmt, ...) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__) #else -#define DEBUGP(fmt...) +#define DEBUGP(fmt, ...) 
\ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) #endif void *module_alloc(unsigned long size) @@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs, Elf32_Sym *sym; uint32_t *location; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -77,7 +85,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value - (uint32_t)location; break; default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", + pr_err("%s: Unknown relocation: %u\n", me->name, ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, + ELF64_R_SYM(rel[i].r_info); DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); val = sym->st_value + rel[i].r_addend; @@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, #endif break; default: - printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", + pr_err("%s: Unknown rela relocation: %llu\n", me->name, ELF64_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return 0; overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + 
pr_err("overflow in relocation type %d val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + pr_err("`%s' likely not compiled with -mcmodel=kernel\n", me->name); return -ENOEXEC; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index b72838bae64..299d49302e7 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -22,6 +22,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "Calgary: " fmt + #include #include #include @@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev, offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, npages, 0, boundary_size, 0); if (offset == ~0UL) { - printk(KERN_WARNING "Calgary: IOMMU full.\n"); + pr_warn("IOMMU full\n"); spin_unlock_irqrestore(&tbl->it_lock, flags); if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); @@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, entry = iommu_range_alloc(dev, tbl, npages); if (unlikely(entry == DMA_ERROR_CODE)) { - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); + pr_warn("failed to allocate %u pages in iommu %p\n", + npages, tbl); return DMA_ERROR_CODE; } @@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl) i++; } while ((val & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "Calgary: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("PCI bus not quiesced, continuing anyway\n"); /* invalidate TCE cache */ target = calgary_reg(bbar, tar_offset(tbl->it_busno)); @@ -604,8 +605,7 @@ begin: i++; } while ((val64 & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); /* 3. 
poll Page Migration DEBUG for SoftStopFault */ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); @@ -617,8 +617,7 @@ begin: if (++count < 100) goto begin; else { - printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " - "aborting TCE cache flush sequence!\n"); + pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); return; /* pray for the best */ } } @@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl) plssr = be32_to_cpu(readl(target)); /* If no error, the agent ID in the CSR is not valid */ - printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " - "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); + pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", + tbl->it_busno, csr, plssr); } static void calioc2_dump_error_regs(struct iommu_table *tbl) @@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) target = calgary_reg(bbar, phboff | 0x800); mck = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", - tbl->it_busno); + pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); + pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); /* dump rest of error regs */ - printk(KERN_EMERG "Calgary: "); + pr_emerg(""); for (i = 0; i < ARRAY_SIZE(errregs); i++) { /* err regs are at 0x810 - 0x870 */ erroff = (0x810 + (i * 0x10)); target = calgary_reg(bbar, phboff | erroff); errregs[i] = be32_to_cpu(readl(target)); - printk("0x%08x@0x%lx ", errregs[i], erroff); + pr_cont("0x%08x@0x%lx ", errregs[i], erroff); } - printk("\n"); + pr_cont("\n"); /* root complex status */ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 735279e54e5..ef6a8456f71 100644 --- a/arch/x86/kernel/process.c +++ 
b/arch/x86/kernel/process.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -145,16 +147,14 @@ void show_regs_common(void) /* Board Name is optional */ board = dmi_get_system_info(DMI_BOARD_NAME); - printk(KERN_CONT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk(KERN_CONT " %s %s", vendor, product); - if (board) - printk(KERN_CONT "/%s", board); - printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, + vendor, product, + board ? "/" : "", + board ? board : ""); } void flush_thread(void) @@ -645,7 +645,7 @@ static void amd_e400_idle(void) amd_e400_c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); - printk(KERN_INFO "System has AMD C1E enabled\n"); + pr_info("System has AMD C1E enabled\n"); } } @@ -659,8 +659,7 @@ static void amd_e400_idle(void) */ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); - printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", - cpu); + pr_info("Switch to broadcast mode on CPU%d\n", cpu); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," - " performance may degrade.\n"); + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); } #endif if (pm_idle) @@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) /* * One CPU supports mwait => All CPUs supports mwait */ - printk(KERN_INFO "using 
mwait in idle threads.\n"); + pr_info("using mwait in idle threads\n"); pm_idle = mwait_idle; } else if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ - printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pr_info("using AMD E400 aware idle routine\n"); pm_idle = amd_e400_idle; } else pm_idle = default_idle; @@ -715,7 +713,7 @@ static int __init idle_setup(char *str) return -EINVAL; if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); + pr_info("using polling idle threads\n"); pm_idle = poll_idle; boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7fdf09..85151f3e36e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); + pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); BUG(); } } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af8160..ab3f0626071 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -152,7 +154,8 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_BIOS) { reboot_type = BOOT_BIOS; - printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); + pr_info("%s series board detected. 
Selecting %s-method for reboots.\n", + "BIOS", d->ident); } return 0; } @@ -207,8 +210,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_CF9) { reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. " - "Selecting PCI-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "PCI", d->ident); } return 0; } @@ -217,7 +220,8 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { reboot_type = BOOT_KBD; - printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + "KBD", d->ident); } return 0; } @@ -668,7 +672,7 @@ static void __machine_emergency_restart(int emergency) static void native_machine_restart(char *__unused) { - printk("machine restart\n"); + pr_notice("machine restart\n"); if (!reboot_force) machine_shutdown(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 21af737053a..b280908a376 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,6 +6,9 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } force_sig(SIGSEGV, me); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd019d78b1f..456d64806c8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1,4 +1,4 @@ -/* + /* * x86 SMP booting functions * * (c) 1995 Alan Cox, Building #3 @@ -39,6 +39,8 @@ * Glauber Costa : i386 and x86_64 integration */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + 
#include #include #include @@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void) * boards) */ - pr_debug("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC()\n"); if (apic->smp_callin_clear_local_apic) apic->smp_callin_clear_local_apic(); setup_local_APIC(); @@ -420,17 +422,16 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - pr_debug("Before bogomips.\n"); + pr_debug("Before bogomips\n"); for_each_possible_cpu(cpu) if (cpumask_test_cpu(cpu, cpu_callout_mask)) bogosum += cpu_data(cpu).loops_per_jiffy; - printk(KERN_INFO - "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - pr_debug("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1\n"); } void __inquire_remote_apic(int apicid) @@ -440,18 +441,17 @@ void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); + pr_info("Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); + pr_info("... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. 
*/ status = safe_apic_wait_icr_idle(); if (status) - printk(KERN_CONT - "a previous APIC delivery may have failed\n"); + pr_cont("a previous APIC delivery may have failed\n"); apic_icr_write(APIC_DM_REMRD | regs[i], apicid); @@ -464,10 +464,10 @@ void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk(KERN_CONT "%08x\n", status); + pr_cont("%08x\n", status); break; default: - printk(KERN_CONT "failed\n"); + pr_cont("failed\n"); } } } @@ -501,12 +501,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); } - pr_debug("NMI sent.\n"); + pr_debug("NMI sent\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -528,7 +528,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_read(APIC_ESR); } - pr_debug("Asserting INIT.\n"); + pr_debug("Asserting INIT\n"); /* * Turn INIT on target chip @@ -544,7 +544,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) mdelay(10); - pr_debug("Deasserting INIT.\n"); + pr_debug("Deasserting INIT\n"); /* Target chip */ /* Send IPI */ @@ -577,14 +577,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - pr_debug("#startup loops: %d.\n", num_starts); + pr_debug("#startup loops: %d\n", num_starts); for (j = 1; j <= num_starts; j++) { - pr_debug("Sending STARTUP #%d.\n", j); + pr_debug("Sending STARTUP #%d\n", j); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. 
*/ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - pr_debug("After apic_write.\n"); + pr_debug("After apic_write\n"); /* * STARTUP IPI @@ -601,7 +601,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) */ udelay(300); - pr_debug("Startup point 1.\n"); + pr_debug("Startup point 1\n"); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -616,12 +616,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) if (send_status || accept_status) break; } - pr_debug("After Startup.\n"); + pr_debug("After Startup\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -635,11 +635,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid) if (system_state == SYSTEM_BOOTING) { if (node != current_node) { if (current_node > (-1)) - pr_cont(" Ok.\n"); + pr_cont(" OK\n"); current_node = node; pr_info("Booting Node %3d, Processors ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); return; } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", @@ -719,9 +719,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * allow APs to start initializing. 
*/ - pr_debug("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d\n", cpu); cpumask_set_cpu(cpu, cpu_callout_mask); - pr_debug("After Callout %d.\n", cpu); + pr_debug("After Callout %d\n", cpu); /* * Wait 5s total for a response @@ -749,7 +749,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - pr_err("CPU%d: Not responding.\n", cpu); + pr_err("CPU%d: Not responding\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -794,7 +794,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { - printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); + pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -875,9 +875,8 @@ static int __init smp_sanity_check(unsigned max_cpus) unsigned int cpu; unsigned nr; - printk(KERN_WARNING - "More than 8 CPUs detected - skipping them.\n" - "Use CONFIG_X86_BIGSMP.\n"); + pr_warn("More than 8 CPUs detected - skipping them\n" + "Use CONFIG_X86_BIGSMP\n"); nr = 0; for_each_present_cpu(cpu) { @@ -898,8 +897,7 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING - "weird, boot CPU (#%d) not listed by the BIOS.\n", + pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); @@ -911,11 +909,10 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (!smp_found_config && !acpi_lapic) { preempt_enable(); - printk(KERN_NOTICE "SMP motherboard not detected.\n"); + pr_notice("SMP motherboard not detected\n"); disable_smp(); if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." 
- " Using dummy APIC emulation.\n"); + pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); return -1; } @@ -924,9 +921,8 @@ static int __init smp_sanity_check(unsigned max_cpus) * CPU too, but we do it for the sake of robustness anyway. */ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - printk(KERN_NOTICE - "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_physical_apicid); + pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", + boot_cpu_physical_apicid); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); @@ -939,8 +935,7 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation." - "(tell your hw vendor)\n"); + pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); } smpboot_clear_io_apic(); disable_ioapic_support(); @@ -953,7 +948,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated.\n"); + pr_info("SMP mode deactivated\n"); smpboot_clear_io_apic(); connect_bsp_APIC(); @@ -1005,7 +1000,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { - printk(KERN_INFO "SMP disabled\n"); + pr_info("SMP disabled\n"); disable_smp(); goto out; } @@ -1043,7 +1038,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. 
*/ - printk(KERN_INFO "CPU%d: ", 0); + pr_info("CPU%d: ", 0); print_cpu_info(&cpu_data(0)); x86_init.timers.setup_percpu_clockev(); @@ -1093,7 +1088,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - pr_debug("Boot done.\n"); + pr_debug("Boot done\n"); nmi_selftest(); impress_friends(); @@ -1154,8 +1149,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - printk(KERN_WARNING - "%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } @@ -1164,13 +1158,12 @@ __init void prefill_possible_map(void) if (!setup_max_cpus) #endif if (possible > i) { - printk(KERN_WARNING - "%d Processors exceeds max_cpus limit of %u\n", + pr_warn("%d Processors exceeds max_cpus limit of %u\n", possible, setup_max_cpus); possible = i; } - printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b31d92f69..b481341c936 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -9,6 +9,9 @@ /* * Handle hardware traps and faults. 
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -143,12 +146,11 @@ trap_signal: #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); + pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } #endif @@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code) if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", + pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } force_sig(SIGSEGV, tsk); @@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) conditional_sti(regs); #if 0 /* No need to warn about this any longer. 
*/ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); + pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc0a147e372..cfa5d4f7ca5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); + pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); tsc_disabled = 1; return 1; } @@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - printk("Fast TSC calibration failed\n"); + pr_err("Fast TSC calibration failed\n"); return 0; success: @@ -392,7 +393,7 @@ success: */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); - printk("Fast TSC calibration using PIT\n"); + pr_info("Fast TSC calibration using PIT\n"); return delta; } @@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void) * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { - printk(KERN_INFO - "TSC: PIT calibration matches %s. %d loops\n", - hpet ? "HPET" : "PMTIMER", i + 1); + pr_info("PIT calibration matches %s. %d loops\n", + hpet ? 
"HPET" : "PMTIMER", i + 1); return tsc_ref_min; } @@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void) */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ - printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { - printk("TSC: No reference (HPET/PMTIMER) available\n"); + pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " - "failed.\n"); + pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ - printk(KERN_INFO "TSC: using %s reference calibration\n", - hpet ? "HPET" : "PMTIMER"); + pr_info("using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " - "Using PIT calibration\n"); + pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } @@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void) * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ - printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", - hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_warn("PIT calibration deviates from %s: %lu %lu\n", + hpet ? 
"HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } @@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason) tsc_unstable = 1; sched_clock_stable = 0; disable_sched_clock_irqtime(); - printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); + pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_mark_unstable(&clocksource_tsc); @@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) goto out; tsc_khz = freq; - printk(KERN_INFO "Refined TSC clocksource calibration: " - "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); out: clocksource_register_khz(&clocksource_tsc, tsc_khz); @@ -970,9 +968,9 @@ void __init tsc_init(void) return; } - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + pr_info("Detected %lu.%03lu MHz processor\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); /* * Secondary CPUs do not run through tsc_init(), so set up diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 255f58ae71e..54abcc0baf2 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -28,6 +28,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) local_irq_enable(); if (!current->thread.vm86_info) { - printk("no vm86_info: BAD\n"); + pr_alert("no vm86_info: BAD\n"); do_exit(SIGSEGV); } set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs, regs); tmp += put_user(current->thread.screen_bitmap, ¤t->thread.vm86_info->screen_bitmap); 
if (tmp) { - printk("vm86: could not access userspace vm86_info\n"); + pr_alert("could not access userspace vm86_info\n"); do_exit(SIGSEGV); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 7515cf0e180..acdc125ad44 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,6 +18,8 @@ * use the vDSO. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - struct task_struct *tsk; - - if (!show_unhandled_signals || !__ratelimit(&rs)) + if (!show_unhandled_signals) return; - tsk = current; - - printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, tsk->comm, task_pid_nr(tsk), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index bd18149b2b0..3d3e2070911 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -3,6 +3,9 @@ * * Author: Suresh Siddha */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf) BUG_ON(sig_xstate_size < xstate_size); if ((unsigned long)buf % 64) - printk("save_i387_xstate: bad fpstate %p\n", buf); + pr_err("%s: bad fpstate %p\n", __func__, buf); if (!used_math()) return 0; @@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void) pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - 
printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pr_err("FP/SSE not shown under xsave features 0x%llx\n", pcntxt_mask); BUG(); } @@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void) setup_xstate_init(); - printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " - "cntxt size 0x%x\n", - pcntxt_mask, xstate_size); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", + pcntxt_mask, xstate_size); } /* -- cgit v1.2.3-70-g09d2 From 332afa656e76458ee9cf0f0d123016a0658539e4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 21 May 2012 16:58:01 -0700 Subject: x86/irq: Update irq_cfg domain unless the new affinity is a subset of the current domain Until now, irq_cfg domain is mostly static. Either all CPUs (used by flat mode) or one CPU (first CPU in the irq affinity mask) to which irq is being migrated (this is used by the rest of apic modes). Upcoming x2apic cluster mode optimization patch allows the irq to be sent to any CPU in the x2apic cluster (if supported by the HW). So irq_cfg domain changes on the fly (depending on which CPU in the x2apic cluster is online). Instead of checking for any intersection between the new irq affinity mask and the current irq_cfg domain, check if the new irq affinity mask is a subset of the current irq_cfg domain. Otherwise proceed with updating the irq_cfg domain as well as assigning vectors on all the CPUs specified in the new mask. This also cleans up a workaround in updating irq_cfg domain for legacy irqs that are handled by the IO-APIC.
Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: gorcunov@openvz.org Cc: agordeev@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1337644682-19854-1-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a9..910a3118438 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1126,8 +1126,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) old_vector = cfg->vector; if (old_vector) { cpumask_and(tmp_mask, mask, cpu_online_mask); - cpumask_and(tmp_mask, cfg->domain, tmp_mask); - if (!cpumask_empty(tmp_mask)) { + if (cpumask_subset(tmp_mask, cfg->domain)) { free_cpumask_var(tmp_mask); return 0; } @@ -1141,6 +1140,11 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) apic->vector_allocation_domain(cpu, tmp_mask); + if (cpumask_subset(tmp_mask, cfg->domain)) { + free_cpumask_var(tmp_mask); + return 0; + } + vector = current_vector; offset = current_offset; next: @@ -1346,13 +1350,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0 for legacy - * controllers like 8259. Now that IO-APIC can handle this irq, update - * the cfg->domain. 
- */ - if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) - apic->vector_allocation_domain(0, cfg->domain); if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.3-70-g09d2 From 0b8255e660a0c229ebfe8f9fde12a8d4d34c50e0 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 21 May 2012 16:58:02 -0700 Subject: x86/x2apic/cluster: Use all the members of one cluster specified in the smp_affinity mask for the interrupt destination If the HW implements round-robin interrupt delivery, this enables multiple cpus (which are part of the user specified interrupt smp_affinity mask and belong to the same x2apic cluster) to service the interrupt. Also if the platform supports Power Aware Interrupt Routing, then this enables the interrupt to be routed to an idle cpu or a busy cpu depending on the perf/power bias tunable. We are now grouping all the cpus in a cluster to one vector domain. So that will limit the total number of interrupt sources handled by Linux. Previously we supported "cpu-count * available-vectors-per-cpu" interrupt sources but this will now reduce to "cpu-count/16 * available-vectors-per-cpu".
Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: gorcunov@openvz.org Cc: agordeev@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1337644682-19854-2-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x2apic.h | 9 ------ arch/x86/kernel/apic/x2apic_cluster.c | 56 ++++++++++++++++++++++++----------- arch/x86/kernel/apic/x2apic_phys.c | 9 ++++++ 3 files changed, 48 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 92e54abf89e..7a5a832a99b 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -28,15 +28,6 @@ static int x2apic_apic_id_registered(void) return 1; } -/* - * For now each logical cpu is in its own vector allocation domain. - */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) { diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ff35cff0e1a..90d999c7f2e 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -98,34 +98,47 @@ static void x2apic_send_IPI_all(int vector) static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. 
- */ int cpu = cpumask_first(cpumask); + u32 dest = 0; + int i; - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - else + if (cpu > nr_cpu_ids) return BAD_APICID; + + for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu)) + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + + return dest; } static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask) { - int cpu; + u32 dest = 0; + u16 cluster; + int i; - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + dest = per_cpu(x86_cpu_to_logical_apicid, i); + cluster = x2apic_cluster(i); + break; } - return per_cpu(x86_cpu_to_logical_apicid, cpu); + if (!dest) + return BAD_APICID; + + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + if (cluster != x2apic_cluster(i)) + continue; + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + } + + return dest; } static void init_x2apic_ldr(void) @@ -208,6 +221,15 @@ static int x2apic_cluster_probe(void) return 0; } +/* + * Each x2apic cluster is an allocation domain. 
+ */ +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); +} + static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", @@ -225,7 +247,7 @@ static struct apic apic_x2apic_cluster = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c17e982db27..93b25706f17 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -120,6 +120,15 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } +/* + * Each logical cpu is in its own vector allocation domain. + */ +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + static struct apic apic_x2apic_phys = { .name = "physical x2apic", -- cgit v1.2.3-70-g09d2 From 49d0c7a0a425a89190b7c3b1445faba9eb227bec Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 5 Jun 2012 13:23:15 +0200 Subject: x86/apic: Trivial whitespace fixes Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120605112310.GA11443@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 0e881c46e8c..de279b32ceb 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -92,7 +92,7 @@ static void flat_send_IPI_mask(const struct cpumask 
*cpumask, int vector) } static void - flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 910a3118438..74c569791e7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1363,7 +1363,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, cfg->vector, irq, attr->trigger, attr->polarity, dest); if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); __clear_irq_vector(irq, cfg); @@ -1466,7 +1466,7 @@ void setup_IO_APIC_irq_extra(u32 gsi) * Set up the timer pin, possibly with the 8259A-master behind. */ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) + unsigned int pin, int vector) { struct IO_APIC_route_entry entry; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 90d999c7f2e..2919e45d30c 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) } static void - x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } -- cgit v1.2.3-70-g09d2 From bf721d3a3bc7a731add45c8078b142b494ab413e Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 5 Jun 2012 13:23:29 +0200 Subject: x86/apic: Factor out default target_cpus() operation Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: 
http://lkml.kernel.org/r/20120605112324.GA11449@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 5 +++++ arch/x86/include/asm/x2apic.h | 9 --------- arch/x86/kernel/apic/apic_flat_64.c | 14 ++------------ arch/x86/kernel/apic/apic_numachip.c | 7 +------ arch/x86/kernel/apic/bigsmp_32.c | 11 +---------- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 7 +------ 8 files changed, 12 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eaff4790ed9..fc38195d640 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -537,6 +537,11 @@ static inline const struct cpumask *default_target_cpus(void) #endif } +static inline const struct cpumask *online_target_cpus(void) +{ + return cpu_online_mask; +} + DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 7a5a832a99b..f90f0a587c6 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -9,15 +9,6 @@ #include #include -/* - * Need to use more than cpu 0, because we need more vectors - * when MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) -{ - return cpu_online_mask; -} - static int x2apic_apic_id_valid(int apicid) { return 1; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index de279b32ceb..61ac1afeff0 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -36,11 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static const struct cpumask *flat_target_cpus(void) -{ - return cpu_online_mask; -} - static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. 
Some cpus do not strictly honor the set of cpus @@ -186,7 +181,7 @@ static struct apic apic_flat = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = flat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, @@ -262,11 +257,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static const struct cpumask *physflat_target_cpus(void) -{ - return cpu_online_mask; -} - static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); @@ -345,7 +335,7 @@ static struct apic apic_physflat = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = physflat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 6ec6d5d297c..3255a60fcc9 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -72,11 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static const struct cpumask *numachip_target_cpus(void) -{ - return cpu_online_mask; -} - static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); @@ -253,7 +248,7 @@ static struct apic apic_numachip __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = numachip_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 31fbdbfbf96..c288e81e00f 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void) return 1; } -static const struct 
cpumask *bigsmp_target_cpus(void) -{ -#ifdef CONFIG_SMP - return cpu_online_mask; -#else - return cpumask_of(0); -#endif -} - static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; @@ -205,7 +196,7 @@ static struct apic apic_bigsmp = { /* phys delivery to target CPU: */ .irq_dest_mode = 0, - .target_cpus = bigsmp_target_cpus, + .target_cpus = default_target_cpus, .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 2919e45d30c..612622c47df 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -241,7 +241,7 @@ static struct apic apic_x2apic_cluster = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 93b25706f17..b1a8b39e3c3 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -140,7 +140,7 @@ static struct apic apic_x2apic_phys = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c6d03f7a440..16efb92bfea 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -185,11 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static const struct cpumask *uv_target_cpus(void) -{ - return cpu_online_mask; -} - static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); @@ 
-362,7 +357,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = uv_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, -- cgit v1.2.3-70-g09d2 From 6398268d2bc454735f11e08705e858f9fdf5c750 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 5 Jun 2012 13:23:44 +0200 Subject: x86/apic: Factor out default cpu_mask_to_apicid() operations Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120605112340.GA11454@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 13 +++++++++--- arch/x86/kernel/apic/apic.c | 28 +++++++++++++++++++++++++ arch/x86/kernel/apic/apic_flat_64.c | 40 ++++-------------------------------- arch/x86/kernel/apic/apic_noop.c | 4 ++-- arch/x86/kernel/apic/apic_numachip.c | 36 ++------------------------------ arch/x86/kernel/apic/bigsmp_32.c | 30 ++------------------------- arch/x86/kernel/apic/probe_32.c | 4 ++-- arch/x86/kernel/apic/x2apic_phys.c | 36 ++------------------------------ 8 files changed, 52 insertions(+), 139 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index fc38195d640..bef571769e6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -592,14 +592,14 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif static inline unsigned int -default_cpu_mask_to_apicid(const struct cpumask *cpumask) +flat_cpu_mask_to_apicid(const struct cpumask *cpumask) { return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; } static inline unsigned int -default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { unsigned long mask1 = cpumask_bits(cpumask)[0]; unsigned long 
mask2 = cpumask_bits(andmask)[0]; @@ -608,6 +608,13 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return (unsigned int)(mask1 & mask2 & mask3); } +extern unsigned int +default_cpu_mask_to_apicid(const struct cpumask *cpumask); + +extern unsigned int +default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask); + static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) { return physid_isset(apicid, *map); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e094a..96a2608252f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,6 +2123,34 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } +unsigned int default_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_first(cpumask); + if (likely((unsigned)cpu < nr_cpu_ids)) + return per_cpu(x86_cpu_to_apicid, cpu); + + return BAD_APICID; +} + +unsigned int +default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + return per_cpu(x86_cpu_to_apicid, cpu); +} + /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 61ac1afeff0..55b97ce4fa1 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -205,8 +205,8 @@ static struct apic apic_flat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = 
flat_send_IPI_mask_allbutself, @@ -284,38 +284,6 @@ static void physflat_send_IPI_all(int vector) physflat_send_IPI_mask(cpu_online_mask, vector); } -static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int physflat_probe(void) { if (apic == &apic_physflat || num_possible_cpus() > 8) @@ -360,8 +328,8 @@ static struct apic apic_physflat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = physflat_send_IPI_mask, .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index a6e4c6e06c0..7c3dd4fe068 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -159,8 +159,8 @@ struct apic apic_noop = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = 
noop_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 3255a60fcc9..dba4bf2ed56 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -152,38 +152,6 @@ static void numachip_send_IPI_self(int vector) __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } -static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if (likely((unsigned)cpu < nr_cpu_ids)) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; -} - -static unsigned int -numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int __init numachip_probe(void) { return apic == &apic_numachip; @@ -272,8 +240,8 @@ static struct apic apic_numachip __refconst = { .set_apic_id = set_apic_id, .apic_id_mask = 0xffU << 24, - .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index c288e81e00f..907aa3d112a 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -96,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) return 1; } -/* As we are using single CPU as destination, pick 
only one CPU here */ -static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu = cpumask_first(cpumask); - - if (cpu < nr_cpu_ids) - return cpu_physical_id(cpu); - return BAD_APICID; -} - -static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - return cpu_physical_id(cpu); - } - return BAD_APICID; -} - static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -220,8 +194,8 @@ static struct apic apic_bigsmp = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = bigsmp_send_IPI_mask, .send_IPI_mask_allbutself = NULL, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1b291da09e6..71b6ac48ab2 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -123,8 +123,8 @@ static struct apic apic_default = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index b1a8b39e3c3..f730269edef 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -76,38 +76,6 @@ static void 
x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - - return per_cpu(x86_cpu_to_apicid, cpu); -} - static void init_x2apic_ldr(void) { } @@ -164,8 +132,8 @@ static struct apic apic_x2apic_phys = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, -- cgit v1.2.3-70-g09d2 From fbd24153c48b8425b09c161a020483cd77da870e Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 30 May 2012 18:40:03 -0600 Subject: x86/early_printk: Replace obsolete simple_strtoul() usage with kstrtoint() Change early_serial_init() to call kstrtoint()/kstrtouint() instead of the obsolete simple_strtoul().
Signed-off-by: Shuah Khan Cc: Joe Perches Link: http://lkml.kernel.org/r/1338424803.3569.5.camel@lorien2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b9f18b4991..5e4771266f1 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -119,7 +119,7 @@ static __init void early_serial_init(char *s) unsigned char c; unsigned divisor; unsigned baud = DEFAULT_BAUD; - char *e; + ssize_t ret; if (*s == ',') ++s; @@ -127,14 +127,14 @@ static __init void early_serial_init(char *s) if (*s) { unsigned port; if (!strncmp(s, "0x", 2)) { - early_serial_base = simple_strtoul(s, &e, 16); + ret = kstrtoint(s, 16, &early_serial_base); } else { static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; - port = simple_strtoul(s, &e, 10); - if (port > 1 || s == e) + ret = kstrtouint(s, 10, &port); + if (ret || port > 1) port = 0; early_serial_base = bases[port]; } @@ -149,8 +149,8 @@ static __init void early_serial_init(char *s) outb(0x3, early_serial_base + MCR); /* DTR + RTS */ if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) + ret = kstrtouint(s, 0, &baud); + if (ret || baud == 0) baud = DEFAULT_BAUD; } -- cgit v1.2.3-70-g09d2 From bacef661acdb634170a8faddbc1cf28e8f8b9eee Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 25 May 2012 16:20:31 +0100 Subject: x86-64/efi: Use EFI to deal with platform wall clock Other than ix86, x86-64 on EFI so far didn't set the {g,s}et_wallclock accessors to the EFI routines, thus incorrectly using raw RTC accesses instead. 
Simply removing the #ifdef around the respective code isn't enough, however: While so far early get-time calls were done in physical mode, this doesn't work properly for x86-64, as virtual addresses would still need to be set up for all runtime regions (which wasn't the case on the system I have access to), so instead the patch moves the call to efi_enter_virtual_mode() ahead (which in turn allows to drop all code related to calling efi-get-time in physical mode). Additionally the earlier calling of efi_set_executable() requires the CPA code to cope, i.e. during early boot it must be avoided to call cpa_flush_array(), as the first thing this function does is a BUG_ON(irqs_disabled()). Also make the two EFI functions in question here static - they're not being referenced elsewhere. Signed-off-by: Jan Beulich Tested-by: Matt Fleming Acked-by: Matthew Garrett Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBFBF5F020000780008637F@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 10 ++++++---- arch/x86/platform/efi/efi.c | 30 ++++-------------------------- include/linux/efi.h | 2 -- init/main.c | 8 ++++---- 4 files changed, 14 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e1ebde31521..ee09aca6399 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,11 +919,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it and in the - * error case we fall back to cpa_flush_all (which uses - * wbindv): + * avoid the wbinvd.
If the CPU does not support it, in the + * error case, and during early boot (for EFI) we fall back + * to cpa_flush_all (which uses wbinvd): */ - if (!ret && cpu_has_clflush) { + if (early_boot_irqs_disabled) + __cpa_flush_all((void *)(long)cache); + else if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 92660edaa1e..2dc29f51e75 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -234,22 +234,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -static efi_status_t __init phys_efi_get_time(efi_time_t *tm, - efi_time_cap_t *tc) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - efi_call_phys_prelog(); - status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), - virt_to_phys(tc)); - efi_call_phys_epilog(); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -int efi_set_rtc_mmss(unsigned long nowtime) +static int efi_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes; efi_status_t status; @@ -278,7 +263,7 @@ int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -unsigned long efi_get_time(void) +static unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; @@ -621,18 +606,13 @@ static int __init efi_runtime_init(void) } /* * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map + * EFI runtime service before set_virtual_address_map * is invoked. */ - efi_phys.get_time = (efi_get_time_t *)runtime->get_time; efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) runtime->set_virtual_address_map; - /* - * Make efi_get_time can be called before entering - * virtual mode. 
- */ - efi.get_time = phys_efi_get_time; + early_iounmap(runtime, sizeof(efi_runtime_services_t)); return 0; @@ -720,12 +700,10 @@ void __init efi_init(void) efi_enabled = 0; return; } -#ifdef CONFIG_X86_32 if (efi_native) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } -#endif #if EFI_DEBUG print_efi_memmap(); diff --git a/include/linux/efi.h b/include/linux/efi.h index ec45ccd8708..103adc6d7e3 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -503,8 +503,6 @@ extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); -extern unsigned long efi_get_time(void); -extern int efi_set_rtc_mmss(unsigned long nowtime); extern void efi_reserve_boot_services(void); extern struct efi_memory_map memmap; diff --git a/init/main.c b/init/main.c index 1ca6b32c482..eef30128321 100644 --- a/init/main.c +++ b/init/main.c @@ -460,6 +460,10 @@ static void __init mm_init(void) percpu_init_late(); pgtable_cache_init(); vmalloc_init(); +#ifdef CONFIG_X86 + if (efi_enabled) + efi_enter_virtual_mode(); +#endif } asmlinkage void __init start_kernel(void) @@ -601,10 +605,6 @@ asmlinkage void __init start_kernel(void) calibrate_delay(); pidmap_init(); anon_vma_init(); -#ifdef CONFIG_X86 - if (efi_enabled) - efi_enter_virtual_mode(); -#endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); -- cgit v1.2.3-70-g09d2 From a737f256bf14adf94920aa70d150ab4dcd145109 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 3 Jun 2012 21:17:48 +0300 Subject: KVM: Cleanup the kvm_print functions and introduce pr_XX wrappers Introduces a couple of print functions, which are essentially wrappers around standard printk functions, with a KVM: prefix. Functions introduced or modified are: - kvm_err(fmt, ...) - kvm_info(fmt, ...) 
- kvm_debug(fmt, ...) - kvm_pr_unimpl(fmt, ...) - pr_unimpl(vcpu, fmt, ...) -> vcpu_unimpl(vcpu, fmt, ...) Signed-off-by: Christoffer Dall Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 54 ++++++++++++++++++++++++------------------------ include/linux/kvm_host.h | 18 ++++++++++------ 4 files changed, 43 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af406b26..7a418783259 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __func__, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: return kvm_set_msr_common(vcpu, ecx, data); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f78662ec867..eeeb4a25aed 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4549,7 +4549,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; } vcpu->run->exit_reason = 0; - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f12a52408cd..a01a4241bc6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1437,8 +1437,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; } default: - pr_unimpl(vcpu, "HYPER-V 
unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } return 0; @@ -1470,8 +1470,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } @@ -1551,15 +1551,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { - pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); return 1; } break; @@ -1574,8 +1574,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) thus reserved and should throw a #GP */ return 1; } - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: @@ -1671,8 +1671,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: if (data != 0) - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. 
@@ -1681,8 +1681,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -1693,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_pmu_set_msr(vcpu, msr, data); if (pr || data != 0) - pr_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_K7_CLK_CTL: /* @@ -1720,7 +1720,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) @@ -1738,12 +1738,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr, data); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); return 1; } else { - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); break; } } @@ -1846,7 +1846,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = kvm->arch.hv_hypercall; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } @@ -1877,7 +1877,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.hv_vapic; break; default: - 
pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } *pdata = data; @@ -2030,10 +2030,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; } else { - pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); data = 0; } break; @@ -4116,7 +4116,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) value = kvm_get_cr8(vcpu); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); return 0; } @@ -4145,7 +4145,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res = kvm_set_cr8(vcpu, val); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); res = -1; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 19b83f6efa4..27ac8a4767f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -314,13 +314,19 @@ struct kvm { long tlbs_dirty; }; -/* The guest did something we don't support. */ -#define pr_unimpl(vcpu, fmt, ...) \ - pr_err_ratelimited("kvm: %i: cpu%i " fmt, \ - current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__) +#define kvm_err(fmt, ...) \ + pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) +#define kvm_info(fmt, ...) \ + pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) +#define kvm_debug(fmt, ...) \ + pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) +#define kvm_pr_unimpl(fmt, ...) \ + pr_err_ratelimited("kvm [%i]: " fmt, \ + task_tgid_nr(current), ## __VA_ARGS__) -#define kvm_printf(kvm, fmt ...) 
printk(KERN_DEBUG fmt) -#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) +/* The guest did something we don't support. */ +#define vcpu_unimpl(vcpu, fmt, ...) \ + kvm_pr_unimpl("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { -- cgit v1.2.3-70-g09d2 From 79f702a6d18c75760c68202007265b2310d6f44e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 3 Jun 2012 11:34:08 +0300 Subject: KVM: disable uninitialized var warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I see this in 3.5-rc1: arch/x86/kvm/mmu.c: In function ‘kvm_test_age_rmapp’: arch/x86/kvm/mmu.c:1271: warning: ‘iter.desc’ may be used uninitialized in this function The line in question was introduced by commit 1e3f42f03c38c29c1814199a6f0a2f01b919ea3f static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; <- line 1271 int young = 0; /* The reason I think is that the compiler assumes that the rmap value could be 0, so static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) { if (!rmap) return NULL; if (!(rmap & 1)) { iter->desc = NULL; return (u64 *)rmap; } iter->desc = (struct pte_list_desc *)(rmap & ~1ul); iter->pos = 0; return iter->desc->sptes[iter->pos]; } will not initialize iter.desc, but the compiler isn't smart enough to see that for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { will immediately exit in this case. I checked by adding if (!*rmapp) goto out; on top which is clearly equivalent but disables the warning. This patch uses uninitialized_var to disable the warning without increasing code size. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ca7164a74f..24dd43d45ae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1238,7 +1238,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { u64 *sptep; - struct rmap_iterator iter; + struct rmap_iterator uninitialized_var(iter); int young = 0; /* -- cgit v1.2.3-70-g09d2 From 5a425294ee7d4ab5a374248e85838dfd450caf75 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 15:30:31 +0200 Subject: perf/x86: Fix Intel shared extra MSR allocation Zheng Yan reported that event group validation can wreck event state when Intel extra_reg allocation changes event state. Validation shouldn't change any persistent state. Cloning events in validate_{event,group}() isn't really pretty either, so add a few special cases to avoid modifying the event state. The code is restructured to minimize the special case impact. 
Reported-by: Zheng Yan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338903031.28282.175.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 92 +++++++++++++++++++++++----------- 3 files changed, 66 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da018..cb608383e4f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6638aaf5449..83794d8e6af 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -117,6 +117,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ unsigned int group_flag; + int is_fake; /* * Intel DebugStore bits diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6ae..965baa2fa79 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +static int intel_alt_er(int idx) { if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) - return false; + return idx; - if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; - event->hw.extra_reg.idx = EXTRA_REG_RSP_1; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == 
EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; event->hw.config |= 0x01b7; - event->hw.extra_reg.idx = EXTRA_REG_RSP_0; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } - - if (event->hw.extra_reg.idx == orig_idx) - return false; - - return true; } /* @@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct event_constraint *c = &emptyconstraint; struct er_account *era; unsigned long flags; - int orig_idx = reg->idx; + int idx = reg->idx; - /* already allocated shared msr */ - if (reg->alloc) + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */ again: - era = &cpuc->shared_regs->regs[reg->idx]; + era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc @@ -1173,6 +1183,29 @@ again: if (!atomic_read(&era->ref) || era->config == reg->config) { + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. 
+ */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + /* lock in msr value */ era->config = reg->config; era->reg = reg->reg; @@ -1180,17 +1213,17 @@ again: /* one more user */ atomic_inc(&era->ref); - /* no need to reallocate during incremental event scheduling */ - reg->alloc = 1; - /* * need to call x86_get_event_constraint() * to check if associated event has constraints */ c = NULL; - } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, struct er_account *era; /* - * only put constraint if extra reg was actually - * allocated. Also takes care of event which do - * not use an extra shared reg + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. */ - if (!reg->alloc) + if (!reg->alloc || cpuc->is_fake) return; era = &cpuc->shared_regs->regs[reg->idx]; -- cgit v1.2.3-70-g09d2 From 0780c927a02492f917a74f51f3c801c76a637c57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: perf/x86: Implement cycles:p for SNB/IVB Now that there's finally a chip with working PEBS (IvyBridge), we can enable the hardware and implement cycles:p for SNB/IVB. 
Cc: Stephane Eranian Requested-and-tested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 50 ++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 83794d8e6af..7241e2fc3c1 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -365,6 +365,7 @@ struct x86_pmu { int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 965baa2fa79..2312c1ff1b1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1336,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_shared_regs_event_constraints(cpuc, event); } -static int intel_pmu_hw_config(struct perf_event *event) +static void intel_pebs_aliases_core2(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip && - (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. 
@@ -1365,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event) */ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. 
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); if (intel_pmu_needs_lbr_smpl(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -1643,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, @@ -1885,6 +1918,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; -- cgit v1.2.3-70-g09d2 From 47a8863dbb11745446314ca126593789ab74d93a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: perf/x86: Enable/Add IvyBridge hardware support Implement rudimentary IVB perf support. The SDM states it's identical to SNB with the exception of the exact event tables, but a quick look suggests they're similar enough. Also mark SNB-EP as broken for now.
Requested-and-tested-by: Linus Torvalds Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2312c1ff1b1..187c294bc65 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1909,8 +1909,9 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.3-70-g09d2 From 212d95dfdb66e5c81879b08e4f7fbfc8498b1ab5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: perf/x86: Update SNB PEBS constraints Afaict there's no need to (incompletely) iterate the MEM_UOPS_RETIRED.* umask state. 
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5a3edc27f6e..35e2192df9f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ -- cgit v1.2.3-70-g09d2 From 1c2ac3fde3e35279958e7b0408e2dcf866465301 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 May 2012 15:25:34 +0200 Subject: perf/x86: Fix wrmsrl() debug wrapper Move the wrmsrl() debug wrapper to the common header now that all the include games are gone.
Also clean it up a bit to avoid multiple evaluation of the argument. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-l4gkfnivwv4yi5mqxjlovymx@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 11 ----------- arch/x86/kernel/cpu/perf_event.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c4706cf9c01..43c2017347e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -35,17 +35,6 @@ #include "perf_event.h" -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ - (unsigned long)(val)); \ - native_write_msr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)); \ -} while (0) -#endif - struct x86_pmu x86_pmu __read_mostly; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7241e2fc3c1..23b5710b174 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -14,6 +14,18 @@ #include +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + unsigned int _msr = (msr); \ + u64 _val = (val); \ + trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ + (unsigned long long)(_val)); \ + native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ +} while (0) +#endif + /* * | NHM/WSM | SNB | * register ------------------------------- -- cgit v1.2.3-70-g09d2 From 1ff4d58a192aea7f245981e2579765f961f6eb9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Jun 2012 17:56:50 -0700 Subject: x86: Add rdpmcl() Add a version of rdpmc() that directly reads into a u64 Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338944211-28275-4-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 2 ++ 
arch/x86/include/asm/paravirt.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 084ef95274c..e489c1475be 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -237,6 +237,8 @@ do { \ (high) = (u32)(_l >> 32); \ } while (0) +#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) + #define rdtscp(low, high, aux) \ do { \ unsigned long long _val = native_read_tscp(&(aux)); \ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf5270..14ce05dfe04 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -252,6 +252,8 @@ do { \ high = _l >> 32; \ } while (0) +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) + static inline unsigned long long paravirt_rdtscp(unsigned int *aux) { return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); -- cgit v1.2.3-70-g09d2 From c48b60538c3ba05a7a2713c4791b25405525431b Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Thu, 1 Mar 2012 17:28:14 -0500 Subject: perf/x86: Use rdpmc() rather than rdmsr() when possible in the kernel The rdpmc instruction is faster than the equivalent rdmsr call, so use it when possible in the kernel. The perfctr kernel patches did this, after extensive testing showed rdpmc to always be faster (One can look in etc/costs in the perfctr-2.6 package to see a historical list of the overhead). I have done some tests on a 3.2 kernel, the kernel module I used was included in the first posting of this patch: rdmsr rdpmc Core2 T9900: 203.9 cycles 30.9 cycles AMD fam0fh: 56.2 cycles 9.8 cycles Atom 6/28/2: 129.7 cycles 50.6 cycles The speedup of using rdpmc is large. [ It's probably possible (and desirable) to do this without requiring a new field in the hw_perf_event structure, but the fixed events make this tricky.
] Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1203011724030.26934@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 +++- include/linux/perf_event.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 43c2017347e..000a4746c7c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -75,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base, new_raw_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -819,9 +819,11 @@ static inline void x86_assign_hw_event(struct perf_event *event, } else if (hwc->idx >= X86_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - X86_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = x86_pmu_addr_offset(hwc->idx); } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 45db49f64bb..1ce887abcc5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -677,6 +677,7 @@ struct hw_perf_event { u64 last_tag; unsigned long config_base; unsigned long event_base; + int event_base_rdpmc; int idx; int last_cpu; -- cgit v1.2.3-70-g09d2 From 70ab7003dec58afeae7f5d681dfa309b3a259f03 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Jun 2012 17:56:48 -0700 Subject: perf/x86: Don't assume there can be only 4 PEBS events On Sandy Bridge in non HT mode there are 8 counters available. 
Since every counter can write a PEBS record assuming there are 4 max is incorrect. Use the reported counter number -- with an upper limit for a static array -- instead. Also I made the warning messages a bit more informational. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338944211-28275-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 3 ++- arch/x86/kernel/cpu/perf_event_intel.c | 2 ++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++---- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 23b5710b174..3df3de9452a 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -69,7 +69,7 @@ struct amd_nb { }; /* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#define MAX_PEBS_EVENTS 8 /* * A debug store configuration. @@ -378,6 +378,7 @@ struct x86_pmu { void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 187c294bc65..e23e71f2526 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1800,6 +1800,8 @@ __init int intel_pmu_init(void) x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 35e2192df9f..026373edef7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -620,7 +620,7 @@ static void 
intel_pmu_drain_pebs_core(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > 1); + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); at += n - 1; __intel_pmu_pebs_event(event, iregs, at); @@ -651,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > MAX_PEBS_EVENTS); + WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n); for ( ; at < top; at++) { - for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { + for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) { event = cpuc->events[bit]; if (!test_bit(bit, cpuc->active_mask)) continue; @@ -670,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) break; } - if (!event || bit >= MAX_PEBS_EVENTS) + if (!event || bit >= x86_pmu.max_pebs_events) continue; __intel_pmu_pebs_event(event, iregs, at); -- cgit v1.2.3-70-g09d2 From 4bfaddf15bac7afa7048d105864dab65c5d1f9e7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Jun 2012 21:26:30 +0000 Subject: x86 bpf_jit: support BPF_S_ANC_ALU_XOR_X instruction commit ffe06c17afbb (filter: add XOR operation) added generic support for XOR operation. This patch implements the XOR instruction in x86 jit. Signed-off-by: Eric Dumazet Cc: Jiri Pirko Signed-off-by: David S. 
Miller --- arch/x86/net/bpf_jit_comp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0597f95b6da..33643a8bcbb 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -309,6 +309,10 @@ void bpf_jit_compile(struct sk_filter *fp) else EMIT1_off32(0x0d, K); /* or imm32,%eax */ break; + case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ + seen |= SEEN_XREG; + EMIT2(0x31, 0xd8); /* xor %ebx,%eax */ + break; case BPF_S_ALU_LSH_X: /* A <<= X; */ seen |= SEEN_XREG; EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ -- cgit v1.2.3-70-g09d2 From 24214449b00b94328e239d3c35cda3e6fe0f931b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 4 May 2012 18:28:21 +0200 Subject: x86, amd_nb: Export model 0x10 and later PCI id Add the F3 PCI id of F15h, model 0x10 to pci_ids.h and to the amd_nb code which generates the list of northbridges on an AMD box. Shorten define name while at it so that it fits into pci_ids.h. 
Acked-by: Clemens Ladisch Cc: Bjorn Helgaas Acked-by: Andreas Herrmann Signed-off-by: Borislav Petkov --- arch/x86/kernel/amd_nb.c | 1 + drivers/hwmon/k10temp.c | 5 +---- include/linux/pci_ids.h | 1 + 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854591c..153a0ee88fb 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -16,6 +16,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 7356b5ec8f6..f2fe8078633 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -33,9 +33,6 @@ static bool force; module_param(force, bool, 0444); MODULE_PARM_DESC(force, "force loading on processors with erratum 319"); -/* PCI-IDs for Northbridge devices not used anywhere else */ -#define PCI_DEVICE_ID_AMD_15H_M10H_NB_F3 0x1403 - /* CPUID function 0x80000001, ebx */ #define CPUID_PKGTYPE_MASK 0xf0000000 #define CPUID_PKGTYPE_F 0x00000000 @@ -213,7 +210,7 @@ static DEFINE_PCI_DEVICE_TABLE(k10temp_id_table) = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M10H_NB_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, {} }; MODULE_DEVICE_TABLE(pci, k10temp_id_table); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ab741b0d007..05fd02e4499 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -517,6 +517,7 @@ #define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302 #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 
#define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 +#define PCI_DEVICE_ID_AMD_15H_M10H_F3 0x1403 #define PCI_DEVICE_ID_AMD_15H_NB_F0 0x1600 #define PCI_DEVICE_ID_AMD_15H_NB_F1 0x1601 #define PCI_DEVICE_ID_AMD_15H_NB_F2 0x1602 -- cgit v1.2.3-70-g09d2 From 92e26e2a1a7d5fe6538e4a676f9c383966f894a7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 May 2012 16:20:49 +0200 Subject: x86, MCE, AMD: Remove shared banks sysfs linking The code used to create a symlink on all non-BSP cores of a node when the MCi_MISCj bank is present once per node. (This is generally the case with bank 4 on AMD). However, these sysfs links cause a bunch of problems with cpu off-/onlining testing and are, as such, a bit overengineered. IOW, there's nothing wrong with having normal sysfs files for the shared banks since the corresponding MSRs are replicated across each core anyway. Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 105 +++-------------------------------- 1 file changed, 7 insertions(+), 98 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f4873a64f46..2a5dd30f996 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -60,7 +60,6 @@ struct threshold_block { struct threshold_bank { struct kobject *kobj; struct threshold_block *blocks; - cpumask_var_t cpus; }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); @@ -224,8 +223,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); - if (shared_bank[bank] && c->cpu_core_id) - break; memset(&b, 0, sizeof(b)); b.cpu = cpu; @@ -555,89 +552,35 @@ local_allocate_threshold_blocks(int cpu, unsigned int bank) MSR_IA32_MC0_MISC + bank * 4); } -/* symlinks sibling shared banks to first core. first core owns dir/files. 
*/ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { - int i, err = 0; - struct threshold_bank *b = NULL; struct device *dev = per_cpu(mce_device, cpu); + struct threshold_bank *b = NULL; char name[32]; + int err = 0; sprintf(name, "threshold_bank%i", bank); -#ifdef CONFIG_SMP - if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_llc_shared_mask(cpu)); - - /* first core not up yet */ - if (cpu_data(i).cpu_core_id) - goto out; - - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; - - b = per_cpu(threshold_banks, i)[bank]; - - if (!b) - goto out; - - err = sysfs_create_link(&dev->kobj, b->kobj, name); - if (err) - goto out; - - cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); - per_cpu(threshold_banks, cpu)[bank] = b; - - goto out; - } -#endif - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } - if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { - kfree(b); - err = -ENOMEM; - goto out; - } b->kobj = kobject_create_and_add(name, &dev->kobj); - if (!b->kobj) + if (!b->kobj) { + err = -EINVAL; goto out_free; - -#ifndef CONFIG_SMP - cpumask_setall(b->cpus); -#else - cpumask_set_cpu(cpu, b->cpus); -#endif + } per_cpu(threshold_banks, cpu)[bank] = b; err = local_allocate_threshold_blocks(cpu, bank); - if (err) - goto out_free; - - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - dev = per_cpu(mce_device, i); - if (dev) - err = sysfs_create_link(&dev->kobj,b->kobj, name); - if (err) - goto out; - - per_cpu(threshold_banks, i)[bank] = b; - } - - goto out; + if (!err) + goto out; out_free: per_cpu(threshold_banks, cpu)[bank] = NULL; - free_cpumask_var(b->cpus); kfree(b); out: return err; @@ -660,12 +603,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu) return err; } -/* - * let's be hotplug friendly. 
- * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. - */ - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -689,9 +626,6 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { struct threshold_bank *b; - struct device *dev; - char name[32]; - int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; if (!b) @@ -699,36 +633,11 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (!b->blocks) goto free_out; - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - dev = per_cpu(mce_device, cpu); - sysfs_remove_link(&dev->kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - dev = per_cpu(mce_device, i); - if (dev) - sysfs_remove_link(&dev->kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; - } - deallocate_threshold_block(cpu, bank); free_out: kobject_del(b->kobj); kobject_put(b->kobj); - free_cpumask_var(b->cpus); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } -- cgit v1.2.3-70-g09d2 From 26ab256eaac7af26ecd9ba893b5159a3b38c8a1c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 May 2012 16:43:02 +0200 Subject: x86, MCE, AMD: Remove local_allocate_... wrapper It is unneeded now so drop it. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 2a5dd30f996..7fd02cac962 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -61,6 +61,7 @@ struct threshold_bank { struct kobject *kobj; struct threshold_block *blocks; }; + static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); static unsigned char shared_bank[NR_BANKS] = { @@ -545,13 +546,6 @@ out_free: return err; } -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) -{ - return allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); -} - static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { struct device *dev = per_cpu(mce_device, cpu); @@ -575,7 +569,8 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - err = local_allocate_threshold_blocks(cpu, bank); + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); if (!err) goto out; -- cgit v1.2.3-70-g09d2 From 019f34fccfd5cf5ff1e722dafd9fe2bd54434e66 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 May 2012 17:16:59 +0200 Subject: x86, MCE, AMD: Move shared bank to node descriptor Well, instead of having a real bank 4 on the BSP of each node and symlinks on the remaining cores, we push it up into the amd_northbridge descriptor which now contains a pointer to the northbridge bank 4 because the bank is one per northbridge and, as such, belongs in the NB descriptor anyway. Each time we hotplug CPUs, we use the northbridge pointer to copy the shared bank into the per-CPU array of threshold_banks pointers, or destroy it when the last CPU on the node goes offline, or create it when the first comes online. 
Signed-off-by: Borislav Petkov --- arch/x86/include/asm/amd_nb.h | 21 +++++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 107 ++++++++++++++++++++++++++++------- 2 files changed, 108 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 49ad773f4b9..b3341e9cd8f 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -26,10 +26,31 @@ struct amd_l3_cache { u8 subcaches[4]; }; +struct threshold_block { + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + bool interrupt_capable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; + + /* initialized to the number of CPUs on the node sharing this bank */ + atomic_t cpus; +}; + struct amd_northbridge { struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; + struct threshold_bank *bank4; }; struct amd_northbridge_info { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 7fd02cac962..d67c9e56d60 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -45,23 +46,6 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - bool interrupt_capable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; -}; - static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); static unsigned char shared_bank[NR_BANKS] = { @@ -546,15 +530,62 @@ out_free: return err; } +static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) +{ + 
struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; + + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; + + list_for_each_entry_safe(pos, tmp, head, miscj) { + + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); + + return err; + } + } + return err; +} + static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { struct device *dev = per_cpu(mce_device, cpu); + struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; char name[32]; int err = 0; sprintf(name, "threshold_bank%i", bank); + if (shared_bank[bank]) { + + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + WARN_ON(!nb); + + /* threshold descriptor already initialized on this node? */ + if (nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, cpu)[bank] = b; + atomic_inc(&b->cpus); + + err = __threshold_add_blocks(b); + + goto out; + } + } + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; @@ -569,15 +600,23 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; + if (shared_bank[bank]) { + atomic_set(&b->cpus, 1); + + /* nb is already initialized, see above */ + WARN_ON(nb->bank4); + nb->bank4 = b; + } + err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MC0_MISC + bank * 4); if (!err) goto out; -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; + out_free: kfree(b); -out: + + out: return err; } @@ -618,16 +657,44 @@ static void deallocate_threshold_block(unsigned int cpu, per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; } +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct 
threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + static void threshold_remove_bank(unsigned int cpu, int bank) { + struct amd_northbridge *nb; struct threshold_bank *b; b = per_cpu(threshold_banks, cpu)[bank]; if (!b) return; + if (!b->blocks) goto free_out; + if (shared_bank[bank]) { + if (!atomic_dec_and_test(&b->cpus)) { + __threshold_remove_blocks(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } else { + /* + * the last CPU on this node using the shared bank is + * going away, remove that bank now. + */ + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb->bank4 = NULL; + } + } + deallocate_threshold_block(cpu, bank); free_out: -- cgit v1.2.3-70-g09d2 From 18c20f373b76a64270a991396b06542abaf9f530 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Apr 2012 12:31:34 +0200 Subject: x86, MCE, AMD: Print decimal thresholding values If one sets the threshold limit, say to 25: $ echo 25 > machinecheck0/threshold_bank4/misc0/threshold_limit and then reads it back again, it gives $ cat machinecheck0/threshold_bank4/misc0/threshold_limit 19 which is actually 0x19 but we don't know that. Make all output decimal. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index d67c9e56d60..0b1bb0e1588 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -308,7 +308,7 @@ struct threshold_attr { #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) @@ -379,7 +379,7 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) struct threshold_block_cross_cpu tbcc = { .tb = b, }; smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lx\n", tbcc.retval); + return sprintf(buf, "%lu\n", tbcc.retval); } static ssize_t store_error_count(struct threshold_block *b, -- cgit v1.2.3-70-g09d2 From 2c9c42fa98c283961b7f6b6542fb4bf0c0539e15 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Apr 2012 12:53:59 +0200 Subject: x86, MCE, AMD: Cleanup reading of error_count We have rdmsr_on_cpu() now so remove locally defined solution in favor of the generic one. No functionality change. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 0b1bb0e1588..a7204ef3722 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -359,27 +359,14 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) return size; } -struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ - struct threshold_block_cross_cpu *tbcc = _tbcc; - struct threshold_block *b = tbcc->tb; - u32 low, high; - - rdmsr(b->address, low, high); - tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} - static ssize_t show_error_count(struct threshold_block *b, char *buf) { - struct threshold_block_cross_cpu tbcc = { .tb = b, }; + u32 lo, hi; + + rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); - smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lu\n", tbcc.retval); + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); } static ssize_t store_error_count(struct threshold_block *b, -- cgit v1.2.3-70-g09d2 From 6e927361bd403dbf5f6a2668a2a07df1f1b2daff Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Apr 2012 15:37:25 +0200 Subject: x86, MCE, AMD: Make error_count read only Until now, writing to error count caused the code to reset the thresholding bank to the current thresholding limit and start counting errors from the beginning. This is misleading and unclear, and can be accomplished by writing the old thresholding limit into ->threshold_limit. Make error_count read-only with the functionality to show the current error count. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a7204ef3722..e5ed2c7cb4d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -369,14 +369,10 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) (THRESHOLD_MAX - b->threshold_limit))); } -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return 1; -} +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; #define RW_ATTR(val) \ static struct threshold_attr val = { \ @@ -387,7 +383,6 @@ static struct threshold_attr val = { \ RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); -RW_ATTR(error_count); static struct attribute *default_attrs[] = { &threshold_limit.attr, -- cgit v1.2.3-70-g09d2 From 336d335a963a5a4f5d7f915b4d74ada5d7b4d05b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 4 May 2012 17:05:27 +0200 Subject: x86, MCE, AMD: Give proper names to the thresholding banks Having the banks numbered is ok but having real names which mean something to the user makes a lot more sense: /sys/devices/system/machinecheck/machinecheck0/ |-- bank0 |-- bank1 |-- bank2 |-- bank3 |-- bank4 |-- bank5 |-- bank6 |-- check_interval |-- cmci_disabled |-- combined_unit | |-- combined_unit | |-- error_count | |-- threshold_limit |-- dont_log_ce |-- execution_unit | |-- execution_unit | |-- error_count | |-- threshold_limit |-- ignore_ce |-- insn_fetch | |-- insn_fetch | |-- error_count | |-- threshold_limit |-- load_store | |-- load_store | |-- error_count | |-- threshold_limit |-- 
monarch_timeout |-- northbridge | |-- dram | | |-- error_count | | |-- interrupt_enable | | |-- threshold_limit | |-- ht_links | | |-- error_count | | |-- interrupt_enable | | |-- threshold_limit | |-- l3_cache | |-- error_count | |-- interrupt_enable | |-- threshold_limit ... Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e5ed2c7cb4d..e20bdf8d7c5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -46,6 +46,15 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "", + "northbridge", + "execution_unit", +}; + static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); static unsigned char shared_bank[NR_BANKS] = { @@ -68,6 +77,26 @@ struct thresh_restart { u16 old_limit; }; +static const char * const bank4_names(struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) { /* @@ -481,7 +510,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - "misc%i", block); + (bank == 4 ? 
bank4_names(b) : th_names[bank])); if (err) goto out_free; recurse: @@ -541,11 +570,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - char name[32]; + const char *name = th_names[bank]; int err = 0; - sprintf(name, "threshold_bank%i", bank); - if (shared_bank[bank]) { nb = node_to_amd_nb(amd_get_nb_id(cpu)); -- cgit v1.2.3-70-g09d2 From 11122570193a429f72014703ce6b849640f8d7e5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 May 2012 18:22:16 +0200 Subject: x86, MCE, AMD: Update copyrights and boilerplate Jacob is doing something else now so add myself as the loser who provides support. Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e20bdf8d7c5..671b95a2ffb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,15 +1,17 @@ /* - * (c) 2005, 2006 Advanced Micro Devices, Inc. + * (c) 2005-2012 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. 
* - Support : jacob.shin@amd.com + * Support: borislav.petkov@amd.com * * April 2006 * - added support for AMD Family 0x10 processors + * May 2012 + * - major scrubbing * * All MC4_MISCi registers are shared between multi-cores */ -- cgit v1.2.3-70-g09d2 From 7fbb98c5cb07563d3ee08714073a8e5452a96be2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jun 2012 10:21:21 -0400 Subject: x86: Save cr2 in NMI in case NMIs take a page fault Avi Kivity reported that page faults in NMIs could cause havoc if the NMI preempted another page fault handler: The recent changes to NMI allow exceptions to take place in NMI handlers, but I think that a #PF (say, due to access to vmalloc space) is still problematic. Consider the sequence #PF (cr2 set by processor) NMI ... #PF (cr2 clobbered) do_page_fault() IRET ... IRET do_page_fault() address = read_cr2() The last line reads the overwritten cr2 value. Originally I wrote a patch to solve this by saving the cr2 on the stack. Brian Gerst suggested to save it in the r12 register as both r12 and rbx are saved by the do_nmi handler as required by the C standard. But rbx is already used for saving if swapgs needs to be run on exit of the NMI handler. Link: http://lkml.kernel.org/r/4FBB8C40.6080304@redhat.com Link: http://lkml.kernel.org/r/1337763411.13348.140.camel@gandalf.stny.rr.com Reported-by: Avi Kivity Cc: Linus Torvalds Cc: H. Peter Anvin Cc: Thomas Gleixner Suggested-by: Brian Gerst Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133b51b..111f6bbd8b3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1758,10 +1758,30 @@ end_repeat_nmi: */ call save_paranoid DEFAULT_FRAME 0 + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. 
If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: -- cgit v1.2.3-70-g09d2 From 1f975f78c84c852e09463a2dfa57e3174e5c719e Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 1 Jun 2012 16:52:35 +0200 Subject: x86, pvops: Remove hooks for {rd,wr}msr_safe_regs There were paravirt_ops hooks for the full register set variant of {rd,wr}msr_safe which are actually not used by anyone anymore. Remove them to make the code cleaner and avoid silent breakages when the pvops members were uninitialized. This has been boot-tested natively and under Xen with PVOPS enabled and disabled on one machine. Signed-off-by: Andre Przywara Link: http://lkml.kernel.org/r/1338562358-28182-2-git-send-email-bp@amd64.org Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr.h | 67 +++++++++++++++-------------------- arch/x86/include/asm/paravirt.h | 39 -------------------- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/kernel/paravirt.c | 2 -- arch/x86/lib/msr-reg-export.c | 4 +-- arch/x86/lib/msr-reg.S | 10 +++--- arch/x86/xen/enlighten.c | 2 -- 7 files changed, 35 insertions(+), 91 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 084ef95274c..81860cc012d 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr, extern unsigned long long native_read_tsc(void); -extern int native_rdmsr_safe_regs(u32 regs[8]); -extern int native_wrmsr_safe_regs(u32 regs[8]); +extern int rdmsr_safe_regs(u32 regs[8]); +extern int wrmsr_safe_regs(u32 regs[8]); static __always_inline unsigned long long __native_read_tsc(void) { @@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = native_rdmsr_safe_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return native_wrmsr_safe_regs(gprs); -} - -static inline int rdmsr_safe_regs(u32 regs[8]) -{ - return native_rdmsr_safe_regs(regs); -} - -static inline int wrmsr_safe_regs(u32 regs[8]) -{ - return native_wrmsr_safe_regs(regs); -} - #define rdtscl(low) \ ((low) = (u32)__native_read_tsc()) @@ -248,6 +211,32 @@ do { \ #endif /* !CONFIG_PARAVIRT */ +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + u32 gprs[8] = { 0 }; + int err; + + gprs[1] = msr; + gprs[7] = 
0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} #define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf5270..ebb0cdb60a8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err) return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); } -static inline int paravirt_rdmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); -} - static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); } -static inline int paravirt_wrmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); -} - /* These should all do BUG_ON(_err), but our headers are too tangled. 
*/ #define rdmsr(msr, val1, val2) \ do { \ @@ -176,9 +166,6 @@ do { \ _err; \ }) -#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs) -#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs) - static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; @@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) *p = paravirt_read_msr(msr, &err); return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = paravirt_rdmsr_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return paravirt_wrmsr_regs(gprs); -} static inline u64 paravirt_read_tsc(void) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8e8b9a4987e..8613cbb7ba4 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -153,9 +153,7 @@ struct pv_cpu_ops { /* MSR, PMC and TSR operations. err = 0/-EFAULT. wrmsr returns 0/-EFAULT. 
*/ u64 (*read_msr)(unsigned int msr, int *err); - int (*rdmsr_regs)(u32 *regs); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - int (*wrmsr_regs)(u32 *regs); u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9ce885996fd..17fff18a103 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c index a311cc59b65..8d6ef78b5d0 100644 --- a/arch/x86/lib/msr-reg-export.c +++ b/arch/x86/lib/msr-reg-export.c @@ -1,5 +1,5 @@ #include #include -EXPORT_SYMBOL(native_rdmsr_safe_regs); -EXPORT_SYMBOL(native_wrmsr_safe_regs); +EXPORT_SYMBOL(rdmsr_safe_regs); +EXPORT_SYMBOL(wrmsr_safe_regs); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 69fa10623f2..f6d13eefad1 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -6,13 +6,13 @@ #ifdef CONFIG_X86_64 /* - * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushq_cfi %rbx pushq_cfi %rbp @@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs) .endm #else /* X86_32 */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushl_cfi %ebx pushl_cfi %ebp @@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) 
+ENDPROC(\op\()_safe_regs) .endm #endif diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e74df9548a0..60f1131eb94 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1116,9 +1116,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = xen_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, -- cgit v1.2.3-70-g09d2 From ecd431d95aa04257e977fd6878e4203ce462e7e9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 Jun 2012 16:52:36 +0200 Subject: x86, cpu: Fix show_msr MSR accessing function There's no real reason why, when showing the MSRs on a CPU at boottime, we should be using the AMD-specific variant. Simply use the generic safe one which handles #GPs just fine. Cc: Yinghai Lu Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1338562358-28182-3-git-send-email-bp@amd64.org Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b9333b429b..5bbc082c47a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -947,7 +947,7 @@ static void __cpuinit __print_cpu_msr(void) index_max = msr_range_array[i].max; for (index = index_min; index < index_max; index++) { - if (rdmsrl_amd_safe(index, &val)) + if (rdmsrl_safe(index, &val)) continue; printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } -- cgit v1.2.3-70-g09d2 From 169e9cbd77db23fe50bc8ba68bf081adb67b4220 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 1 Jun 2012 16:52:37 +0200 Subject: x86, cpu, amd: Fix crash as Xen Dom0 on AMD Trinity systems f7f286a910221 ("x86/amd: Re-enable CPU topology extensions in case BIOS has disabled it") wrongfully added code which used the AMD-specific {rd,wr}msr variants for no real reason. This caused boot panics on xen which wasn't initializing the {rd,wr}msr_safe_regs pv_ops members properly. This, in turn, caused a heated discussion leading to us reviewing all uses of the AMD-specific variants and removing them where unneeded (almost everywhere except an obscure K8 BIOS fix, see 6b0f43ddfa358). Finally, this patch switches to the standard {rd,wr}msr*_safe* variants which should've been used in the first place anyway and avoided unneeded excitation with xen. Signed-off-by: Andre Przywara Link: http://lkml.kernel.org/r/1338562358-28182-4-git-send-email-bp@amd64.org Cc: Andreas Herrmann Link: [Boris: correct and expand commit message] Signed-off-by: Borislav Petkov Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 146bb6218ee..80ccd99542e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -586,9 +586,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_TOPOEXT)) { u64 val; - if (!rdmsrl_amd_safe(0xc0011005, &val)) { + if (!rdmsrl_safe(0xc0011005, &val)) { val |= 1ULL << 54; - wrmsrl_amd_safe(0xc0011005, val); + checking_wrmsrl(0xc0011005, val); rdmsrl(0xc0011005, val); if (val & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); -- cgit v1.2.3-70-g09d2 From 2c929ce6f1ed1302be225512b433e6a6554f71a4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 Jun 2012 16:52:38 +0200 Subject: x86, cpu, amd: Deprecate AMD-specific MSR variants Now that all users of {rd,wr}msr_amd_safe have been fixed, deprecate its use by making them private to amd.c and adding warnings when used on anything else beside K8. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1338562358-28182-5-git-send-email-bp@amd64.org Acked-by: Konrad Rzeszutek Wilk Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr.h | 27 --------------------------- arch/x86/kernel/cpu/amd.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 81860cc012d..cb33b5f0026 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -211,33 +211,6 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = rdmsr_safe_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return wrmsr_safe_regs(gprs); -} - #define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 80ccd99542e..c928eb26ada 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -19,6 +19,39 @@ #include "cpu.h" +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + #ifdef CONFIG_X86_32 /* * B step AMD K6 before 
B 9730xxxx have hardware bugs that can cause -- cgit v1.2.3-70-g09d2 From 715c85b1fc824e9cd0ea07d6ceb80d2262f32e90 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 Jun 2012 13:32:04 -0700 Subject: x86, cpu: Rename checking_wrmsrl() to wrmsrl_safe() Rename checking_wrmsrl() to wrmsrl_safe(), to match the naming convention used by all the other MSR access functions/macros. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++--- arch/x86/kernel/cpu/perf_event_p4.c | 14 +++++++------- arch/x86/kernel/cpu/perf_event_p6.c | 4 ++-- arch/x86/kernel/process_64.c | 4 ++-- arch/x86/vdso/vdso32-setup.c | 6 +++--- 8 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index cb33b5f0026..fe83d74a920 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -211,7 +211,7 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ +#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) #define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c928eb26ada..9d92e19039f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -621,7 +621,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (!rdmsrl_safe(0xc0011005, &val)) { val |= 1ULL << 54; - checking_wrmsrl(0xc0011005, val); + wrmsrl_safe(0xc0011005, val); rdmsrl(0xc0011005, val); if (val & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); @@ -712,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); if (err == 0) { mask |= (1 << 10); - checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); } } diff --git 
a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da018..4e3ba9cb5a4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -222,7 +222,7 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. */ val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu_event_addr(0), val); + ret = wrmsrl_safe(x86_pmu_event_addr(0), val); ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); if (ret || val != val_new) goto msr_fail; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6ae..7789aa37c74 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1003,11 +1003,11 @@ static void intel_pmu_reset(void) printk("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); - checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); if (ds) ds->bts_index = ds->bts_buffer_base; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 47124a73dd7..6c82e403798 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void) * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! 
* - * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); - * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0); */ } @@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config) bind = &p4_pebs_bind_map[idx]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event) */ p4_pmu_enable_pebs(hwc->config); - (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 32bcfc7dd23..e4dd0f7a045 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - 
(void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7fdf09..3e215ba6876 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) task->thread.gs = addr; if (doit) { load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } } put_cpu(); @@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = checking_wrmsrl(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } } put_cpu(); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 66e6d935982..0faad646f5f 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -205,9 +205,9 @@ void syscall32_cpu_init(void) { /* Load these always in case some future AMD CPU supports SYSENTER from compat mode too. 
*/ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); wrmsrl(MSR_CSTAR, ia32_cstar_target); } -- cgit v1.2.3-70-g09d2 From 9d8e10667624ea6411f04495aef1fa4a8a778ee8 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:14:49 +0200 Subject: x86/apic: Factor out default vector_allocation_domain() operation Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131449.GC4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 21 +++++++++++++++++++++ arch/x86/kernel/apic/apic_flat_64.c | 22 +--------------------- arch/x86/kernel/apic/apic_noop.c | 3 +-- arch/x86/kernel/apic/apic_numachip.c | 8 +------- arch/x86/kernel/apic/bigsmp_32.c | 8 +------- arch/x86/kernel/apic/es7000_32.c | 19 ++----------------- arch/x86/kernel/apic/numaq_32.c | 16 +--------------- arch/x86/kernel/apic/probe_32.c | 17 +---------------- arch/x86/kernel/apic/summit_32.c | 16 +--------------- arch/x86/kernel/apic/x2apic_phys.c | 11 +---------- arch/x86/kernel/apic/x2apic_uv_x.c | 8 +------- 11 files changed, 32 insertions(+), 117 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bef571769e6..feb2dbdae9e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -615,6 +615,27 @@ extern unsigned int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask); +static inline void +flat_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + /* Careful. 
Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; +} + +static inline void +default_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_copy(retmask, cpumask_of(cpu)); +} + static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) { return physid_isset(apicid, *map); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 55b97ce4fa1..bddc92566d0 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -36,20 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* * Set up the logical destination ID. 
* @@ -257,12 +243,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { default_send_IPI_mask_sequence_phys(cpumask, vector); @@ -309,7 +289,7 @@ static struct apic apic_physflat = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = physflat_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ .init_apic_ldr = flat_init_apic_ldr, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7c3dd4fe068..3e43cf52893 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -104,8 +104,7 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + cpumask_copy(retmask, cpumask_of(cpu)); } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index dba4bf2ed56..c028132ad35 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -72,12 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) { union numachip_csr_g3_ext_irq_gen int_gen; @@ -222,7 +216,7 @@ static struct apic apic_numachip __refconst = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = 
numachip_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 907aa3d112a..df342fe4d6a 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -142,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } /* NULL entry stops DMI scanning */ }; -static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int probe_bigsmp(void) { if (def_to_bigsmp) @@ -176,7 +170,7 @@ static struct apic apic_bigsmp = { .check_apicid_used = bigsmp_check_apicid_used, .check_apicid_present = bigsmp_check_apicid_present, - .vector_allocation_domain = bigsmp_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index db4ab1be3c7..3c42865757e 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void) WARN(1, "Command failed, status = %x\n", mip_status); } -static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. 
- */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - - static void es7000_wait_for_init_deassert(atomic_t *deassert) { while (!atomic_read(deassert)) @@ -638,7 +623,7 @@ static struct apic __refdata apic_es7000_cluster = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr_cluster, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, @@ -705,7 +690,7 @@ static struct apic __refdata apic_es7000 = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f00a68cca37..eb2d466fd81 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -441,20 +441,6 @@ static int probe_numaq(void) return found_numaq; } -static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. 
- */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - static void numaq_setup_portio_remap(void) { int num_quads = num_online_nodes(); @@ -491,7 +477,7 @@ static struct apic __refdata apic_numaq = { .check_apicid_used = numaq_check_apicid_used, .check_apicid_present = numaq_check_apicid_present, - .vector_allocation_domain = numaq_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = numaq_init_apic_ldr, .ioapic_phys_id_map = numaq_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 71b6ac48ab2..2c6f003b2e4 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void) #endif } -static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* - * Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* should be called last. 
*/ static int probe_default(void) { @@ -105,7 +90,7 @@ static struct apic apic_default = { .check_apicid_used = default_check_apicid_used, .check_apicid_present = default_check_apicid_present, - .vector_allocation_domain = default_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 659897c0075..35d254c1fec 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -320,20 +320,6 @@ static int probe_summit(void) return 0; } -static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. 
- */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - #ifdef CONFIG_X86_SUMMIT_NUMA static struct rio_table_hdr *rio_table_hdr; static struct scal_detail *scal_devs[MAX_NUMNODES]; @@ -509,7 +495,7 @@ static struct apic apic_summit = { .check_apicid_used = summit_check_apicid_used, .check_apicid_present = summit_check_apicid_present, - .vector_allocation_domain = summit_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = summit_init_apic_ldr, .ioapic_phys_id_map = summit_ioapic_phys_id_map, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f730269edef..f109388a0e8 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -88,15 +88,6 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -/* - * Each logical cpu is in its own vector allocation domain. - */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static struct apic apic_x2apic_phys = { .name = "physical x2apic", @@ -114,7 +105,7 @@ static struct apic apic_x2apic_phys = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 16efb92bfea..df89a7d7874 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -185,12 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit uv_wakeup_secondary(int 
phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP @@ -363,7 +357,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = uv_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, -- cgit v1.2.3-70-g09d2 From 1bccd58bfffc5a677051937b332b71f0686187c1 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:15:15 +0200 Subject: x86/apic: Try to spread IRQ vectors to different priority levels When assigning a new vector it is primarily done by adding 8 to the previously given out vector number. Hence, two consecutively allocated vector numbers would likely fall into the same priority level. Try to spread vector numbers to different priority levels better by changing the step from 8 to 16. Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131514.GD4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 74c569791e7..05af3d341aa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1112,7 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * 0x80, because int 0x80 is hm, kind of importantish.
;) */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 8; + static int current_offset = VECTOR_OFFSET_START % 16; unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask; @@ -1148,10 +1148,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) vector = current_vector; offset = current_offset; next: - vector += 8; + vector += 16; if (vector >= first_system_vector) { - /* If out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; + offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } if (unlikely(current_vector == vector)) -- cgit v1.2.3-70-g09d2 From 8637e38aff14d048b649075114023023a2e80fba Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:15:44 +0200 Subject: x86/apic: Avoid useless scanning thru a cpumask in assign_irq_vector() In case of static vector allocation domains (i.e. flat) if all vector numbers are exhausted, an attempt to assign a new vector will lead to useless scans through all CPUs in the cpumask, even though it is known that each new pass would fail. Make this corner case less painful by letting vector_allocation_domain() report whether the vector allocation domain depends on the passed arguments or not, and stop scanning early. The same could have been achieved by introducing a static flag to the apic operations. But let's allow vector_allocation_domain() to have more intelligence here and decide dynamically, in case we would need it in the future.
Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131542.GE4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 +++++--- arch/x86/kernel/apic/apic_noop.c | 3 ++- arch/x86/kernel/apic/io_apic.c | 12 +++++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index feb2dbdae9e..e3fecd50d5c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,7 @@ struct apic { unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); + bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -615,7 +615,7 @@ extern unsigned int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask); -static inline void +static inline bool flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. 
Some cpus do not strictly honor the set of cpus @@ -628,12 +628,14 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask) */ cpumask_clear(retmask); cpumask_bits(retmask)[0] = APIC_ALL_CPUS; + return false; } -static inline void +static inline bool default_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_copy(retmask, cpumask_of(cpu)); + return true; } static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 3e43cf52893..ac9edf247b1 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,11 +100,12 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static bool noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); cpumask_copy(retmask, cpumask_of(cpu)); + return true; } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 05af3d341aa..4061a7dee5c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1137,8 +1137,9 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; + bool more_domains; - apic->vector_allocation_domain(cpu, tmp_mask); + more_domains = apic->vector_allocation_domain(cpu, tmp_mask); if (cpumask_subset(tmp_mask, cfg->domain)) { free_cpumask_var(tmp_mask); @@ -1153,8 +1154,13 @@ next: offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } - if (unlikely(current_vector == vector)) - continue; + + if (unlikely(current_vector == vector)) { + if (more_domains) + continue; + else + break; + } if (test_bit(vector, used_vectors)) goto next; -- cgit 
v1.2.3-70-g09d2 From ff164324123c0fe181d8de7dadcc7b3fbe25f2cf Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:15:59 +0200 Subject: x86/apic: Make cpu_mask_to_apicid() operations return error code Current cpu_mask_to_apicid() and cpu_mask_to_apicid_and() implementations have a few shortcomings: 1. A value returned by cpu_mask_to_apicid() is written to hardware registers unconditionally. Should BAD_APICID ever get returned, it will be written to the hardware too. But the value of BAD_APICID is not universal across all hardware in all modes and might cause unexpected results, i.e. interrupts might get routed to CPUs that are not configured to receive them. 2. Because the value of BAD_APICID is not universal, it is counter-intuitive to return it for hardware where it does not make sense (e.g. x2apic). 3. The cpu_mask_to_apicid_and() operation is thought of as a complement to cpu_mask_to_apicid() that only applies an AND mask on top of the cpumask being passed. Yet, as a consequence of commit 18374d8, the two operations are inconsistent in that: cpu_mask_to_apicid() should not get an offline CPU with the cpumask; cpu_mask_to_apicid_and() should not fail and return BAD_APICID. These limitations are impossible to realize just from looking at the operations' prototypes. Most of these shortcomings are resolved by returning an error code instead of BAD_APICID. As a result, faults are reported back early rather than leaving a possibility of unexpected behaviour (in case of [1]). The only exception is the setup_timer_IRQ0_pin() routine. Although obviously contradictory to this fix, its existing behaviour is preserved so as not to break the fragile check_timer(), and would be better addressed in a separate fix.
Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131559.GF4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 44 ++++++++++++------ arch/x86/kernel/apic/apic.c | 33 +++++++------ arch/x86/kernel/apic/es7000_32.c | 21 +++++---- arch/x86/kernel/apic/io_apic.c | 88 +++++++++++++++++++++++------------ arch/x86/kernel/apic/numaq_32.c | 14 ++++-- arch/x86/kernel/apic/summit_32.c | 22 +++++---- arch/x86/kernel/apic/x2apic_cluster.c | 24 ++++++---- arch/x86/kernel/apic/x2apic_uv_x.c | 27 +++++++---- arch/x86/platform/uv/uv_irq.c | 7 ++- drivers/iommu/intel_irq_remapping.c | 13 ++++-- 10 files changed, 188 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index e3fecd50d5c..ae91f9c7e36 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -331,9 +331,11 @@ struct apic { unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; - unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); - unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, - const struct cpumask *andmask); + int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, + unsigned int *apicid); + int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid); /* ipi */ void (*send_IPI_mask)(const struct cpumask *mask, int vector); @@ -591,29 +593,45 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif -static inline unsigned int -flat_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +__flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) { - return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; + cpu_mask &= APIC_ALL_CPUS; + if (likely(cpu_mask)) { + *apicid = (unsigned int)cpu_mask; + return 0; + } else { + return -EINVAL; + } } -static inline unsigned int 
+static inline int +flat_cpu_mask_to_apicid(const struct cpumask *cpumask, + unsigned int *apicid) +{ + return __flat_cpu_mask_to_apicid(cpumask_bits(cpumask)[0], apicid); +} + +static inline int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { unsigned long mask1 = cpumask_bits(cpumask)[0]; unsigned long mask2 = cpumask_bits(andmask)[0]; unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; - return (unsigned int)(mask1 & mask2 & mask3); + return __flat_cpu_mask_to_apicid(mask1 & mask2 & mask3, apicid); } -extern unsigned int -default_cpu_mask_to_apicid(const struct cpumask *cpumask); +extern int +default_cpu_mask_to_apicid(const struct cpumask *cpumask, + unsigned int *apicid); -extern unsigned int +extern int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask); + const struct cpumask *andmask, + unsigned int *apicid); static inline bool flat_vector_allocation_domain(int cpu, struct cpumask *retmask) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 96a2608252f..b8d92606f84 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,24 +2123,26 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -unsigned int default_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int __default_cpu_to_apicid(int cpu, unsigned int *apicid) { - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - cpu = cpumask_first(cpumask); - if (likely((unsigned)cpu < nr_cpu_ids)) - return per_cpu(x86_cpu_to_apicid, cpu); + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + return 0; + } else { + return -EINVAL; + } +} - return BAD_APICID; +int default_cpu_mask_to_apicid(const struct cpumask *cpumask, + unsigned int *apicid) +{ + int cpu = cpumask_first(cpumask); + return __default_cpu_to_apicid(cpu, apicid); } -unsigned int -default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) { int cpu; @@ -2148,7 +2150,8 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu); + + return __default_cpu_to_apicid(cpu, apicid); } /* diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 3c42865757e..515ebb00a9f 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -525,7 +525,8 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid) return 1; } -static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; int cpu, uninitialized_var(apicid); @@ -539,31 +540,33 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); - return BAD_APICID; + return -EINVAL; } apicid = new_apicid; round++; } - return apicid; + *dest_id = apicid; + return 0; } -static unsigned int +static int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = 
early_per_cpu(x86_cpu_to_logical_apicid, 0); + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = es7000_cpu_mask_to_apicid(cpumask); + es7000_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4061a7dee5c..0deb773404e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1359,7 +1359,14 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), + &dest)) { + pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1474,6 +1481,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; + unsigned int dest; if (irq_remapping_enabled) return; @@ -1484,9 +1492,12 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, * We use logical delivery to get the timer IRQ * to the first CPU. 
*/ + if (unlikely(apic->cpu_mask_to_apicid(apic->target_cpus(), &dest))) + dest = BAD_APICID; + entry.dest_mode = apic->irq_dest_mode; entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); + entry.dest = dest; entry.delivery_mode = apic->irq_delivery_mode; entry.polarity = 0; entry.trigger = 0; @@ -2245,16 +2256,25 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, unsigned int *dest_id) { struct irq_cfg *cfg = data->chip_data; + unsigned int irq = data->irq; + int err; if (!cpumask_intersects(mask, cpu_online_mask)) - return -1; + return -EINVAL; - if (assign_irq_vector(data->irq, data->chip_data, mask)) - return -1; + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } cpumask_copy(data->affinity, mask); - *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); return 0; } @@ -3040,7 +3060,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, if (err) return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; if (irq_remapped(cfg)) { compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); @@ -3361,6 +3384,8 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; + struct ht_irq_msg msg; + unsigned dest; int err; if (disable_apic) @@ -3368,36 +3393,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; + if (err) + return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus()); + 
err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; - write_ht_irq_msg(irq, &msg); + write_ht_irq_msg(irq, &msg); - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); - dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); - } - return err; + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); + + return 0; } #endif /* CONFIG_HT_IRQ */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index eb2d466fd81..2b55514c328 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -406,16 +406,20 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +numaq_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { - return 0x0F; + *apicid = 0x0F; + return 0; } -static inline unsigned int +static int numaq_cpu_mask_to_apicid_and(const struct 
cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - return 0x0F; + *apicid = 0x0F; + return 0; } /* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 35d254c1fec..5766d84f12d 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -263,7 +263,8 @@ static int summit_check_phys_apicid_present(int physical_apicid) return 1; } -static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; int cpu, apicid = 0; @@ -276,30 +277,33 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { printk("%s: Not a valid mask!\n", __func__); - return BAD_APICID; + return -EINVAL; } apicid |= new_apicid; round++; } - return apicid; + *dest_id = apicid; + return 0; } -static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) +static int +summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = summit_cpu_mask_to_apicid(cpumask); + summit_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } /* diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 612622c47df..5f86f79335f 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c 
@@ -96,24 +96,26 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { int cpu = cpumask_first(cpumask); - u32 dest = 0; int i; - if (cpu > nr_cpu_ids) - return BAD_APICID; + if (cpu >= nr_cpu_ids) + return -EINVAL; + *apicid = 0; for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu)) - dest |= per_cpu(x86_cpu_to_logical_apicid, i); + *apicid |= per_cpu(x86_cpu_to_logical_apicid, i); - return dest; + return 0; } -static unsigned int +static int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { u32 dest = 0; u16 cluster; @@ -128,7 +130,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, } if (!dest) - return BAD_APICID; + return -EINVAL; for_each_cpu_and(i, cpumask, andmask) { if (!cpumask_test_cpu(i, cpu_online_mask)) @@ -138,7 +140,9 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, dest |= per_cpu(x86_cpu_to_logical_apicid, i); } - return dest; + *apicid = dest; + + return 0; } static void init_x2apic_ldr(void) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index df89a7d7874..2f3030fef31 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -269,23 +269,31 @@ static void uv_init_apic_ldr(void) { } -static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int __uv_cpu_to_apicid(int cpu, unsigned int *apicid) +{ + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + return 0; + } else { + return -EINVAL; + } +} + +static int +uv_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { /* * We're using fixed IRQ delivery, 
can only return one phys APIC ID. * May as well be the first. */ int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - else - return BAD_APICID; + return __uv_cpu_to_apicid(cpu, apicid); } -static unsigned int +static int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { int cpu; @@ -297,7 +305,8 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + + return __uv_cpu_to_apicid(cpu, apicid); } static unsigned int x2apic_get_apic_id(unsigned long x) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index f25c2765a5c..dd1ff39a464 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; int mmr_pnode, err; + unsigned int dest; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; + err = apic->cpu_mask_to_apicid(eligible_cpu, &dest); + if (err != 0) + return err; + if (limit == UV_AFFINITY_CPU) irq_set_status_flags(irq, IRQ_NO_BALANCING); else @@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->dest = dest; mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 6d347064b8b..dafbad06390 100644 --- 
a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -924,6 +924,7 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_cfg *cfg = data->chip_data; unsigned int dest, irq = data->irq; struct irte irte; + int err; if (!cpumask_intersects(mask, cpu_online_mask)) return -EINVAL; @@ -931,10 +932,16 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, if (get_irte(irq, &irte)) return -EBUSY; - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); + err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); -- cgit v1.2.3-70-g09d2 From 4988a40c3981212fa8c64da68722affc1cb6697a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Jun 2012 15:16:25 +0200 Subject: x86/apic: Make cpu_mask_to_apicid() operations check cpu_online_mask Currently cpu_mask_to_apicid() should not get a offline CPU with the cpumask. Otherwise some apic drivers might try to access non-existent per-cpu variables (i.e. x2apic). In that regard cpu_mask_to_apicid() and cpu_mask_to_apicid_and() operations are inconsistent. This fix makes the two operations do not rely on calling functions and always return the apicid for only online CPUs. As result, the meaning and implementations of cpu_mask_to_apicid() and cpu_mask_to_apicid_and() operations become straight.
Signed-off-by: Alexander Gordeev Acked-by: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120607131624.GG4759@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 6 ++---- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/es7000_32.c | 3 +-- arch/x86/kernel/apic/summit_32.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 6 files changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ae91f9c7e36..1ed3eead203 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -596,7 +596,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) static inline int __flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) { - cpu_mask &= APIC_ALL_CPUS; + cpu_mask = cpu_mask & APIC_ALL_CPUS & cpumask_bits(cpu_online_mask)[0]; if (likely(cpu_mask)) { *apicid = (unsigned int)cpu_mask; return 0; @@ -619,9 +619,7 @@ flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, { unsigned long mask1 = cpumask_bits(cpumask)[0]; unsigned long mask2 = cpumask_bits(andmask)[0]; - unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; - - return __flat_cpu_mask_to_apicid(mask1 & mask2 & mask3, apicid); + return __flat_cpu_mask_to_apicid(mask1 & mask2, apicid); } extern int diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b8d92606f84..7e9bbe73bc5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2136,7 +2136,7 @@ static inline int __default_cpu_to_apicid(int cpu, unsigned int *apicid) int default_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { - int cpu = cpumask_first(cpumask); + int cpu = cpumask_first_and(cpumask, cpu_online_mask); return __default_cpu_to_apicid(cpu, apicid); } diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 
515ebb00a9f..b35cfb9b696 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -534,7 +534,7 @@ es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) /* * The cpus in the mask must all be on the apic cluster. */ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { @@ -561,7 +561,6 @@ es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); es7000_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 5766d84f12d..79d360f6729 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -272,7 +272,7 @@ summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) /* * The cpus in the mask must all be on the apic cluster. 
*/ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { @@ -298,7 +298,6 @@ summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); summit_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 5f86f79335f..23a46cf5b6f 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -99,7 +99,7 @@ static void x2apic_send_IPI_all(int vector) static int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) { - int cpu = cpumask_first(cpumask); + int cpu = cpumask_first_and(cpumask, cpu_online_mask); int i; if (cpu >= nr_cpu_ids) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2f3030fef31..307aa076bd6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -286,7 +286,7 @@ uv_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - int cpu = cpumask_first(cpumask); + int cpu = cpumask_first_and(cpumask, cpu_online_mask); return __uv_cpu_to_apicid(cpu, apicid); } -- cgit v1.2.3-70-g09d2 From 7eb9ba5ed312ec6ed9d22259c5da1acb7cf4bd29 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 8 Jun 2012 15:02:57 +0530 Subject: uprobes: Pass probed vaddr to arch_uprobe_analyze_insn() On RISC architectures like powerpc, instructions are fixed size. Instruction analysis on such platforms is just a matter of (insn % 4). Pass the vaddr at which the uprobe is to be inserted so that arch_uprobe_analyze_insn() can flag misaligned registration requests. 
Signed-off-by: Ananth N Mavinakaynahalli Cc: michael@ellerman.id.au Cc: antonb@thinktux.localdomain Cc: Paul Mackerras Cc: benh@kernel.crashing.org Cc: peterz@infradead.org Cc: Srikar Dronamraju Cc: Jim Keniston Cc: oleg@redhat.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20120608093257.GG13409@in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 2 +- arch/x86/kernel/uprobes.c | 3 ++- kernel/events/uprobes.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1e9bed14f7a..f3971bbcd1d 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -48,7 +48,7 @@ struct arch_uprobe_task { #endif }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910a7d9..36fd42091fa 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. 
*/ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { int ret; struct insn insn; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c5e043cd30..b52376d0233 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -706,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) return ret; -- cgit v1.2.3-70-g09d2 From c7d65a78fc18ed70353baeb7497ec71a7c775ac5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jun 2012 11:03:00 -0400 Subject: x86: Remove cmpxchg from i386 NMI nesting code I've been informed by someone on LWN called 'slashdot' that some i386 machines do not support a true cmpxchg. The cmpxchg used by the i386 NMI nesting code must be a true cmpxchg as disabling interrupts will not work for NMIs (which is the work around for i386s that do not have a true cmpxchg). This 'slashdot' character also suggested a fix to the issue. As the state of the nesting NMIs goes as follows: NOT_RUNNING -> EXECUTING EXECUTING -> NOT_RUNNING EXECUTING -> LATCHED LATCHED -> EXECUTING Having these states as enum values of: NOT_RUNNING = 0 EXECUTING = 1 LATCHED = 2 Instead of a cmpxchg to make EXECUTING -> NOT_RUNNING a dec_and_test() would work as well. If the dec_and_test brings the state to NOT_RUNNING, that is the same as a cmpxchg succeeding to change EXECUTING to NOT_RUNNING. If a nested NMI were to come in and change it to LATCHED, the dec_and_test() would convert the state to EXECUTING (what we want it to be in such a case anyway). I asked 'slashdot' to post this as a patch, but it never came to be. I decided to do the work instead. Thanks to H. 
Peter Anvin for suggesting to use this_cpu_dec_and_return() instead of local_dec_and_test(&__get_cpu_var()). Link: http://lwn.net/Articles/484932/ Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0b2f84457b..a15a8880066 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) #ifdef CONFIG_X86_32 /* * For i386, NMIs use the same stack as the kernel, and we can - * add a workaround to the iret problem in C. Simply have 3 states - * the NMI can be in. + * add a workaround to the iret problem in C (preventing nested + * NMIs if an NMI takes a trap). Simply have 3 states the NMI + * can be in: * * 1) not running * 2) executing @@ -383,13 +384,20 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) * If an NMI hits a breakpoint that executes an iret, another * NMI can preempt it. We do not want to allow this new NMI * to run, but we want to execute it when the first one finishes. - * We set the state to "latched", and the first NMI will perform - * an cmpxchg on the state, and if it doesn't successfully - * reset the state to "not running" it will restart the next - * NMI. + * We set the state to "latched", and the exit of the first NMI will + * perform a dec_return, if the result is zero (NOT_RUNNING), then + * it will simply exit the NMI handler. If not, the dec_return + * would have set the state to NMI_EXECUTING (what we want it to + * be when we are running). In this case, we simply jump back + * to rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. 
The HW will prevent nested NMIs + * at this point. */ enum nmi_states { - NMI_NOT_RUNNING, + NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED, }; @@ -397,18 +405,17 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state); #define nmi_nesting_preprocess(regs) \ do { \ - if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ - __get_cpu_var(nmi_state) = NMI_LATCHED; \ + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ + this_cpu_write(nmi_state, NMI_LATCHED); \ return; \ } \ - nmi_restart: \ - __get_cpu_var(nmi_state) = NMI_EXECUTING; \ - } while (0) + this_cpu_write(nmi_state, NMI_EXECUTING); \ + } while (0); \ + nmi_restart: #define nmi_nesting_postprocess() \ do { \ - if (cmpxchg(&__get_cpu_var(nmi_state), \ - NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + if (this_cpu_dec_return(nmi_state)) \ goto nmi_restart; \ } while (0) #else /* x86_64 */ -- cgit v1.2.3-70-g09d2 From 70fb74a5420f9caa3e001d65004e4b669124283e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jun 2012 11:54:37 -0400 Subject: x86: Save cr2 in NMI in case NMIs take a page fault (for i386) Avi Kivity reported that page faults in NMIs could cause havoc if the NMI preempted another page fault handler: The recent changes to NMI allow exceptions to take place in NMI handlers, but I think that a #PF (say, due to access to vmalloc space) is still problematic. Consider the sequence #PF (cr2 set by processor) NMI ... #PF (cr2 clobbered) do_page_fault() IRET ... IRET do_page_fault() address = read_cr2() The last line reads the overwritten cr2 value. This is the i386 version, which has the luxury of doing the work in C code. Link: http://lkml.kernel.org/r/4FBB8C40.6080304@redhat.com Reported-by: Avi Kivity Cc: Linus Torvalds Cc: H. 
Peter Anvin Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a15a8880066..f84f5c57de3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -395,6 +395,14 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) * thus there is no race between the first check of state for NOT_RUNNING * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. 
*/ enum nmi_states { NMI_NOT_RUNNING = 0, @@ -402,6 +410,7 @@ enum nmi_states { NMI_LATCHED, }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); #define nmi_nesting_preprocess(regs) \ do { \ @@ -410,11 +419,14 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state); return; \ } \ this_cpu_write(nmi_state, NMI_EXECUTING); \ + this_cpu_write(nmi_cr2, read_cr2()); \ } while (0); \ nmi_restart: #define nmi_nesting_postprocess() \ do { \ + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ + write_cr2(this_cpu_read(nmi_cr2)); \ if (this_cpu_dec_return(nmi_state)) \ goto nmi_restart; \ } while (0) -- cgit v1.2.3-70-g09d2 From e2b297fcf17fc03734e93387fb8195c782286b35 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Sun, 10 Jun 2012 21:13:41 -0600 Subject: perf/x86: Convert obsolete simple_strtoul() usage to kstrtoul() Signed-off-by: Shuah Khan Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1339384421.3025.8.camel@lorien2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 000a4746c7c..766c76d5ec4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1640,7 +1640,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long val = simple_strtoul(buf, NULL, 0); + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; -- cgit v1.2.3-70-g09d2 From 110c1e1f1bf61e5dca53ff5c9dc75243ce87c002 Mon Sep 17 00:00:00 2001 From: Ravikiran Thirumalai Date: Sun, 3 Jun 2012 01:11:35 +0300 Subject: x86/vsmp: Ignore IOAPIC IRQ affinity if possible vSMP can route interrupts more optimally based on internal knowledge the OS does not 
have. In order to support this optimization, all CPUs must be able to handle all possible IOAPIC interrupts. Fix this by setting the vector allocation domain for all CPUs and by enabling this feature in vSMP. Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim [ Rebased, simplified, and reworded the commit message. ] Signed-off-by: Ido Yariv Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 59eea855f45..6b96a7374f9 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -95,6 +96,15 @@ static void __init set_vsmp_pv_ops(void) ctl = readl(address + 4); printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); + + /* If possible, let the vSMP foundation route the interrupt optimally */ +#ifdef CONFIG_SMP + if (cap & ctl & BIT(8)) { + ctl &= ~BIT(8); + no_irq_affinity = 1; + } +#endif + if (cap & ctl & (1 << 4)) { /* Setup irq ops and turn on vSMP IRQ fastpath handling */ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); @@ -102,12 +112,11 @@ static void __init set_vsmp_pv_ops(void) pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); pv_init_ops.patch = vsmp_patch; - ctl &= ~(1 << 4); - writel(ctl, address + 4); - ctl = readl(address + 4); - printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl); } + writel(ctl, address + 4); + ctl = readl(address + 4); + pr_info("vSMP CTL: control set to:0x%08x\n", ctl); early_iounmap(address, 8); } @@ -192,10 +201,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) return hard_smp_processor_id() >> index_msb; } +/* + * In vSMP, all cpus should be capable of handling interrupts, regardless of + * the APIC used. 
+ */ +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_setall(retmask); +} + static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; + apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) -- cgit v1.2.3-70-g09d2 From 80feb89a0a1381642f1cce9036ef3bb22f13b40a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 29 May 2012 23:54:26 +0900 Subject: KVM: MMU: Remove unused parameter from mmu_memory_cache_alloc() Size is not needed to return one from pre-allocated objects. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 24dd43d45ae..b32a11dc884 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -652,8 +652,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_page_header_cache); } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; @@ -664,8 +663,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, - sizeof(struct pte_list_desc)); + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1403,12 +1401,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, - sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt 
= mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); -- cgit v1.2.3-70-g09d2 From 65df57743924c3d13e1fa1bcf5bf70fe874fcdfd Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 24 May 2012 11:13:42 +0200 Subject: crypto: sha1 - use Kbuild supplied flags for AVX test Commit ea4d26ae ("raid5: add AVX optimized RAID5 checksumming") introduced x86/ arch wide defines for AFLAGS and CFLAGS indicating AVX support in binutils based on the same test we have in x86/crypto/ right now. To minimize duplication, drop our implementation in favour of the one in x86/. Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 7 ------- arch/x86/crypto/sha1_ssse3_asm.S | 2 +- arch/x86/crypto/sha1_ssse3_glue.c | 6 +++--- 3 files changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index e191ac048b5..479f95a744f 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -34,12 +34,5 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o - ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o - -# enable AVX support only when $(AS) can actually assemble the instructions -ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes) -AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT -CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT -endif sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S 
index b2c2f57d70e..49d6987a73d 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -468,7 +468,7 @@ W_PRECALC_SSSE3 */ SHA1_VECTOR_ASM sha1_transform_ssse3 -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX .macro W_PRECALC_AVX diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index f916499d0ab..4a11a9d7245 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -35,7 +35,7 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, unsigned int rounds); -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX asmlinkage void sha1_transform_avx(u32 *digest, const char *data, unsigned int rounds); #endif @@ -184,7 +184,7 @@ static struct shash_alg alg = { } }; -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { u64 xcr0; @@ -209,7 +209,7 @@ static int __init sha1_ssse3_mod_init(void) if (cpu_has_ssse3) sha1_transform_asm = sha1_transform_ssse3; -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX /* allow AVX to override SSSE3, it's a little faster */ if (avx_usable()) sha1_transform_asm = sha1_transform_avx; -- cgit v1.2.3-70-g09d2 From 107778b592576c0c8e8d2ca7a2aa5415a4908223 Mon Sep 17 00:00:00 2001 From: Johannes Goetzfried Date: Mon, 28 May 2012 15:54:24 +0200 Subject: crypto: twofish - add x86_64/avx assembler implementation This patch adds an x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater than or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. 
twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 301 ++++++++ arch/x86/crypto/twofish_avx_glue.c | 1086 +++++++++++++++++++++++++++ arch/x86/crypto/twofish_glue_3way.c | 2 + crypto/Kconfig | 24 + crypto/tcrypt.c | 23 + crypto/testmgr.c | 60 ++ 7 files changed, 1498 insertions(+) create mode 100644 arch/x86/crypto/twofish-avx-x86_64-asm_64.S create mode 100644 arch/x86/crypto/twofish_avx_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 479f95a744f..3420feef0c7 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o 
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o +obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o @@ -30,6 +31,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o +twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S new file mode 100644 index 00000000000..fc31b89ba4c --- /dev/null +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -0,0 +1,301 @@ +/* + * Twofish Cipher 8-way parallel algorithm (AVX/x86_64) + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "twofish-avx-x86_64-asm_64.S" +.text + +/* structure of crypto context */ +#define s0 0 +#define s1 1024 +#define s2 2048 +#define s3 3072 +#define w 4096 +#define k 4128 + +/********************************************************************** + 8-way AVX twofish + **********************************************************************/ +#define CTX %rdi + +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 + +#define RA2 %xmm4 +#define RB2 %xmm5 +#define RC2 %xmm6 +#define RD2 %xmm7 + +#define RX %xmm8 +#define RY %xmm9 + +#define RK1 %xmm10 +#define RK2 %xmm11 + +#define RID1 %rax +#define RID1b %al +#define RID2 %rbx +#define RID2b %bl + +#define RGI1 %rdx +#define RGI1bl %dl +#define RGI1bh %dh +#define RGI2 %rcx +#define RGI2bl %cl +#define RGI2bh %ch + +#define RGS1 %r8 +#define RGS1d %r8d +#define RGS2 %r9 +#define RGS2d %r9d +#define RGS3 %r10 +#define RGS3d %r10d + + +#define lookup_32bit(t0, t1, t2, t3, src, dst) \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + movl t0(CTX, RID1, 4), dst ## d; \ + xorl t1(CTX, RID2, 4), dst ## d; \ + shrq $16, src; \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + xorl t2(CTX, RID1, 4), dst ## d; \ + xorl t3(CTX, RID2, 4), dst ## d; + +#define G(a, x, t0, t1, t2, t3) \ + vmovq a, RGI1; \ + vpsrldq $8, a, x; \ + vmovq x, RGI2; \ + \ + lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \ + shrq $16, RGI1; \ + lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \ + shlq $32, RGS2; \ + orq RGS1, RGS2; \ + \ + lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \ + shrq $16, RGI2; \ + lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \ + shlq $32, RGS3; \ + orq RGS1, RGS3; \ + \ + vmovq RGS2, x; \ + vpinsrq $1, RGS3, x, x; + +#define encround(a, b, c, d, x, y) \ + G(a, x, s0, s1, s2, s3); \ 
+ G(b, y, s1, s2, s3, s0); \ + vpaddd x, y, x; \ + vpaddd y, x, y; \ + vpaddd x, RK1, x; \ + vpaddd y, RK2, y; \ + vpxor x, c, c; \ + vpsrld $1, c, x; \ + vpslld $(32 - 1), c, c; \ + vpor c, x, c; \ + vpslld $1, d, x; \ + vpsrld $(32 - 1), d, d; \ + vpor d, x, d; \ + vpxor d, y, d; + +#define decround(a, b, c, d, x, y) \ + G(a, x, s0, s1, s2, s3); \ + G(b, y, s1, s2, s3, s0); \ + vpaddd x, y, x; \ + vpaddd y, x, y; \ + vpaddd y, RK2, y; \ + vpxor d, y, d; \ + vpsrld $1, d, y; \ + vpslld $(32 - 1), d, d; \ + vpor d, y, d; \ + vpslld $1, c, y; \ + vpsrld $(32 - 1), c, c; \ + vpor c, y, c; \ + vpaddd x, RK1, x; \ + vpxor x, c, c; + +#define encrypt_round(n, a, b, c, d) \ + vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ + vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ + encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ + encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); + +#define decrypt_round(n, a, b, c, d) \ + vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ + vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ + decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ + decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); + +#define encrypt_cycle(n) \ + encrypt_round((2*n), RA, RB, RC, RD); \ + encrypt_round(((2*n) + 1), RC, RD, RA, RB); + +#define decrypt_cycle(n) \ + decrypt_round(((2*n) + 1), RC, RD, RA, RB); \ + decrypt_round((2*n), RA, RB, RC, RD); + + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x3; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; \ + vpunpcklqdq x3, t2, x2; \ + vpunpckhqdq x3, t2, x3; + +#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ + vpxor (0*4*4)(in), wkey, x0; \ + vpxor (1*4*4)(in), wkey, x1; \ + vpxor (2*4*4)(in), wkey, x2; \ + vpxor (3*4*4)(in), wkey, x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpxor x0, wkey, 
x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor x1, wkey, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor x2, wkey, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor x3, wkey, x3; \ + vmovdqu x3, (3*4*4)(out); + +#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpxor x0, wkey, x0; \ + vpxor (0*4*4)(out), x0, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor x1, wkey, x1; \ + vpxor (1*4*4)(out), x1, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor x2, wkey, x2; \ + vpxor (2*4*4)(out), x2, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor x3, wkey, x3; \ + vpxor (3*4*4)(out), x3, x3; \ + vmovdqu x3, (3*4*4)(out); + +.align 8 +.global __twofish_enc_blk_8way +.type __twofish_enc_blk_8way,@function; + +__twofish_enc_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + pushq %rbx; + pushq %rcx; + + vmovdqu w(CTX), RK1; + + leaq (4*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + + xorq RID1, RID1; + xorq RID2, RID2; + + encrypt_cycle(0); + encrypt_cycle(1); + encrypt_cycle(2); + encrypt_cycle(3); + encrypt_cycle(4); + encrypt_cycle(5); + encrypt_cycle(6); + encrypt_cycle(7); + + vmovdqu (w+4*4)(CTX), RK1; + + popq %rcx; + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + leaq (4*4*4)(%rax), %rdx; + + testb %cl, %cl; + jnz __enc_xor8; + + outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); + outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + + ret; + +__enc_xor8: + outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); + outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + + ret; + +.align 8 +.global twofish_dec_blk_8way +.type twofish_dec_blk_8way,@function; + +twofish_dec_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %rbx; + + vmovdqu (w+4*4)(CTX), RK1; + + leaq (4*4*4)(%rdx), %rax; + 
inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); + inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + + xorq RID1, RID1; + xorq RID2, RID2; + + decrypt_cycle(7); + decrypt_cycle(6); + decrypt_cycle(5); + decrypt_cycle(4); + decrypt_cycle(3); + decrypt_cycle(2); + decrypt_cycle(1); + decrypt_cycle(0); + + vmovdqu (w)(CTX), RK1; + + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + + ret; diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c new file mode 100644 index 00000000000..599f19e4bef --- /dev/null +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -0,0 +1,1086 @@ +/* + * Glue Code for AVX assembler version of Twofish Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * Glue code based on serpent_sse2_glue.c by: + * Copyright (C) 2011 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define TWOFISH_PARALLEL_BLOCKS 8 + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +/* 3-way parallel cipher functions from twofish_x86_64-3way module */ +asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_3way(ctx, dst, src, false); +} + +static inline void twofish_enc_blk_3way_xor(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_3way(ctx, dst, src, true); +} + +/* 8-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_8way(ctx, dst, src, false); +} + +static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_8way(ctx, dst, src, true); +} + +static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + twofish_dec_blk_8way(ctx, dst, src); +} + + + +struct async_twofish_ctx { + struct cryptd_ablkcipher *cryptd_tfm; +}; + 
+static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + if (fpu_enabled) + return true; + + /* AVX is only used when chunk to be processed is large enough, so + * do not enable FPU until it is necessary. + */ + if (nbytes < TF_BLOCK_SIZE * TWOFISH_PARALLEL_BLOCKS) + return false; + + kernel_fpu_begin(); + return true; +} + +static inline void twofish_fpu_end(bool fpu_enabled) +{ + if (fpu_enabled) + kernel_fpu_end(); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + bool enc) +{ + bool fpu_enabled = false; + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = twofish_fpu_begin(fpu_enabled, nbytes); + + /* Process multi-block batch */ + if (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS) { + do { + if (enc) + twofish_enc_blk_xway(ctx, wdst, wsrc); + else + twofish_dec_blk_xway(ctx, wdst, wsrc); + + wsrc += bsize * TWOFISH_PARALLEL_BLOCKS; + wdst += bsize * TWOFISH_PARALLEL_BLOCKS; + nbytes -= bsize * TWOFISH_PARALLEL_BLOCKS; + } while (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + if (enc) + twofish_enc_blk_3way(ctx, wdst, wsrc); + else + twofish_dec_blk_3way(ctx, wdst, wsrc); + + wsrc += bsize * 3; + wdst += bsize * 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (enc) + twofish_enc_blk(ctx, wdst, wsrc); + else + twofish_dec_blk(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + 
twofish_fpu_end(fpu_enabled); + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, true); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, false); +} + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; + u128 last_iv; + int i; + + /* Start of the last block. 
*/ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process multi-block batch */ + if (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS) { + do { + nbytes -= bsize * (TWOFISH_PARALLEL_BLOCKS - 1); + src -= TWOFISH_PARALLEL_BLOCKS - 1; + dst -= TWOFISH_PARALLEL_BLOCKS - 1; + + for (i = 0; i < TWOFISH_PARALLEL_BLOCKS - 1; i++) + ivs[i] = src[i]; + + twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (i = 0; i < TWOFISH_PARALLEL_BLOCKS - 1; i++) + u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + nbytes -= bsize * (3 - 1); + src -= 3 - 1; + dst -= 3 - 1; + + ivs[0] = src[0]; + ivs[1] = src[1]; + + twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + + u128_xor(dst + 1, dst + 1, ivs + 0); + u128_xor(dst + 2, dst + 2, ivs + 1); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes)) { + fpu_enabled = twofish_fpu_begin(fpu_enabled, nbytes); + nbytes = __cbc_decrypt(desc, 
&walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + twofish_fpu_end(fpu_enabled); + return err; +} + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +static void ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *ctrblk = walk->iv; + u8 keystream[TF_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + twofish_enc_blk(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, TF_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + be128 ctrblocks[TWOFISH_PARALLEL_BLOCKS]; + int i; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process multi-block batch */ + if (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS) { + do { + /* create ctrblks for parallel encrypt */ + for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblocks[i], &ctrblk); + u128_inc(&ctrblk); + } + + twofish_enc_blk_xway_xor(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += TWOFISH_PARALLEL_BLOCKS; + dst += TWOFISH_PARALLEL_BLOCKS; + nbytes -= bsize * TWOFISH_PARALLEL_BLOCKS; + } while (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Process three block batch */ + if (nbytes >= 
bsize * 3) { + do { + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + } + + /* create ctrblks for parallel encrypt */ + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + u128_to_be128(&ctrblocks[1], &ctrblk); + u128_inc(&ctrblk); + u128_to_be128(&ctrblocks[2], &ctrblk); + u128_inc(&ctrblk); + + twofish_enc_blk_3way_xor(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += 3; + dst += 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + + twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + u128_xor(dst, dst, (u128 *)ctrblocks); + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { + fpu_enabled = twofish_fpu_begin(fpu_enabled, nbytes); + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + twofish_fpu_end(fpu_enabled); + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +struct crypt_priv { + struct twofish_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { + 
twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) + twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); + + nbytes %= bsize * 3; + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_enc_blk(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { + twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) + twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); + + nbytes %= bsize * 3; + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_dec_blk(ctx->ctx, srcdst, srcdst); +} + +struct twofish_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct twofish_ctx twofish_ctx; +}; + +static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __twofish_setkey(&ctx->twofish_ctx, key, + keylen - TF_BLOCK_SIZE, &tfm->crt_flags); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - + TF_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->twofish_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + 
twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->twofish_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +struct twofish_xts_ctx { + struct twofish_ctx tweak_ctx; + struct twofish_ctx crypt_ctx; +}; + +static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __twofish_setkey(&ctx->tweak_ctx, + key + keylen / 2, keylen / 2, flags); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = 
sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; +} + +static int __ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->encrypt( + &desc, req->dst, req->src, 
req->nbytes); +} + +static int ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_encrypt(cryptd_req); + } else { + return __ablk_encrypt(req); + } +} + +static int ablk_decrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_decrypt(cryptd_req); + } else { + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->decrypt( + &desc, req->dst, req->src, req->nbytes); + } +} + +static void ablk_exit(struct crypto_tfm *tfm) +{ + struct async_twofish_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ablkcipher(ctx->cryptd_tfm); +} + +static int ablk_init(struct crypto_tfm *tfm) +{ + struct async_twofish_ctx *ctx = crypto_tfm_ctx(tfm); + struct cryptd_ablkcipher *cryptd_tfm; + char drv_name[CRYPTO_MAX_ALG_NAME]; + + snprintf(drv_name, sizeof(drv_name), "__driver-%s", + crypto_tfm_alg_driver_name(tfm)); + + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); + + return 0; +} + +static struct crypto_alg twofish_algs[10] = { { + .cra_name = 
"__ecb-twofish-avx", + .cra_driver_name = "__driver-ecb-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[0].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-twofish-avx", + .cra_driver_name = "__driver-cbc-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-twofish-avx", + .cra_driver_name = "__driver-ctr-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-twofish-avx", + .cra_driver_name = "__driver-lrw-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = 
THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[3].cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE + + TF_BLOCK_SIZE, + .max_keysize = TF_MAX_KEY_SIZE + + TF_BLOCK_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = lrw_twofish_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-twofish-avx", + .cra_driver_name = "__driver-xts-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[4].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE * 2, + .max_keysize = TF_MAX_KEY_SIZE * 2, + .ivsize = TF_BLOCK_SIZE, + .setkey = xts_twofish_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(twofish)", + .cra_driver_name = "ecb-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[5].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(twofish)", + .cra_driver_name = "cbc-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[6].cra_list), + .cra_init = 
ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(twofish)", + .cra_driver_name = "ctr-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[7].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(twofish)", + .cra_driver_name = "lrw-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[8].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE + + TF_BLOCK_SIZE, + .max_keysize = TF_MAX_KEY_SIZE + + TF_BLOCK_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(twofish)", + .cra_driver_name = "xts-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = 
LIST_HEAD_INIT(twofish_algs[9].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE * 2, + .max_keysize = TF_MAX_KEY_SIZE * 2, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init twofish_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) { + printk(KERN_INFO "AVX instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + printk(KERN_INFO "AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); +} + +static void __exit twofish_exit(void) +{ + crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); +} + +module_init(twofish_init); +module_exit(twofish_exit); + +MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("twofish"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 922ab24cce3..77e4e55a266 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -45,8 +45,10 @@ asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, /* 3-way parallel cipher functions */ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, bool xor); +EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(twofish_dec_blk_3way); static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) diff --git a/crypto/Kconfig b/crypto/Kconfig index 8e84225c096..e00a4e49e01 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -913,6 +913,30 @@ config CRYPTO_TWOFISH_X86_64_3WAY See also: +config CRYPTO_TWOFISH_AVX_X86_64 + tristate "Twofish 
cipher algorithm (x86_64/AVX)" + depends on X86 && 64BIT + select CRYPTO_ALGAPI + select CRYPTO_CRYPTD + select CRYPTO_TWOFISH_COMMON + select CRYPTO_TWOFISH_X86_64 + select CRYPTO_TWOFISH_X86_64_3WAY + select CRYPTO_LRW + select CRYPTO_XTS + help + Twofish cipher algorithm (x86_64/AVX). + + Twofish was submitted as an AES (Advanced Encryption Standard) + candidate cipher by researchers at CounterPane Systems. It is a + 16 round block cipher supporting key sizes of 128, 192, and 256 + bits. + + This module provides the Twofish cipher algorithm that processes + eight blocks parallel using the AVX Instruction Set. + + See also: + + comment "Compression" config CRYPTO_DEFLATE diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 750cce44bad..2af879786e7 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1566,6 +1566,29 @@ static int do_test(int m) speed_template_32_64); break; + case 504: + test_acipher_speed("ecb(twofish)", ENCRYPT, sec, NULL, 0, + speed_template_16_24_32); + test_acipher_speed("ecb(twofish)", DECRYPT, sec, NULL, 0, + speed_template_16_24_32); + test_acipher_speed("cbc(twofish)", ENCRYPT, sec, NULL, 0, + speed_template_16_24_32); + test_acipher_speed("cbc(twofish)", DECRYPT, sec, NULL, 0, + speed_template_16_24_32); + test_acipher_speed("ctr(twofish)", ENCRYPT, sec, NULL, 0, + speed_template_16_24_32); + test_acipher_speed("ctr(twofish)", DECRYPT, sec, NULL, 0, + speed_template_16_24_32); + test_acipher_speed("lrw(twofish)", ENCRYPT, sec, NULL, 0, + speed_template_32_40_48); + test_acipher_speed("lrw(twofish)", DECRYPT, sec, NULL, 0, + speed_template_32_40_48); + test_acipher_speed("xts(twofish)", ENCRYPT, sec, NULL, 0, + speed_template_32_48_64); + test_acipher_speed("xts(twofish)", DECRYPT, sec, NULL, 0, + speed_template_32_48_64); + break; + case 1000: test_available(); break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index eb6d20f8ec5..73b3ec6fe1a 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1548,6 +1548,21 @@ static const 
struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__cbc-twofish-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__driver-cbc-aes-aesni", .test = alg_test_null, @@ -1578,6 +1593,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__driver-cbc-twofish-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__driver-ecb-aes-aesni", .test = alg_test_null, @@ -1608,6 +1638,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__driver-ecb-twofish-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__ghash-pclmulqdqni", .test = alg_test_null, @@ -1805,6 +1850,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "cryptd(__driver-ecb-twofish-avx)", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "cryptd(__ghash-pclmulqdqni)", .test = alg_test_null, -- cgit v1.2.3-70-g09d2 From 7efe4076725aeb01722445b56613681aa492c8d6 Mon Sep 17 00:00:00 2001 From: Johannes Goetzfried Date: Tue, 12 Jun 2012 16:47:43 +0800 Subject: crypto: serpent - add x86_64/avx assembler implementation This patch adds an x86_64/avx assembler implementation of the Serpent block cipher. The implementation is very similar to the sse2 implementation and processes eight blocks in parallel. Because of the new non-destructive three operand syntax all move-instructions can be removed and therefore a little performance increase is provided. Patch has been tested with tcrypt and automated filesystem tests. 
Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) serpent-avx-x86_64 vs. serpent-sse2-x86_64 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.03x 1.01x 1.01x 1.01x 1.00x 1.00x 1.00x 1.00x 1.00x 1.01x 64B 1.00x 1.00x 1.00x 1.00x 1.00x 0.99x 1.00x 1.01x 1.00x 1.00x 256B 1.05x 1.03x 1.00x 1.02x 1.05x 1.06x 1.05x 1.02x 1.05x 1.02x 1024B 1.05x 1.02x 1.00x 1.02x 1.05x 1.06x 1.05x 1.03x 1.05x 1.02x 8192B 1.05x 1.02x 1.00x 1.02x 1.06x 1.06x 1.04x 1.03x 1.04x 1.02x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.01x 1.00x 1.01x 1.01x 1.00x 1.00x 0.99x 1.03x 1.01x 1.01x 64B 1.00x 1.00x 1.00x 1.00x 1.00x 1.00x 1.00x 1.01x 1.00x 1.02x 256B 1.05x 1.02x 1.00x 1.02x 1.05x 1.02x 1.04x 1.05x 1.05x 1.02x 1024B 1.06x 1.02x 1.00x 1.02x 1.07x 1.06x 1.05x 1.04x 1.05x 1.02x 8192B 1.05x 1.02x 1.00x 1.02x 1.06x 1.06x 1.04x 1.05x 1.05x 1.02x serpent-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.26x 1.73x ecb-dec 1.20x 1.64x cbc-enc 0.33x 0.45x cbc-dec 1.24x 1.67x ctr-enc 1.32x 1.76x ctr-dec 1.32x 1.76x lrw-enc 1.20x 1.60x lrw-dec 1.15x 1.54x xts-enc 1.22x 1.64x xts-dec 1.17x 1.57x Signed-off-by: Johannes Goetzfried Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 704 +++++++++++++++++++++ arch/x86/crypto/serpent_avx_glue.c | 949 ++++++++++++++++++++++++++++ crypto/Kconfig | 20 + crypto/testmgr.c | 60 ++ 5 files changed, 1735 insertions(+) create mode 100644 arch/x86/crypto/serpent-avx-x86_64-asm_64.S create mode 100644 arch/x86/crypto/serpent_avx_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3420feef0c7..83caa4b948c 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o 
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o +obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -34,6 +35,7 @@ twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S new file mode 100644 index 00000000000..0ed47a124ba --- /dev/null +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -0,0 +1,704 @@ +/* + * Serpent Cipher 8-way parallel algorithm (x86_64/AVX) + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by + * Copyright (C) 2011 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "serpent-avx-x86_64-asm_64.S" +.text + +#define CTX %rdi + +/********************************************************************** + 8-way AVX serpent + **********************************************************************/ +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 +#define RE1 %xmm4 + +#define tp %xmm5 + +#define RA2 %xmm6 +#define RB2 %xmm7 +#define RC2 %xmm8 +#define RD2 %xmm9 +#define RE2 %xmm10 + +#define RNOT %xmm11 + +#define RK0 %xmm12 +#define RK1 %xmm13 +#define RK2 %xmm14 +#define RK3 %xmm15 + + +#define S0_1(x0, x1, x2, x3, x4) \ + vpor x0, x3, tp; \ + vpxor x3, x0, x0; \ + vpxor x2, x3, x4; \ + vpxor RNOT, x4, x4; \ + vpxor x1, tp, x3; \ + vpand x0, x1, x1; \ + vpxor x4, x1, x1; \ + vpxor x0, x2, x2; +#define S0_2(x0, x1, x2, x3, x4) \ + vpxor x3, x0, x0; \ + vpor x0, x4, x4; \ + vpxor x2, x0, x0; \ + vpand x1, x2, x2; \ + vpxor x2, x3, x3; \ + vpxor RNOT, x1, x1; \ + vpxor x4, x2, x2; \ + vpxor x2, x1, x1; + +#define S1_1(x0, x1, x2, x3, x4) \ + vpxor x0, x1, tp; \ + vpxor x3, x0, x0; \ + vpxor RNOT, x3, x3; \ + vpand tp, x1, x4; \ + vpor tp, x0, x0; \ + vpxor x2, x3, x3; \ + vpxor x3, x0, x0; \ + vpxor x3, tp, x1; +#define S1_2(x0, x1, x2, x3, x4) \ + vpxor x4, x3, x3; \ + vpor x4, x1, x1; \ + vpxor x2, x4, x4; \ + vpand x0, x2, x2; \ + vpxor x1, x2, x2; \ + vpor x0, x1, x1; \ + vpxor RNOT, x0, x0; \ + vpxor x2, x0, x0; \ + vpxor x1, x4, x4; + +#define S2_1(x0, x1, x2, x3, x4) \ + vpxor RNOT, x3, x3; \ + vpxor x0, x1, x1; \ + vpand x2, x0, tp; \ + vpxor x3, tp, tp; \ + vpor x0, x3, x3; \ + vpxor x1, x2, x2; \ + vpxor x1, x3, x3; \ + vpand tp, x1, x1; +#define S2_2(x0, x1, x2, x3, x4) \ + vpxor x2, tp, tp; \ + vpand x3, x2, x2; \ + vpor x1, x3, x3; \ + vpxor RNOT, tp, tp; \ + vpxor 
tp, x3, x3; \ + vpxor tp, x0, x4; \ + vpxor x2, tp, x0; \ + vpor x2, x1, x1; + +#define S3_1(x0, x1, x2, x3, x4) \ + vpxor x3, x1, tp; \ + vpor x0, x3, x3; \ + vpand x0, x1, x4; \ + vpxor x2, x0, x0; \ + vpxor tp, x2, x2; \ + vpand x3, tp, x1; \ + vpxor x3, x2, x2; \ + vpor x4, x0, x0; \ + vpxor x3, x4, x4; +#define S3_2(x0, x1, x2, x3, x4) \ + vpxor x0, x1, x1; \ + vpand x3, x0, x0; \ + vpand x4, x3, x3; \ + vpxor x2, x3, x3; \ + vpor x1, x4, x4; \ + vpand x1, x2, x2; \ + vpxor x3, x4, x4; \ + vpxor x3, x0, x0; \ + vpxor x2, x3, x3; + +#define S4_1(x0, x1, x2, x3, x4) \ + vpand x0, x3, tp; \ + vpxor x3, x0, x0; \ + vpxor x2, tp, tp; \ + vpor x3, x2, x2; \ + vpxor x1, x0, x0; \ + vpxor tp, x3, x4; \ + vpor x0, x2, x2; \ + vpxor x1, x2, x2; +#define S4_2(x0, x1, x2, x3, x4) \ + vpand x0, x1, x1; \ + vpxor x4, x1, x1; \ + vpand x2, x4, x4; \ + vpxor tp, x2, x2; \ + vpxor x0, x4, x4; \ + vpor x1, tp, x3; \ + vpxor RNOT, x1, x1; \ + vpxor x0, x3, x3; + +#define S5_1(x0, x1, x2, x3, x4) \ + vpor x0, x1, tp; \ + vpxor tp, x2, x2; \ + vpxor RNOT, x3, x3; \ + vpxor x0, x1, x4; \ + vpxor x2, x0, x0; \ + vpand x4, tp, x1; \ + vpor x3, x4, x4; \ + vpxor x0, x4, x4; +#define S5_2(x0, x1, x2, x3, x4) \ + vpand x3, x0, x0; \ + vpxor x3, x1, x1; \ + vpxor x2, x3, x3; \ + vpxor x1, x0, x0; \ + vpand x4, x2, x2; \ + vpxor x2, x1, x1; \ + vpand x0, x2, x2; \ + vpxor x2, x3, x3; + +#define S6_1(x0, x1, x2, x3, x4) \ + vpxor x0, x3, x3; \ + vpxor x2, x1, tp; \ + vpxor x0, x2, x2; \ + vpand x3, x0, x0; \ + vpor x3, tp, tp; \ + vpxor RNOT, x1, x4; \ + vpxor tp, x0, x0; \ + vpxor x2, tp, x1; +#define S6_2(x0, x1, x2, x3, x4) \ + vpxor x4, x3, x3; \ + vpxor x0, x4, x4; \ + vpand x0, x2, x2; \ + vpxor x1, x4, x4; \ + vpxor x3, x2, x2; \ + vpand x1, x3, x3; \ + vpxor x0, x3, x3; \ + vpxor x2, x1, x1; + +#define S7_1(x0, x1, x2, x3, x4) \ + vpxor RNOT, x1, tp; \ + vpxor RNOT, x0, x0; \ + vpand x2, tp, x1; \ + vpxor x3, x1, x1; \ + vpor tp, x3, x3; \ + vpxor x2, tp, x4; \ + vpxor x3, x2, x2; 
\ + vpxor x0, x3, x3; \ + vpor x1, x0, x0; +#define S7_2(x0, x1, x2, x3, x4) \ + vpand x0, x2, x2; \ + vpxor x4, x0, x0; \ + vpxor x3, x4, x4; \ + vpand x0, x3, x3; \ + vpxor x1, x4, x4; \ + vpxor x4, x2, x2; \ + vpxor x1, x3, x3; \ + vpor x0, x4, x4; \ + vpxor x1, x4, x4; + +#define SI0_1(x0, x1, x2, x3, x4) \ + vpxor x0, x1, x1; \ + vpor x1, x3, tp; \ + vpxor x1, x3, x4; \ + vpxor RNOT, x0, x0; \ + vpxor tp, x2, x2; \ + vpxor x0, tp, x3; \ + vpand x1, x0, x0; \ + vpxor x2, x0, x0; +#define SI0_2(x0, x1, x2, x3, x4) \ + vpand x3, x2, x2; \ + vpxor x4, x3, x3; \ + vpxor x3, x2, x2; \ + vpxor x3, x1, x1; \ + vpand x0, x3, x3; \ + vpxor x0, x1, x1; \ + vpxor x2, x0, x0; \ + vpxor x3, x4, x4; + +#define SI1_1(x0, x1, x2, x3, x4) \ + vpxor x3, x1, x1; \ + vpxor x2, x0, tp; \ + vpxor RNOT, x2, x2; \ + vpor x1, x0, x4; \ + vpxor x3, x4, x4; \ + vpand x1, x3, x3; \ + vpxor x2, x1, x1; \ + vpand x4, x2, x2; +#define SI1_2(x0, x1, x2, x3, x4) \ + vpxor x1, x4, x4; \ + vpor x3, x1, x1; \ + vpxor tp, x3, x3; \ + vpxor tp, x2, x2; \ + vpor x4, tp, x0; \ + vpxor x4, x2, x2; \ + vpxor x0, x1, x1; \ + vpxor x1, x4, x4; + +#define SI2_1(x0, x1, x2, x3, x4) \ + vpxor x1, x2, x2; \ + vpxor RNOT, x3, tp; \ + vpor x2, tp, tp; \ + vpxor x3, x2, x2; \ + vpxor x0, x3, x4; \ + vpxor x1, tp, x3; \ + vpor x2, x1, x1; \ + vpxor x0, x2, x2; +#define SI2_2(x0, x1, x2, x3, x4) \ + vpxor x4, x1, x1; \ + vpor x3, x4, x4; \ + vpxor x3, x2, x2; \ + vpxor x2, x4, x4; \ + vpand x1, x2, x2; \ + vpxor x3, x2, x2; \ + vpxor x4, x3, x3; \ + vpxor x0, x4, x4; + +#define SI3_1(x0, x1, x2, x3, x4) \ + vpxor x1, x2, x2; \ + vpand x2, x1, tp; \ + vpxor x0, tp, tp; \ + vpor x1, x0, x0; \ + vpxor x3, x1, x4; \ + vpxor x3, x0, x0; \ + vpor tp, x3, x3; \ + vpxor x2, tp, x1; +#define SI3_2(x0, x1, x2, x3, x4) \ + vpxor x3, x1, x1; \ + vpxor x2, x0, x0; \ + vpxor x3, x2, x2; \ + vpand x1, x3, x3; \ + vpxor x0, x1, x1; \ + vpand x2, x0, x0; \ + vpxor x3, x4, x4; \ + vpxor x0, x3, x3; \ + vpxor x1, x0, x0; + +#define 
SI4_1(x0, x1, x2, x3, x4) \ + vpxor x3, x2, x2; \ + vpand x1, x0, tp; \ + vpxor x2, tp, tp; \ + vpor x3, x2, x2; \ + vpxor RNOT, x0, x4; \ + vpxor tp, x1, x1; \ + vpxor x2, tp, x0; \ + vpand x4, x2, x2; +#define SI4_2(x0, x1, x2, x3, x4) \ + vpxor x0, x2, x2; \ + vpor x4, x0, x0; \ + vpxor x3, x0, x0; \ + vpand x2, x3, x3; \ + vpxor x3, x4, x4; \ + vpxor x1, x3, x3; \ + vpand x0, x1, x1; \ + vpxor x1, x4, x4; \ + vpxor x3, x0, x0; + +#define SI5_1(x0, x1, x2, x3, x4) \ + vpor x2, x1, tp; \ + vpxor x1, x2, x2; \ + vpxor x3, tp, tp; \ + vpand x1, x3, x3; \ + vpxor x3, x2, x2; \ + vpor x0, x3, x3; \ + vpxor RNOT, x0, x0; \ + vpxor x2, x3, x3; \ + vpor x0, x2, x2; +#define SI5_2(x0, x1, x2, x3, x4) \ + vpxor tp, x1, x4; \ + vpxor x4, x2, x2; \ + vpand x0, x4, x4; \ + vpxor tp, x0, x0; \ + vpxor x3, tp, x1; \ + vpand x2, x0, x0; \ + vpxor x3, x2, x2; \ + vpxor x2, x0, x0; \ + vpxor x4, x2, x2; \ + vpxor x3, x4, x4; + +#define SI6_1(x0, x1, x2, x3, x4) \ + vpxor x2, x0, x0; \ + vpand x3, x0, tp; \ + vpxor x3, x2, x2; \ + vpxor x2, tp, tp; \ + vpxor x1, x3, x3; \ + vpor x0, x2, x2; \ + vpxor x3, x2, x2; \ + vpand tp, x3, x3; +#define SI6_2(x0, x1, x2, x3, x4) \ + vpxor RNOT, tp, tp; \ + vpxor x1, x3, x3; \ + vpand x2, x1, x1; \ + vpxor tp, x0, x4; \ + vpxor x4, x3, x3; \ + vpxor x2, x4, x4; \ + vpxor x1, tp, x0; \ + vpxor x0, x2, x2; + +#define SI7_1(x0, x1, x2, x3, x4) \ + vpand x0, x3, tp; \ + vpxor x2, x0, x0; \ + vpor x3, x2, x2; \ + vpxor x1, x3, x4; \ + vpxor RNOT, x0, x0; \ + vpor tp, x1, x1; \ + vpxor x0, x4, x4; \ + vpand x2, x0, x0; \ + vpxor x1, x0, x0; +#define SI7_2(x0, x1, x2, x3, x4) \ + vpand x2, x1, x1; \ + vpxor x2, tp, x3; \ + vpxor x3, x4, x4; \ + vpand x3, x2, x2; \ + vpor x0, x3, x3; \ + vpxor x4, x1, x1; \ + vpxor x4, x3, x3; \ + vpand x0, x4, x4; \ + vpxor x2, x4, x4; + +#define get_key(i, j, t) \ + vbroadcastss (4*(i)+(j))*4(CTX), t; + +#define K2(x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + get_key(i, 1, RK1); \ + get_key(i, 2, RK2); \ + 
get_key(i, 3, RK3); \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; + +#define LK2(x0, x1, x2, x3, x4, i) \ + vpslld $13, x0 ## 1, x4 ## 1; \ + vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x2 ## 1, x4 ## 1; \ + vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $13, x0 ## 2, x4 ## 2; \ + vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x2 ## 2, x4 ## 2; \ + vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $1, x1 ## 1, x4 ## 1; \ + vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \ + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x0 ## 1, x4 ## 1; \ + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ + get_key(i, 1, RK1); \ + vpslld $1, x1 ## 2, x4 ## 2; \ + vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \ + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x0 ## 2, x4 ## 2; \ + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ + get_key(i, 3, RK3); \ + vpslld $7, x3 ## 1, x4 ## 1; \ + vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \ + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpslld $7, x1 ## 1, x4 ## 1; \ + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ + get_key(i, 0, RK0); \ + vpslld $7, x3 ## 2, x4 ## 2; \ + vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \ + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpslld $7, x1 ## 2, x4 ## 2; \ + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ + 
get_key(i, 2, RK2); \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpslld $5, x0 ## 1, x4 ## 1; \ + vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpslld $22, x2 ## 1, x4 ## 1; \ + vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; \ + vpslld $5, x0 ## 2, x4 ## 2; \ + vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpslld $22, x2 ## 2, x4 ## 2; \ + vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; + +#define KL2(x0, x1, x2, x3, x4, i) \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpsrld $5, x0 ## 1, x4 ## 1; \ + vpslld $(32 - 5), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpsrld $22, x2 ## 1, x4 ## 1; \ + vpslld $(32 - 22), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; \ + vpsrld $5, x0 ## 2, x4 ## 2; \ + vpslld $(32 - 5), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpsrld $22, x2 ## 2, x4 ## 2; \ + vpslld $(32 - 22), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ + vpslld $7, x1 ## 1, x4 ## 1; \ + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpsrld $1, x1 ## 1, x4 ## 1; \ + vpslld $(32 - 1), x1 ## 1, x1 ## 1; \ + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ + vpslld $7, x1 ## 2, x4 ## 2; \ + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpsrld $1, x1 ## 2, x4 ## 2; \ + vpslld $(32 - 1), x1 ## 
2, x1 ## 2; \ + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ + vpsrld $7, x3 ## 1, x4 ## 1; \ + vpslld $(32 - 7), x3 ## 1, x3 ## 1; \ + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x0 ## 1, x4 ## 1; \ + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpsrld $7, x3 ## 2, x4 ## 2; \ + vpslld $(32 - 7), x3 ## 2, x3 ## 2; \ + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x0 ## 2, x4 ## 2; \ + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpsrld $13, x0 ## 1, x4 ## 1; \ + vpslld $(32 - 13), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ + vpsrld $3, x2 ## 1, x4 ## 1; \ + vpslld $(32 - 3), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpsrld $13, x0 ## 2, x4 ## 2; \ + vpslld $(32 - 13), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ + vpsrld $3, x2 ## 2, x4 ## 2; \ + vpslld $(32 - 3), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; + +#define S(SBOX, x0, x1, x2, x3, x4) \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); + +#define SP(SBOX, x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 2, RK2); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 3, RK3); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + get_key(i, 1, RK1); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x3; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; \ + vpunpcklqdq x3, t2, x2; \ + 
vpunpckhqdq x3, t2, x3; + +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + vmovdqu (0*4*4)(in), x0; \ + vmovdqu (1*4*4)(in), x1; \ + vmovdqu (2*4*4)(in), x2; \ + vmovdqu (3*4*4)(in), x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vmovdqu x0, (0*4*4)(out); \ + vmovdqu x1, (1*4*4)(out); \ + vmovdqu x2, (2*4*4)(out); \ + vmovdqu x3, (3*4*4)(out); + +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpxor (0*4*4)(out), x0, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor (1*4*4)(out), x1, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor (2*4*4)(out), x2, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor (3*4*4)(out), x3, x3; \ + vmovdqu x3, (3*4*4)(out); + +.align 8 +.global __serpent_enc_blk_8way +.type __serpent_enc_blk_8way,@function; + +__serpent_enc_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + vpcmpeqd RNOT, RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 0); + S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); + S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); + S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); + S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); + S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); + S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6); + S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); + S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); + S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); + S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); + S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); + S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); + S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); + S(S5, RC, 
RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); + S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); + S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); + S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); + S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); + S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); + S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); + S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); + S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); + S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); + S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); + S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); + S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); + S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); + S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); + S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); + S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); + S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); + S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); + + leaq (4*4*4)(%rsi), %rax; + + testb %cl, %cl; + jnz __enc_xor8; + + write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +__enc_xor8: + xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +.align 8 +.global serpent_dec_blk_8way +.type serpent_dec_blk_8way,@function; + +serpent_dec_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vpcmpeqd RNOT, RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 32); + SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); + SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); + SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); + SP(SI4, RC, 
RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); + SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); + SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); + SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); + SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); + SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); + SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); + SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); + SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); + SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); + SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); + SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); + SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); + SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); + SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); + SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); + SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12); + SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); + SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); + SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); + SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); + SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); + SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); + SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); + SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); + SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); + SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); + SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); + S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); + + leaq (4*4*4)(%rsi), %rax; + write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); + write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); + + ret; diff --git 
a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c new file mode 100644 index 00000000000..0dc7a26535e --- /dev/null +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -0,0 +1,949 @@ +/* + * Glue Code for AVX assembler versions of Serpent Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * Glue code based on serpent_sse2_glue.c by: + * Copyright (C) 2011 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct async_serpent_ctx { + struct cryptd_ablkcipher *cryptd_tfm; +}; + +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + if (fpu_enabled) + return true; + + /* AVX is only used when chunk to be processed is large enough, so + * do not enable FPU until it is necessary. 
+ */ + if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS) + return false; + + kernel_fpu_begin(); + return true; +} + +static inline void serpent_fpu_end(bool fpu_enabled) +{ + if (fpu_enabled) + kernel_fpu_end(); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + bool enc) +{ + bool fpu_enabled = false; + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + if (enc) + serpent_enc_blk_xway(ctx, wdst, wsrc); + else + serpent_dec_blk_xway(ctx, wdst, wsrc); + + wsrc += bsize * SERPENT_PARALLEL_BLOCKS; + wdst += bsize * SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (enc) + __serpent_encrypt(ctx, wdst, wsrc); + else + __serpent_decrypt(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, true); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, false); +} 
+ +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + u128 last_iv; + int i; + + /* Start of the last block. 
*/ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1); + src -= SERPENT_PARALLEL_BLOCKS - 1; + dst -= SERPENT_PARALLEL_BLOCKS - 1; + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) + ivs[i] = src[i]; + + serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) + u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes)) { + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + return err; +} + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +static void 
ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *ctrblk = walk->iv; + u8 keystream[SERPENT_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + __serpent_encrypt(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, SERPENT_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + be128 ctrblocks[SERPENT_PARALLEL_BLOCKS]; + int i; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + /* create ctrblks for parallel encrypt */ + for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblocks[i], &ctrblk); + u128_inc(&ctrblk); + } + + serpent_enc_blk_xway_xor(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += SERPENT_PARALLEL_BLOCKS; + dst += SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + + __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + u128_xor(dst, dst, (u128 *)ctrblocks); + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + 
bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) { + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +struct crypt_priv { + struct serpent_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_decrypt(ctx->ctx, srcdst, srcdst); +} + +struct serpent_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct serpent_ctx serpent_ctx; +}; + +static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __serpent_setkey(&ctx->serpent_ctx, key, keylen - + SERPENT_BLOCK_SIZE); + if (err) + return err; + + return 
lrw_init_table(&ctx->lrw_table, key + keylen - + SERPENT_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +struct serpent_xts_ctx { + struct serpent_ctx tweak_ctx; + struct serpent_ctx crypt_ctx; +}; + +static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + 
return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + 
crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; +} + +static int __ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->encrypt( + &desc, req->dst, req->src, req->nbytes); +} + +static int ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_encrypt(cryptd_req); + } else { + return __ablk_encrypt(req); + } +} + +static int ablk_decrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_decrypt(cryptd_req); + } else { + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->decrypt( + &desc, req->dst, req->src, req->nbytes); + } +} + +static void ablk_exit(struct crypto_tfm *tfm) +{ + struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); + + 
cryptd_free_ablkcipher(ctx->cryptd_tfm); +} + +static int ablk_init(struct crypto_tfm *tfm) +{ + struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); + struct cryptd_ablkcipher *cryptd_tfm; + char drv_name[CRYPTO_MAX_ALG_NAME]; + + snprintf(drv_name, sizeof(drv_name), "__driver-%s", + crypto_tfm_alg_driver_name(tfm)); + + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); + + return 0; +} + +static struct crypto_alg serpent_algs[10] = { { + .cra_name = "__ecb-serpent-avx", + .cra_driver_name = "__driver-ecb-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-serpent-avx", + .cra_driver_name = "__driver-cbc-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-serpent-avx", + .cra_driver_name = "__driver-ctr-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = 
sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-serpent-avx", + .cra_driver_name = "__driver-lrw-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = lrw_serpent_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-serpent-avx", + .cra_driver_name = "__driver-xts-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(serpent)", + .cra_driver_name = "ecb-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + 
.cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(serpent)", + .cra_driver_name = "cbc-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(serpent)", + .cra_driver_name = "ctr-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(serpent)", + .cra_driver_name = "lrw-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + 
.cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(serpent)", + .cra_driver_name = "xts-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init serpent_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) { + printk(KERN_INFO "AVX instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + printk(KERN_INFO "AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); +} + +static void __exit serpent_exit(void) +{ + crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); +} + +module_init(serpent_init); +module_exit(serpent_exit); + +MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("serpent"); diff --git a/crypto/Kconfig b/crypto/Kconfig index e00a4e49e01..2c1c2dfcc02 100644 --- 
a/crypto/Kconfig +++ b/crypto/Kconfig @@ -821,6 +821,26 @@ config CRYPTO_SERPENT_SSE2_586 See also: +config CRYPTO_SERPENT_AVX_X86_64 + tristate "Serpent cipher algorithm (x86_64/AVX)" + depends on X86 && 64BIT + select CRYPTO_ALGAPI + select CRYPTO_CRYPTD + select CRYPTO_SERPENT + select CRYPTO_LRW + select CRYPTO_XTS + help + Serpent cipher algorithm, by Anderson, Biham & Knudsen. + + Keys are allowed to be from 0 to 256 bits in length, in steps + of 8 bits. + + This module provides the Serpent cipher algorithm that processes + eight blocks parallel using the AVX instruction set. + + See also: + + config CRYPTO_TEA tristate "TEA, XTEA and XETA cipher algorithms" select CRYPTO_ALGAPI diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 73b3ec6fe1a..36748a5996e 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1534,6 +1534,21 @@ static int alg_test_null(const struct alg_test_desc *desc, /* Please keep this list sorted by algorithm name. */ static const struct alg_test_desc alg_test_descs[] = { { + .alg = "__cbc-serpent-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } + }, { .alg = "__cbc-serpent-sse2", .test = alg_test_null, .suite = { @@ -1578,6 +1593,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__driver-cbc-serpent-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__driver-cbc-serpent-sse2", .test = alg_test_null, @@ -1623,6 +1653,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__driver-ecb-serpent-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__driver-ecb-serpent-sse2", .test = alg_test_null, @@ -1835,6 +1880,21 @@ static const struct 
alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "cryptd(__driver-ecb-serpent-avx)", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "cryptd(__driver-ecb-serpent-sse2)", .test = alg_test_null, -- cgit v1.2.3-70-g09d2 From 83452c6a43d06dfbc7f78b0eafe6664c95a3895c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 3 Jun 2012 20:48:18 +0200 Subject: x86/PCI: move fixup hooks from __init to __devinit The fixups are executed once the pci-device is found which is during boot process so __init seems fine as long as the platform does not support hotplug. However it is possible to remove the PCI bus at run time and have it rediscovered again via "echo 1 > /sys/bus/pci/rescan" and this will call the fixups again. Cc: x86@kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Bjorn Helgaas --- arch/x86/kernel/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 03920a15a63..1b27de56356 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) /* Set correct numa_node information for AMD NB functions */ -static void __init quirk_amd_nb_node(struct pci_dev *dev) +static void __devinit quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; -- cgit v1.2.3-70-g09d2 From 2f74759056797054122cdc70844137f70bb3f626 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 7 Jun 2012 22:20:18 +0900 Subject: x86/alternatives: Use atomic_xchg() instead atomic_dec_and_test() for stop_machine_text_poke() stop_machine_text_poke() uses atomic_dec_and_test() to select one of the CPUs executing that function to actually modify the code. 
Since the variable is initialized to 1, subsequent CPUs will make the variable go negative. Since going negative is uncommon/unexpected in typical dec_and_test usage change this user to atomic_xchg(). This was found using a patch that warns on dec_and_test going negative. Signed-off-by: OGAWA Hirofumi Acked-by: Steven Rostedt [ Rewrote changelog ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/87zk8fgsx9.fsf@devron.myhome.or.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f075..53231a045d3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -664,7 +664,7 @@ static int __kprobes stop_machine_text_poke(void *data) struct text_poke_param *p; int i; - if (atomic_dec_and_test(&stop_machine_first)) { + if (atomic_xchg(&stop_machine_first, 0)) { for (i = 0; i < tpp->nparams; i++) { p = &tpp->params[i]; text_poke(p->addr, p->opcode, p->len); -- cgit v1.2.3-70-g09d2 From b918c62e086b2130a7bae44110ca516ef10bfe5a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 May 2012 18:51:11 -0700 Subject: PCI: replace struct pci_bus secondary/subordinate with busn_res Replace the struct pci_bus secondary/subordinate members with the struct resource busn_res. Later we'll build a resource tree of these bus numbers. 
[bhelgaas: changelog] Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/alpha/kernel/pci.c | 2 +- arch/arm/kernel/bios32.c | 2 +- arch/ia64/pci/pci.c | 2 +- arch/microblaze/pci/pci-common.c | 4 +-- arch/mips/pci/pci.c | 2 +- arch/powerpc/kernel/pci-common.c | 6 ++-- arch/powerpc/kernel/pci_64.c | 2 +- arch/powerpc/kernel/pci_of_scan.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 8 ++--- arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- arch/sh/drivers/pci/pci.c | 2 +- arch/sparc/kernel/pci.c | 6 ++-- arch/tile/kernel/pci.c | 4 +-- arch/x86/pci/acpi.c | 2 +- arch/xtensa/kernel/pci.c | 2 +- drivers/iommu/intel-iommu.c | 2 +- drivers/net/ethernet/broadcom/tg3.c | 4 +-- drivers/parisc/dino.c | 10 +++--- drivers/parisc/iosapic.c | 2 +- drivers/parisc/lba_pci.c | 22 ++++++------ drivers/pci/hotplug/acpiphp_glue.c | 8 ++--- drivers/pci/hotplug/cpci_hotplug_pci.c | 6 ++-- drivers/pci/hotplug/pciehp_pci.c | 4 +-- drivers/pci/hotplug/shpchp_pci.c | 6 ++-- drivers/pci/hotplug/shpchp_sysfs.c | 6 ++-- drivers/pci/iov.c | 4 +-- drivers/pci/pci.c | 2 +- drivers/pci/probe.c | 58 +++++++++++++++--------------- drivers/pci/setup-bus.c | 24 ++++++------- drivers/pcmcia/cardbus.c | 2 +- drivers/pcmcia/yenta_socket.c | 26 +++++++------- 31 files changed, 117 insertions(+), 117 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 1a629636cc1..53229a49631 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -359,7 +359,7 @@ common_init_pci(void) hose, &resources); hose->bus = bus; hose->need_domain_info = need_domain_info; - next_busno = bus->subordinate + 1; + next_busno = bus->busn_res.end + 1; /* Don't allow 8-bit bus number overflow inside the hose - reserve some space for bridges. 
*/ if (next_busno > 224) { diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index 25552508c3f..f07710849b5 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -461,7 +461,7 @@ static void __init pcibios_init_hw(struct hw_pci *hw, struct list_head *head) if (!sys->bus) panic("PCI: unable to scan bus!"); - busnr = sys->bus->subordinate + 1; + busnr = sys->bus->busn_res.end + 1; list_add(&sys->node, head); } else { diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 524df4295c9..3ca9bed7dc5 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -384,7 +384,7 @@ pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; } - pbus->subordinate = pci_scan_child_bus(pbus); + pbus->busn_res.end = pci_scan_child_bus(pbus); return pbus; out3: diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index ed22bfc5db1..9b32483cd0c 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -1506,10 +1506,10 @@ static void __devinit pcibios_scan_phb(struct pci_controller *hose) pci_free_resource_list(&resources); return; } - bus->secondary = hose->first_busno; + bus->busn_res.start = hose->first_busno; hose->bus = bus; - hose->last_busno = bus->subordinate; + hose->last_busno = bus->busn_res.end; } static int __init pcibios_init(void) diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c index 271e8c4a54c..0a9bf778edb 100644 --- a/arch/mips/pci/pci.c +++ b/arch/mips/pci/pci.c @@ -102,7 +102,7 @@ static void __devinit pcibios_scanbus(struct pci_controller *hose) need_domain_info = need_domain_info || hose->index; hose->need_domain_info = need_domain_info; if (bus) { - next_busno = bus->subordinate + 1; + next_busno = bus->busn_res.end + 1; /* Don't allow 8-bit bus number overflow inside the hose - reserve some space for bridges. 
*/ if (next_busno > 224) { diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 8e78e93c818..3532b535698 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1646,7 +1646,7 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose) pci_free_resource_list(&resources); return; } - bus->secondary = hose->first_busno; + bus->busn_res.start = hose->first_busno; hose->bus = bus; /* Get probe mode and perform scan */ @@ -1655,12 +1655,12 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose) mode = ppc_md.pci_probe_mode(bus); pr_debug(" probe mode: %d\n", mode); if (mode == PCI_PROBE_DEVTREE) { - bus->subordinate = hose->last_busno; + bus->busn_res.end = hose->last_busno; of_scan_bus(node, bus); } if (mode == PCI_PROBE_NORMAL) - hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); + hose->last_busno = bus->busn_res.end = pci_scan_child_bus(bus); /* Platform gets a chance to do some global fixups before * we proceed to resource allocation diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 94a54f61d34..4ff190ff24a 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -236,7 +236,7 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus, for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) { bus = pci_bus_b(ln); - if (in_bus >= bus->number && in_bus <= bus->subordinate) + if (in_bus >= bus->number && in_bus <= bus->busn_res.end) break; bus = NULL; } diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 89dde171a6f..a36281aa98f 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -240,7 +240,7 @@ void __devinit of_scan_pci_bridge(struct pci_dev *dev) } bus->primary = dev->bus->number; - bus->subordinate = busrange[1]; + bus->busn_res.end = busrange[1]; bus->bridge_ctl = 0; /* parse ranges property */ diff --git 
a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fbdd74dac3a..9cda6a1ad0c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -589,7 +589,7 @@ static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb, dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; parent = pe->pbus->self; - count = pe->pbus->subordinate - pe->pbus->secondary + 1; + count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; switch(count) { case 1: bcomp = OpalPciBusAll; break; case 2: bcomp = OpalPciBus7Bits; break; @@ -816,11 +816,11 @@ static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, pe->pdev = NULL; pe->tce32_seg = -1; pe->mve_number = -1; - pe->rid = bus->secondary << 8; + pe->rid = bus->busn_res.start << 8; pe->dma_weight = 0; - pe_info(pe, "Secondary busses %d..%d associated with PE\n", - bus->secondary, bus->subordinate); + pe_info(pe, "Secondary busses %pR associated with PE\n", + &bus->busn_res); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? 
*/ diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 8b7bafa489c..3ccebc83dc0 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -121,7 +121,7 @@ void pcibios_add_pci_devices(struct pci_bus * bus) if (!num) return; pcibios_setup_bus_devices(bus); - max = bus->secondary; + max = bus->busn_res.start; for (pass=0; pass < 2; pass++) list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index 9d10a3cb879..43068dcb451 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -59,7 +59,7 @@ static void __devinit pcibios_scanbus(struct pci_channel *hose) need_domain_info = need_domain_info || hose->index; hose->need_domain_info = need_domain_info; if (bus) { - next_busno = bus->subordinate + 1; + next_busno = bus->busn_res.end + 1; /* Don't allow 8-bit bus number overflow inside the hose - reserve some space for bridges. 
*/ if (next_busno > 224) { diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index fdaf2181167..c85bfd788f7 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -535,7 +535,7 @@ static void __devinit of_scan_pci_bridge(struct pci_pbm_info *pbm, } bus->primary = dev->bus->number; - bus->subordinate = busrange[1]; + bus->busn_res.end = busrange[1]; bus->bridge_ctl = 0; /* parse ranges property, or cook one up by hand for Simba */ @@ -693,8 +693,8 @@ struct pci_bus * __devinit pci_scan_one_pbm(struct pci_pbm_info *pbm, pci_free_resource_list(&resources); return NULL; } - bus->secondary = pbm->pci_first_busno; - bus->subordinate = pbm->pci_last_busno; + bus->busn_res.start = pbm->pci_first_busno; + bus->busn_res.end = pbm->pci_last_busno; pci_of_scan_bus(pbm, node, bus); pci_bus_add_devices(bus); diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c index b56d12bf590..54cc8d77c90 100644 --- a/arch/tile/kernel/pci.c +++ b/arch/tile/kernel/pci.c @@ -329,7 +329,7 @@ int __init pcibios_init(void) */ bus = pci_scan_bus(0, controller->ops, controller); controller->root_bus = bus; - controller->last_busno = bus->subordinate; + controller->last_busno = bus->busn_res.end; } } @@ -366,7 +366,7 @@ int __init pcibios_init(void) */ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && (PCI_SLOT(dev->devfn) == 0)) { - next_bus = dev->subordinate; + next_bus = dev->subordinate; controllers[i].mem_resources[0] = *next_bus->resource[0]; controllers[i].mem_resources[1] = diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index fc09c2754e0..350fe63c8a4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -440,7 +440,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); if (bus) { - bus->subordinate = pci_scan_child_bus(bus); + bus->busn_res.end = pci_scan_child_bus(bus); pci_set_host_bridge_release( to_pci_host_bridge(bus->bridge), 
release_pci_root_info, info); diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c index eb30e356f5b..9c57c1e6870 100644 --- a/arch/xtensa/kernel/pci.c +++ b/arch/xtensa/kernel/pci.c @@ -187,7 +187,7 @@ static int __init pcibios_init(void) bus = pci_scan_root_bus(NULL, pci_ctrl->first_busno, pci_ctrl->ops, pci_ctrl, &resources); pci_ctrl->bus = bus; - pci_ctrl->last_busno = bus->subordinate; + pci_ctrl->last_busno = bus->busn_res.end; if (next_busno <= pci_ctrl->last_busno) next_busno = pci_ctrl->last_busno+1; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b12af2ff8c5..2fb7d1598a6 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -661,7 +661,7 @@ static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn) if (drhd->devices[i] && drhd->devices[i]->subordinate && drhd->devices[i]->subordinate->number <= bus && - drhd->devices[i]->subordinate->subordinate >= bus) + drhd->devices[i]->subordinate->busn_res.end >= bus) return drhd->iommu; } diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index edeeb516807..09fa3c687a1 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -14168,7 +14168,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) if (bridge->subordinate && (bridge->subordinate->number <= tp->pdev->bus->number) && - (bridge->subordinate->subordinate >= + (bridge->subordinate->busn_res.end >= tp->pdev->bus->number)) { tg3_flag_set(tp, 5701_DMA_BUG); pci_dev_put(bridge); @@ -14196,7 +14196,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) if (bridge && bridge->subordinate && (bridge->subordinate->number <= tp->pdev->bus->number) && - (bridge->subordinate->subordinate >= + (bridge->subordinate->busn_res.end >= tp->pdev->bus->number)) { tg3_flag_set(tp, 40BIT_DMA_BUG); pci_dev_put(bridge); diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 432d4bbcc62..70517b0f94e 100644 
--- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -174,7 +174,7 @@ static int dino_cfg_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { struct dino_device *d = DINO_DEV(parisc_walk_tree(bus->bridge)); - u32 local_bus = (bus->parent == NULL) ? 0 : bus->secondary; + u32 local_bus = (bus->parent == NULL) ? 0 : bus->busn_res.start; u32 v = DINO_CFG_TOK(local_bus, devfn, where & ~3); void __iomem *base_addr = d->hba.base_addr; unsigned long flags; @@ -209,7 +209,7 @@ static int dino_cfg_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { struct dino_device *d = DINO_DEV(parisc_walk_tree(bus->bridge)); - u32 local_bus = (bus->parent == NULL) ? 0 : bus->secondary; + u32 local_bus = (bus->parent == NULL) ? 0 : bus->busn_res.start; u32 v = DINO_CFG_TOK(local_bus, devfn, where & ~3); void __iomem *base_addr = d->hba.base_addr; unsigned long flags; @@ -554,7 +554,7 @@ dino_fixup_bus(struct pci_bus *bus) struct dino_device *dino_dev = DINO_DEV(parisc_walk_tree(bus->bridge)); DBG(KERN_WARNING "%s(0x%p) bus %d platform_data 0x%p\n", - __func__, bus, bus->secondary, + __func__, bus, bus->busn_res.start, bus->bridge->platform_data); /* Firmware doesn't set up card-mode dino, so we have to */ @@ -998,12 +998,12 @@ static int __init dino_probe(struct parisc_device *dev) return 0; } - bus->subordinate = pci_scan_child_bus(bus); + bus->busn_res.end = pci_scan_child_bus(bus); /* This code *depends* on scanning being single threaded * if it isn't, this global bus number count will fail */ - dino_current_bus = bus->subordinate + 1; + dino_current_bus = bus->busn_res.end + 1; pci_bus_assign_resources(bus); pci_bus_add_devices(bus); return 0; diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 1f9e9fefb8e..83380c8fcb6 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -532,7 +532,7 @@ iosapic_xlate_pin(struct iosapic_info *isi, struct pci_dev *pcidev) intr_slot = 
PCI_SLOT(pcidev->devfn); } DBG_IRT("iosapic_xlate_pin: bus %d slot %d pin %d\n", - pcidev->bus->secondary, intr_slot, intr_pin); + pcidev->bus->busn_res.start, intr_slot, intr_pin); return irt_find_irqline(isi, intr_slot, intr_pin); } diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c index 052fa230bc7..cd8f9ce8720 100644 --- a/drivers/parisc/lba_pci.c +++ b/drivers/parisc/lba_pci.c @@ -189,8 +189,8 @@ lba_dump_res(struct resource *r, int d) static int lba_device_present(u8 bus, u8 dfn, struct lba_device *d) { - u8 first_bus = d->hba.hba_bus->secondary; - u8 last_sub_bus = d->hba.hba_bus->subordinate; + u8 first_bus = d->hba.hba_bus->busn_res.start; + u8 last_sub_bus = d->hba.hba_bus->busn_res.end; if ((bus < first_bus) || (bus > last_sub_bus) || @@ -364,7 +364,7 @@ lba_rd_cfg(struct lba_device *d, u32 tok, u8 reg, u32 size) static int elroy_cfg_read(struct pci_bus *bus, unsigned int devfn, int pos, int size, u32 *data) { struct lba_device *d = LBA_DEV(parisc_walk_tree(bus->bridge)); - u32 local_bus = (bus->parent == NULL) ? 0 : bus->secondary; + u32 local_bus = (bus->parent == NULL) ? 0 : bus->busn_res.start; u32 tok = LBA_CFG_TOK(local_bus, devfn); void __iomem *data_reg = d->hba.base_addr + LBA_PCI_CFG_DATA; @@ -380,7 +380,7 @@ static int elroy_cfg_read(struct pci_bus *bus, unsigned int devfn, int pos, int return 0; } - if (LBA_SKIP_PROBE(d) && !lba_device_present(bus->secondary, devfn, d)) { + if (LBA_SKIP_PROBE(d) && !lba_device_present(bus->busn_res.start, devfn, d)) { DBG_CFG("%s(%x+%2x) -> -1 (b)\n", __func__, tok, pos); /* either don't want to look or know device isn't present. */ *data = ~0U; @@ -431,7 +431,7 @@ lba_wr_cfg(struct lba_device *d, u32 tok, u8 reg, u32 data, u32 size) static int elroy_cfg_write(struct pci_bus *bus, unsigned int devfn, int pos, int size, u32 data) { struct lba_device *d = LBA_DEV(parisc_walk_tree(bus->bridge)); - u32 local_bus = (bus->parent == NULL) ? 0 : bus->secondary; + u32 local_bus = (bus->parent == NULL) ? 
0 : bus->busn_res.start; u32 tok = LBA_CFG_TOK(local_bus,devfn); if ((pos > 255) || (devfn > 255)) @@ -444,7 +444,7 @@ static int elroy_cfg_write(struct pci_bus *bus, unsigned int devfn, int pos, int return 0; } - if (LBA_SKIP_PROBE(d) && (!lba_device_present(bus->secondary, devfn, d))) { + if (LBA_SKIP_PROBE(d) && (!lba_device_present(bus->busn_res.start, devfn, d))) { DBG_CFG("%s(%x+%2x) = 0x%x (b)\n", __func__, tok, pos,data); return 1; /* New Workaround */ } @@ -481,7 +481,7 @@ static struct pci_ops elroy_cfg_ops = { static int mercury_cfg_read(struct pci_bus *bus, unsigned int devfn, int pos, int size, u32 *data) { struct lba_device *d = LBA_DEV(parisc_walk_tree(bus->bridge)); - u32 local_bus = (bus->parent == NULL) ? 0 : bus->secondary; + u32 local_bus = (bus->parent == NULL) ? 0 : bus->busn_res.start; u32 tok = LBA_CFG_TOK(local_bus, devfn); void __iomem *data_reg = d->hba.base_addr + LBA_PCI_CFG_DATA; @@ -514,7 +514,7 @@ static int mercury_cfg_write(struct pci_bus *bus, unsigned int devfn, int pos, i { struct lba_device *d = LBA_DEV(parisc_walk_tree(bus->bridge)); void __iomem *data_reg = d->hba.base_addr + LBA_PCI_CFG_DATA; - u32 local_bus = (bus->parent == NULL) ? 0 : bus->secondary; + u32 local_bus = (bus->parent == NULL) ? 0 : bus->busn_res.start; u32 tok = LBA_CFG_TOK(local_bus,devfn); if ((pos > 255) || (devfn > 255)) @@ -636,7 +636,7 @@ lba_fixup_bus(struct pci_bus *bus) struct lba_device *ldev = LBA_DEV(parisc_walk_tree(bus->bridge)); DBG("lba_fixup_bus(0x%p) bus %d platform_data 0x%p\n", - bus, bus->secondary, bus->bridge->platform_data); + bus, (int)bus->busn_res.start, bus->bridge->platform_data); /* ** Properly Setup MMIO resources for this bus. 
@@ -1511,7 +1511,7 @@ lba_driver_probe(struct parisc_device *dev) return 0; } - lba_bus->subordinate = pci_scan_child_bus(lba_bus); + lba_bus->busn_res.end = pci_scan_child_bus(lba_bus); /* This is in lieu of calling pci_assign_unassigned_resources() */ if (is_pdc_pat()) { @@ -1541,7 +1541,7 @@ lba_driver_probe(struct parisc_device *dev) lba_dev->flags |= LBA_FLAG_SKIP_PROBE; } - lba_next_bus = lba_bus->subordinate + 1; + lba_next_bus = lba_bus->busn_res.end + 1; pci_bus_add_devices(lba_bus); /* Whew! Finally done! Tell services we got this one covered. */ diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 806c44fa645..62d0ae4dfca 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -100,11 +100,11 @@ static int post_dock_fixups(struct notifier_block *nb, unsigned long val, PCI_PRIMARY_BUS, &buses); - if (((buses >> 8) & 0xff) != bus->secondary) { + if (((buses >> 8) & 0xff) != bus->busn_res.start) { buses = (buses & 0xff000000) | ((unsigned int)(bus->primary) << 0) - | ((unsigned int)(bus->secondary) << 8) - | ((unsigned int)(bus->subordinate) << 16); + | ((unsigned int)(bus->busn_res.start) << 8) + | ((unsigned int)(bus->busn_res.end) << 16); pci_write_config_dword(bus->self, PCI_PRIMARY_BUS, buses); } return NOTIFY_OK; @@ -692,7 +692,7 @@ static unsigned char acpiphp_max_busnr(struct pci_bus *bus) * bus->subordinate value because it could have * padding in it. 
*/ - max = bus->secondary; + max = bus->busn_res.start; list_for_each(tmp, &bus->children) { n = pci_bus_max_busnr(pci_bus_b(tmp)); diff --git a/drivers/pci/hotplug/cpci_hotplug_pci.c b/drivers/pci/hotplug/cpci_hotplug_pci.c index ae853ccd0cd..42f3a61db87 100644 --- a/drivers/pci/hotplug/cpci_hotplug_pci.c +++ b/drivers/pci/hotplug/cpci_hotplug_pci.c @@ -292,8 +292,8 @@ int __ref cpci_configure_slot(struct slot *slot) (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)) { /* Find an unused bus number for the new bridge */ struct pci_bus *child; - unsigned char busnr, start = parent->secondary; - unsigned char end = parent->subordinate; + unsigned char busnr, start = parent->busn_res.start; + unsigned char end = parent->busn_res.end; for (busnr = start; busnr <= end; busnr++) { if (!pci_find_bus(pci_domain_nr(parent), @@ -312,7 +312,7 @@ int __ref cpci_configure_slot(struct slot *slot) pci_dev_put(dev); continue; } - child->subordinate = pci_do_scan_bus(child); + child->busn_res.end = pci_do_scan_bus(child); pci_bus_size_bridges(child); } pci_dev_put(dev); diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c index 47d9dc06b10..b898f06b588 100644 --- a/drivers/pci/hotplug/pciehp_pci.c +++ b/drivers/pci/hotplug/pciehp_pci.c @@ -37,8 +37,8 @@ static int __ref pciehp_add_bridge(struct pci_dev *dev) { struct pci_bus *parent = dev->bus; - int pass, busnr, start = parent->secondary; - int end = parent->subordinate; + int pass, busnr, start = parent->busn_res.start; + int end = parent->busn_res.end; for (busnr = start; busnr <= end; busnr++) { if (!pci_find_bus(pci_domain_nr(parent), busnr)) diff --git a/drivers/pci/hotplug/shpchp_pci.c b/drivers/pci/hotplug/shpchp_pci.c index df7e4bfadae..d021eb031b3 100644 --- a/drivers/pci/hotplug/shpchp_pci.c +++ b/drivers/pci/hotplug/shpchp_pci.c @@ -64,8 +64,8 @@ int __ref shpchp_configure_device(struct slot *p_slot) (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)) { /* Find an unused bus number for the new bridge */ 
struct pci_bus *child; - unsigned char busnr, start = parent->secondary; - unsigned char end = parent->subordinate; + unsigned char busnr, start = parent->busn_res.start; + unsigned char end = parent->busn_res.end; for (busnr = start; busnr <= end; busnr++) { if (!pci_find_bus(pci_domain_nr(parent), busnr)) @@ -84,7 +84,7 @@ int __ref shpchp_configure_device(struct slot *p_slot) pci_dev_put(dev); continue; } - child->subordinate = pci_do_scan_bus(child); + child->busn_res.end = pci_do_scan_bus(child); pci_bus_size_bridges(child); } pci_configure_slot(dev); diff --git a/drivers/pci/hotplug/shpchp_sysfs.c b/drivers/pci/hotplug/shpchp_sysfs.c index efa30da1ae8..eeb23ceae4a 100644 --- a/drivers/pci/hotplug/shpchp_sysfs.c +++ b/drivers/pci/hotplug/shpchp_sysfs.c @@ -73,13 +73,13 @@ static ssize_t show_ctrl (struct device *dev, struct device_attribute *attr, cha } } out += sprintf(out, "Free resources: bus numbers\n"); - for (busnr = bus->secondary; busnr <= bus->subordinate; busnr++) { + for (busnr = bus->busn_res.start; busnr <= bus->busn_res.end; busnr++) { if (!pci_find_bus(pci_domain_nr(bus), busnr)) break; } - if (busnr < bus->subordinate) + if (busnr < bus->busn_res.end) out += sprintf(out, "start = %8.8x, length = %8.8x\n", - busnr, (bus->subordinate - busnr)); + busnr, (int)(bus->busn_res.end - busnr)); return out - buf; } diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 6554e1a0f63..e873060fb35 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -47,7 +47,7 @@ static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr) if (!child) return NULL; - child->subordinate = busnr; + child->busn_res.end = busnr; child->dev.parent = bus->bridge; rc = pci_bus_add_child(child); if (rc) { @@ -327,7 +327,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) iov->offset = offset; iov->stride = stride; - if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->subordinate) { + if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) { 
dev_err(&dev->dev, "SR-IOV: bus number out of range\n"); return -ENOMEM; } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 447e83472c0..aeda6e9c245 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -110,7 +110,7 @@ unsigned char pci_bus_max_busnr(struct pci_bus* bus) struct list_head *tmp; unsigned char max, n; - max = bus->subordinate; + max = bus->busn_res.end; list_for_each(tmp, &bus->children) { n = pci_bus_max_busnr(pci_bus_b(tmp)); if(n > max) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 658ac977cb5..651b096134d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -381,8 +381,8 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) if (pci_is_root_bus(child)) /* It's a host bus, nothing to read */ return; - dev_info(&dev->dev, "PCI bridge to [bus %02x-%02x]%s\n", - child->secondary, child->subordinate, + dev_info(&dev->dev, "PCI bridge to %pR%s\n", + &child->busn_res, dev->transparent ? " (subtractive decode)" : ""); pci_bus_remove_resources(child); @@ -599,9 +599,9 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent, * Set up the primary, secondary and subordinate * bus numbers. 
*/ - child->number = child->secondary = busnr; - child->primary = parent->secondary; - child->subordinate = 0xff; + child->number = child->busn_res.start = busnr; + child->primary = parent->busn_res.start; + child->busn_res.end = 0xff; if (!bridge) return child; @@ -643,8 +643,8 @@ static void pci_fixup_parent_subordinate_busnr(struct pci_bus *child, int max) if (!pcibios_assign_all_busses()) return; - while (parent->parent && parent->subordinate < max) { - parent->subordinate = max; + while (parent->parent && parent->busn_res.end < max) { + parent->busn_res.end = max; pci_write_config_byte(parent->self, PCI_SUBORDINATE_BUS, max); parent = parent->parent; } @@ -718,15 +718,15 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, if (!child) goto out; child->primary = primary; - child->subordinate = subordinate; + child->busn_res.end = subordinate; child->bridge_ctl = bctl; } cmax = pci_scan_child_bus(child); if (cmax > max) max = cmax; - if (child->subordinate > max) - max = child->subordinate; + if (child->busn_res.end > max) + max = child->busn_res.end; } else { /* * We need to assign a number to this bus which we always @@ -759,8 +759,8 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, } buses = (buses & 0xff000000) | ((unsigned int)(child->primary) << 0) - | ((unsigned int)(child->secondary) << 8) - | ((unsigned int)(child->subordinate) << 16); + | ((unsigned int)(child->busn_res.start) << 8) + | ((unsigned int)(child->busn_res.end) << 16); /* * yenta.c forces a secondary latency timer of 176. 
@@ -805,8 +805,8 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, break; while (parent->parent) { if ((!pcibios_assign_all_busses()) && - (parent->subordinate > max) && - (parent->subordinate <= max+i)) { + (parent->busn_res.end > max) && + (parent->busn_res.end <= max+i)) { j = 1; } parent = parent->parent; @@ -827,7 +827,7 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, /* * Set the subordinate bus number to its real value. */ - child->subordinate = max; + child->busn_res.end = max; pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max); } @@ -837,19 +837,19 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, /* Has only triggered on CardBus, fixup is in yenta_socket */ while (bus->parent) { - if ((child->subordinate > bus->subordinate) || - (child->number > bus->subordinate) || + if ((child->busn_res.end > bus->busn_res.end) || + (child->number > bus->busn_res.end) || (child->number < bus->number) || - (child->subordinate < bus->number)) { - dev_info(&child->dev, "[bus %02x-%02x] %s " - "hidden behind%s bridge %s [bus %02x-%02x]\n", - child->number, child->subordinate, - (bus->number > child->subordinate && - bus->subordinate < child->number) ? + (child->busn_res.end < bus->number)) { + dev_info(&child->dev, "%pR %s " + "hidden behind%s bridge %s %pR\n", + &child->busn_res, + (bus->number > child->busn_res.end && + bus->busn_res.end < child->number) ? "wholly" : "partially", bus->self->transparent ?
" transparent" : "", dev_name(&bus->dev), - bus->number, bus->subordinate); + &bus->busn_res); } bus = bus->parent; } @@ -1548,7 +1548,7 @@ EXPORT_SYMBOL_GPL(pcie_bus_configure_settings); unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) { - unsigned int devfn, pass, max = bus->secondary; + unsigned int devfn, pass, max = bus->busn_res.start; struct pci_dev *dev; dev_dbg(&bus->dev, "scanning bus\n"); @@ -1642,7 +1642,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, /* Create legacy_io and legacy_mem files for this bus */ pci_create_legacy_files(b); - b->number = b->secondary = bus; + b->number = b->busn_res.start = bus; if (parent) dev_info(parent, "PCI host bridge to bus %s\n", dev_name(&b->dev)); @@ -1693,7 +1693,7 @@ struct pci_bus * __devinit pci_scan_root_bus(struct device *parent, int bus, if (!b) return NULL; - b->subordinate = pci_scan_child_bus(b); + b->busn_res.end = pci_scan_child_bus(b); pci_bus_add_devices(b); return b; } @@ -1710,7 +1710,7 @@ struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, pci_add_resource(&resources, &iomem_resource); b = pci_create_root_bus(parent, bus, ops, sysdata, &resources); if (b) - b->subordinate = pci_scan_child_bus(b); + b->busn_res.end = pci_scan_child_bus(b); else pci_free_resource_list(&resources); return b; @@ -1727,7 +1727,7 @@ struct pci_bus * __devinit pci_scan_bus(int bus, struct pci_ops *ops, pci_add_resource(&resources, &iomem_resource); b = pci_create_root_bus(NULL, bus, ops, sysdata, &resources); if (b) { - b->subordinate = pci_scan_child_bus(b); + b->busn_res.end = pci_scan_child_bus(b); pci_bus_add_devices(b); } else { pci_free_resource_list(&resources); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 8fa2d4be88d..192172c87b7 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -404,8 +404,8 @@ void pci_setup_cardbus(struct pci_bus *bus) struct resource *res; struct pci_bus_region region; - dev_info(&bridge->dev,
"CardBus bridge to [bus %02x-%02x]\n", - bus->secondary, bus->subordinate); + dev_info(&bridge->dev, "CardBus bridge to %pR\n", + &bus->busn_res); res = bus->resource[0]; pcibios_resource_to_bus(bridge, &region, res); @@ -553,8 +553,8 @@ static void __pci_setup_bridge(struct pci_bus *bus, unsigned long type) { struct pci_dev *bridge = bus->self; - dev_info(&bridge->dev, "PCI bridge to [bus %02x-%02x]\n", - bus->secondary, bus->subordinate); + dev_info(&bridge->dev, "PCI bridge to %pR\n", + &bus->busn_res); if (type & IORESOURCE_IO) pci_setup_bridge_io(bus); @@ -745,8 +745,8 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, if (!size0 && !size1) { if (b_res->start || b_res->end) dev_info(&bus->self->dev, "disabling bridge window " - "%pR to [bus %02x-%02x] (unused)\n", b_res, - bus->secondary, bus->subordinate); + "%pR to %pR (unused)\n", b_res, + &bus->busn_res); b_res->flags = 0; return; } @@ -757,8 +757,8 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, if (size1 > size0 && realloc_head) { add_to_list(realloc_head, bus->self, b_res, size1-size0, 4096); dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window " - "%pR to [bus %02x-%02x] add_size %lx\n", b_res, - bus->secondary, bus->subordinate, size1-size0); + "%pR to %pR add_size %lx\n", b_res, + &bus->busn_res, size1-size0); } } @@ -863,8 +863,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, if (!size0 && !size1) { if (b_res->start || b_res->end) dev_info(&bus->self->dev, "disabling bridge window " - "%pR to [bus %02x-%02x] (unused)\n", b_res, - bus->secondary, bus->subordinate); + "%pR to %pR (unused)\n", b_res, + &bus->busn_res); b_res->flags = 0; return 1; } @@ -874,8 +874,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, if (size1 > size0 && realloc_head) { add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align); dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window " - "%pR to [bus %02x-%02x] add_size
%llx\n", b_res, - bus->secondary, bus->subordinate, (unsigned long long)size1-size0); + "%pR to %pR add_size %llx\n", b_res, + &bus->busn_res, (unsigned long long)size1-size0); } return 1; } diff --git a/drivers/pcmcia/cardbus.c b/drivers/pcmcia/cardbus.c index 6e75153c5b4..24caeaf5052 100644 --- a/drivers/pcmcia/cardbus.c +++ b/drivers/pcmcia/cardbus.c @@ -73,7 +73,7 @@ int __ref cb_alloc(struct pcmcia_socket *s) s->functions = pci_scan_slot(bus, PCI_DEVFN(0, 0)); pci_fixup_cardbus(bus); - max = bus->secondary; + max = bus->busn_res.start; for (pass = 0; pass < 2; pass++) list_for_each_entry(dev, &bus->devices, bus_list) if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index d07f9ac8c41..667678db115 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -1048,8 +1048,8 @@ static void yenta_config_init(struct yenta_socket *socket) config_writeb(socket, PCI_LATENCY_TIMER, 168); config_writel(socket, PCI_PRIMARY_BUS, (176 << 24) | /* sec. 
latency timer */ - (dev->subordinate->subordinate << 16) | /* subordinate bus */ - (dev->subordinate->secondary << 8) | /* secondary bus */ + ((unsigned int)dev->subordinate->busn_res.end << 16) | /* subordinate bus */ + ((unsigned int)dev->subordinate->busn_res.start << 8) | /* secondary bus */ dev->subordinate->primary); /* primary bus */ /* @@ -1086,14 +1086,14 @@ static void yenta_fixup_parent_bridge(struct pci_bus *cardbus_bridge) struct pci_bus *bridge_to_fix = cardbus_bridge->parent; /* Check bus numbers are already set up correctly: */ - if (bridge_to_fix->subordinate >= cardbus_bridge->subordinate) + if (bridge_to_fix->busn_res.end >= cardbus_bridge->busn_res.end) return; /* The subordinate number is ok, nothing to do */ if (!bridge_to_fix->parent) return; /* Root bridges are ok */ /* stay within the limits of the bus range of the parent: */ - upper_limit = bridge_to_fix->parent->subordinate; + upper_limit = bridge_to_fix->parent->busn_res.end; /* check the bus ranges of all silbling bridges to prevent overlap */ list_for_each(tmp, &bridge_to_fix->parent->children) { @@ -1104,36 +1104,36 @@ static void yenta_fixup_parent_bridge(struct pci_bus *cardbus_bridge) * current upper limit, set the new upper limit to * the bus number below the silbling's range: */ - if (silbling->secondary > bridge_to_fix->subordinate - && silbling->secondary <= upper_limit) - upper_limit = silbling->secondary - 1; + if (silbling->busn_res.start > bridge_to_fix->busn_res.end + && silbling->busn_res.start <= upper_limit) + upper_limit = silbling->busn_res.start - 1; } /* Show that the wanted subordinate number is not possible: */ - if (cardbus_bridge->subordinate > upper_limit) + if (cardbus_bridge->busn_res.end > upper_limit) dev_printk(KERN_WARNING, &cardbus_bridge->dev, "Upper limit for fixing this " "bridge's parent bridge: #%02x\n", upper_limit); /* If we have room to increase the bridge's subordinate number, */ - if (bridge_to_fix->subordinate < upper_limit) { + if 
(bridge_to_fix->busn_res.end < upper_limit) { /* use the highest number of the hidden bus, within limits */ unsigned char subordinate_to_assign = - min(cardbus_bridge->subordinate, upper_limit); + min_t(int, cardbus_bridge->busn_res.end, upper_limit); dev_printk(KERN_INFO, &bridge_to_fix->dev, "Raising subordinate bus# of parent " "bus (#%02x) from #%02x to #%02x\n", bridge_to_fix->number, - bridge_to_fix->subordinate, subordinate_to_assign); + (int)bridge_to_fix->busn_res.end, subordinate_to_assign); /* Save the new subordinate in the bus struct of the bridge */ - bridge_to_fix->subordinate = subordinate_to_assign; + bridge_to_fix->busn_res.end = subordinate_to_assign; /* and update the PCI config space with the new subordinate */ pci_write_config_byte(bridge_to_fix->self, - PCI_SUBORDINATE_BUS, bridge_to_fix->subordinate); + PCI_SUBORDINATE_BUS, bridge_to_fix->busn_res.end); } } -- cgit v1.2.3-70-g09d2 From 5c1d81d160cc46e36fdd06702885c98c2643b4c5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 May 2012 18:51:12 -0700 Subject: x86/PCI: use _CRS bus number aperture for host bridges from ACPI Add the host bridge bus number aperture from _CRS to the resource list. Like the MMIO and I/O port apertures, this will be used when assigning resources to hot-added devices or in the case of conflicts. Note that we always use the _CRS bus number aperture, even if we're ignoring _CRS otherwise. 
[bhelgaas: changelog] Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 350fe63c8a4..2bb885afe10 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -12,7 +12,6 @@ struct pci_root_info { char name[16]; unsigned int res_num; struct resource *res; - int busnum; struct pci_sysdata sd; }; @@ -347,7 +346,9 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, { size_t size; + sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); info->bridge = device; + info->res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, info); @@ -360,8 +361,6 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, if (!info->res) return; - sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); - acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, info); } @@ -426,6 +425,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) } else { probe_pci_root_info(info, device, busnum, domain); + /* insert busn res at first */ + pci_add_resource(&resources, &root->secondary); /* * _CRS with no apertures is normal, so only fall back to * defaults or native bridge info if we're ignoring _CRS. 
@@ -440,7 +441,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); if (bus) { - bus->busn_res.end = pci_scan_child_bus(bus); + pci_scan_child_bus(bus); pci_set_host_bridge_release( to_pci_host_bridge(bus->bridge), release_pci_root_info, info); -- cgit v1.2.3-70-g09d2 From a10bb128b64fe03198c4930f4483dda55c5c84eb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 May 2012 18:51:12 -0700 Subject: x86/PCI: put busn resource in pci_root_info for native host bridge drivers Add the host bridge bus number aperture to the resource list. Like the MMIO and I/O port apertures, this will be used when assigning resources to hot-added devices or in the case of conflicts. [bhelgaas: changelog, tidy printk] Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/amd_bus.c | 7 +++---- arch/x86/pci/bus_numa.c | 22 +++++++++++++++++++--- arch/x86/pci/bus_numa.h | 3 +-- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 5aed49bff05..e9e6ed5cdf9 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -121,7 +121,6 @@ static int __init early_fill_mp_bus_info(void) link = (reg >> 8) & 0x03; info = alloc_pci_root_info(min_bus, max_bus, node, link); - sprintf(info->name, "PCI Bus #%02x", min_bus); } /* get the default node and link for left over res */ @@ -300,9 +299,9 @@ static int __init early_fill_mp_bus_info(void) int busnum; struct pci_root_res *root_res; - busnum = info->bus_min; - printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", - info->bus_min, info->bus_max, info->node, info->link); + busnum = info->busn.start; + printk(KERN_DEBUG "bus: %pR on node %x link %x\n", + &info->busn, info->node, info->link); list_for_each_entry(root_res, &info->resources, list) printk(KERN_DEBUG "bus: %02x %pR\n", busnum, &root_res->res); diff --git a/arch/x86/pci/bus_numa.c 
b/arch/x86/pci/bus_numa.c index 306579f7d0f..d37e2fec97e 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -14,7 +14,7 @@ static struct pci_root_info *x86_find_pci_root_info(int bus) return NULL; list_for_each_entry(info, &pci_root_infos, list) - if (info->bus_min == bus) + if (info->busn.start == bus) return info; return NULL; @@ -24,6 +24,8 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) { struct pci_root_info *info = x86_find_pci_root_info(bus); struct pci_root_res *root_res; + struct pci_host_bridge_window *window; + bool found = false; if (!info) goto default_resources; @@ -31,6 +33,16 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", bus); + /* already added by acpi ? */ + list_for_each_entry(window, resources, list) + if (window->res->flags & IORESOURCE_BUS) { + found = true; + break; + } + + if (!found) + pci_add_resource(resources, &info->busn); + list_for_each_entry(root_res, &info->resources, list) { struct resource *res; struct resource *root; @@ -66,9 +78,13 @@ struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max, if (!info) return info; + sprintf(info->name, "PCI Bus #%02x", bus_min); + INIT_LIST_HEAD(&info->resources); - info->bus_min = bus_min; - info->bus_max = bus_max; + info->busn.name = info->name; + info->busn.start = bus_min; + info->busn.end = bus_max; + info->busn.flags = IORESOURCE_BUS; info->node = node; info->link = link; diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index 226a466b2b2..ff8f65b0457 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -13,8 +13,7 @@ struct pci_root_info { struct list_head list; char name[12]; struct list_head resources; - int bus_min; - int bus_max; + struct resource busn; int node; int link; }; -- cgit v1.2.3-70-g09d2 From 00763e41138267a307531397f9745835aecb8c7b Mon Sep 17 00:00:00 2001 From: Xudong Hao Date: Thu, 7 
Jun 2012 18:26:07 +0800 Subject: KVM: x86: change PT_FIRST_AVAIL_BITS_SHIFT to avoid conflict with EPT Dirty bit EPT Dirty bit use bit 9 as Intel SDM definition, to avoid conflict, change PT_FIRST_AVAIL_BITS_SHIFT to 10. Signed-off-by: Xudong Hao Signed-off-by: Xiantao Zhang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b32a11dc884..3b53d9e08bf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -90,7 +90,7 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define PT64_LEVEL_BITS 9 -- cgit v1.2.3-70-g09d2 From 3387e7d69048f5ab02729825f9611754850d9a87 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 14 Jun 2012 10:09:03 +0800 Subject: crypto: serpent-sse2/avx - allow both to be built into kernel Rename serpent-avx assembler functions so that they do not collide with serpent-sse2 assembler functions when linking both versions in to same kernel image. 
Reported-by: Randy Dunlap Cc: Johannes Goetzfried Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 12 +++--- arch/x86/crypto/serpent_avx_glue.c | 2 +- arch/x86/crypto/serpent_sse2_glue.c | 2 +- arch/x86/include/asm/serpent-avx.h | 32 +++++++++++++++ arch/x86/include/asm/serpent-sse2.h | 63 +++++++++++++++++++++++++++++ arch/x86/include/asm/serpent.h | 63 ----------------------------- 6 files changed, 103 insertions(+), 71 deletions(-) create mode 100644 arch/x86/include/asm/serpent-avx.h create mode 100644 arch/x86/include/asm/serpent-sse2.h delete mode 100644 arch/x86/include/asm/serpent.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 0ed47a124ba..504106bf04a 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -579,10 +579,10 @@ vmovdqu x3, (3*4*4)(out); .align 8 -.global __serpent_enc_blk_8way -.type __serpent_enc_blk_8way,@function; +.global __serpent_enc_blk_8way_avx +.type __serpent_enc_blk_8way_avx,@function; -__serpent_enc_blk_8way: +__serpent_enc_blk_8way_avx: /* input: * %rdi: ctx, CTX * %rsi: dst @@ -647,10 +647,10 @@ __enc_xor8: ret; .align 8 -.global serpent_dec_blk_8way -.type serpent_dec_blk_8way,@function; +.global serpent_dec_blk_8way_avx +.type serpent_dec_blk_8way_avx,@function; -serpent_dec_blk_8way: +serpent_dec_blk_8way_avx: /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 0dc7a26535e..dd81bab4f11 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 4b21be85e0a..deecd25c129 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ 
b/arch/x86/crypto/serpent_sse2_glue.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/serpent-avx.h b/arch/x86/include/asm/serpent-avx.h new file mode 100644 index 00000000000..432deedd294 --- /dev/null +++ b/arch/x86/include/asm/serpent-avx.h @@ -0,0 +1,32 @@ +#ifndef ASM_X86_SERPENT_AVX_H +#define ASM_X86_SERPENT_AVX_H + +#include +#include + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way_avx(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way_avx(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_8way_avx(ctx, dst, src); +} + +#endif diff --git a/arch/x86/include/asm/serpent-sse2.h b/arch/x86/include/asm/serpent-sse2.h new file mode 100644 index 00000000000..e6e77dffbda --- /dev/null +++ b/arch/x86/include/asm/serpent-sse2.h @@ -0,0 +1,63 @@ +#ifndef ASM_X86_SERPENT_SSE2_H +#define ASM_X86_SERPENT_SSE2_H + +#include +#include + +#ifdef CONFIG_X86_32 + +#define SERPENT_PARALLEL_BLOCKS 4 + +asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, true); +} + +static inline void 
serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_4way(ctx, dst, src); +} + +#else + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_8way(ctx, dst, src); +} + +#endif + +#endif diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h deleted file mode 100644 index d3ef63fe0c8..00000000000 --- a/arch/x86/include/asm/serpent.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef ASM_X86_SERPENT_H -#define ASM_X86_SERPENT_H - -#include -#include - -#ifdef CONFIG_X86_32 - -#define SERPENT_PARALLEL_BLOCKS 4 - -asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_4way(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_4way(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - serpent_dec_blk_4way(ctx, dst, src); -} - -#else - -#define SERPENT_PARALLEL_BLOCKS 8 - -asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, 
u8 *dst, - const u8 *src); - -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - serpent_dec_blk_8way(ctx, dst, src); -} - -#endif - -#endif -- cgit v1.2.3-70-g09d2 From c35f77417ebfc7c21c02aa9c8c30aa4cecf331d6 Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Mon, 11 Jun 2012 12:56:45 +0300 Subject: x86: Define early read-mostly per-cpu macros Some read-mostly per-cpu data may need to be declared or defined early, so it can be initialized and accessed before per_cpu areas are allocated. Only the data that resides in the per_cpu areas should be read-mostly, as there is little benefit in optimizing cache lines on initialization. Signed-off-by: Ido Yariv [ Added the missing declarations in !SMP code. ] Signed-off-by: Vlad Zolotarov Acked-by: Shai Fultheim Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/46188571.ddB8aVQYWo@vlad Signed-off-by: Ingo Molnar --- arch/x86/include/asm/percpu.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d9b8e3f7f42..1104afaba52 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -551,6 +551,12 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off); { [0 ... NR_CPUS-1] = _initvalue }; \ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map +#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \ + DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue; \ + __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \ + { [0 ... 
NR_CPUS-1] = _initvalue }; \ + __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map + #define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ EXPORT_PER_CPU_SYMBOL(_name) @@ -559,6 +565,11 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off); extern __typeof__(_type) *_name##_early_ptr; \ extern __typeof__(_type) _name##_early_map[] +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ + DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ + extern __typeof__(_type) _name##_early_map[] + #define early_per_cpu_ptr(_name) (_name##_early_ptr) #define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) #define early_per_cpu(_name, _cpu) \ @@ -570,12 +581,18 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off); #define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ DEFINE_PER_CPU(_type, _name) = _initvalue +#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \ + DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue + #define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ EXPORT_PER_CPU_SYMBOL(_name) #define DECLARE_EARLY_PER_CPU(_type, _name) \ DECLARE_PER_CPU(_type, _name) +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ + DECLARE_PER_CPU_READ_MOSTLY(_type, _name) + #define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) #define early_per_cpu_ptr(_name) NULL /* no early_per_cpu_map() */ -- cgit v1.2.3-70-g09d2 From 0816b0f0365539c8f6280634d2c1778d0108d8f5 Mon Sep 17 00:00:00 2001 From: Vlad Zolotarov Date: Mon, 11 Jun 2012 12:56:52 +0300 Subject: x86: Add read_mostly declaration/definition to variables from smp.h Add "read-mostly" qualifier to the following variables in smp.h: - cpu_sibling_map - cpu_core_map - cpu_llc_shared_map - cpu_llc_id - cpu_number - x86_cpu_to_apicid - x86_bios_cpu_apicid - x86_cpu_to_logical_apicid As long as all the variables above are only written during the initialization, this change is meant to prevent the false sharing. 
More specifically, on vSMP Foundation platform x86_cpu_to_apicid shared the same internode_cache_line with frequently written lapic_events. From the analysis of the first 33 per_cpu variables out of 219 (memories they describe, to be more specific) the 8 have read_mostly nature (tlb_vector_offset, cpu_loops_per_jiffy, xen_debug_irq, etc.) and 25 are frequently written (irq_stack_union, gdt_page, exception_stacks, idt_desc, etc.). Assuming that the spread of the rest of the per_cpu variables is similar, identifying the read mostly memories will make more sense in terms of long-term code maintenance comparing to identifying frequently written memories. Signed-off-by: Vlad Zolotarov Acked-by: Shai Fultheim Cc: Shai Fultheim (Shai@ScaleMP.com) Cc: ido@wizery.com Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1719258.EYKzE4Zbq5@vlad Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/smp.h | 16 ++++++++-------- arch/x86/kernel/apic/apic.c | 6 +++--- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 8 ++++---- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eaff4790ed9..a907d4d251a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -537,7 +537,7 @@ static inline const struct cpumask *default_target_cpus(void) #endif } -DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); static inline unsigned int read_apic_id(void) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index f48394513c3..cc1df2b5cc6 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -31,12 +31,12 @@ static inline bool cpu_has_ht_siblings(void) return has_siblings; } -DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); -DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); /* cpus sharing the last level cache: */ -DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); -DECLARE_PER_CPU(u16, cpu_llc_id); -DECLARE_PER_CPU(int, cpu_number); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); static inline struct cpumask *cpu_sibling_mask(int cpu) { @@ -53,10 +53,10 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) return per_cpu(cpu_llc_shared_map, cpu); } -DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); -DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); #endif /* Static state in head.S used to set up a CPU */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e094a..0443b648221 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -75,8 +75,8 @@ physid_mask_t phys_cpu_present_map; /* * Map cpu index to physical APIC ID */ -DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); @@ -88,7 +88,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); * used for the mapping. This is where the behaviors of x86_64 and 32 * actually diverge. Let's keep it ugly for now. 
*/ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); /* * Knob to control our willingness to enable the local APIC. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5a98aa27218..5cdff035774 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -21,7 +21,7 @@ #include #include -DEFINE_PER_CPU(int, cpu_number); +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3fab55bea29..e61110e29a8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -104,17 +104,17 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; /* representing HT siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); -DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); -- cgit v1.2.3-70-g09d2 From cac4afbc3da58d9e5701b34bd4c1f11ea13328d4 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 12:39:34 +0200 Subject: x86/x2apic/cluster: Vector_allocation_domain() should return a value Since commit 8637e38 ("x86/apic: Avoid useless scanning thru a cpumask in assign_irq_vector()") vector_allocation_domain() operation indicates if a cpumask is dynamic 
or static. This update fixes the oversight and makes the operation return a value. Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614103933.GJ3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 23a46cf5b6f..1885a73b7f3 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -228,10 +228,11 @@ static int x2apic_cluster_probe(void) /* * Each x2apic cluster is an allocation domain. */ -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); + return true; } static struct apic apic_x2apic_cluster = { -- cgit v1.2.3-70-g09d2 From a5a391561bc25898ba1a702a0c4b028aa5b11ce9 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:49:35 +0200 Subject: x86/apic: Eliminate cpu_mask_to_apicid() operation Since there are only two locations where cpu_mask_to_apicid() is called from, remove the operation and use only cpu_mask_to_apicid_and() instead.
Signed-off-by: Alexander Gordeev Suggested-and-acked-by: Suresh Siddha Acked-by: Yinghai Lu Link: http://lkml.kernel.org/r/20120614074935.GE3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 33 ++++++++------------------------- arch/x86/kernel/apic/apic.c | 24 ++++++------------------ arch/x86/kernel/apic/apic_flat_64.c | 2 -- arch/x86/kernel/apic/apic_noop.c | 1 - arch/x86/kernel/apic/apic_numachip.c | 1 - arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/es7000_32.c | 4 +--- arch/x86/kernel/apic/io_apic.c | 3 ++- arch/x86/kernel/apic/numaq_32.c | 8 -------- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/summit_32.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 17 ----------------- arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/apic/x2apic_uv_x.c | 29 ++++++----------------------- arch/x86/platform/uv/uv_irq.c | 2 +- 15 files changed, 25 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1ed3eead203..eec240e1209 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -331,8 +331,6 @@ struct apic { unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; - int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, - unsigned int *apicid); int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid); @@ -594,9 +592,15 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif static inline int -__flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) { - cpu_mask = cpu_mask & APIC_ALL_CPUS & cpumask_bits(cpu_online_mask)[0]; + unsigned long cpu_mask = cpumask_bits(cpumask)[0] & + cpumask_bits(andmask)[0] & + cpumask_bits(cpu_online_mask)[0] & + APIC_ALL_CPUS; + if 
(likely(cpu_mask)) { *apicid = (unsigned int)cpu_mask; return 0; @@ -605,27 +609,6 @@ __flat_cpu_mask_to_apicid(unsigned long cpu_mask, unsigned int *apicid) } } -static inline int -flat_cpu_mask_to_apicid(const struct cpumask *cpumask, - unsigned int *apicid) -{ - return __flat_cpu_mask_to_apicid(cpumask_bits(cpumask)[0], apicid); -} - -static inline int -flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - unsigned long mask1 = cpumask_bits(cpumask)[0]; - unsigned long mask2 = cpumask_bits(andmask)[0]; - return __flat_cpu_mask_to_apicid(mask1 & mask2, apicid); -} - -extern int -default_cpu_mask_to_apicid(const struct cpumask *cpumask, - unsigned int *apicid); - extern int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7e9bbe73bc5..048a4f806d4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,23 +2123,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline int __default_cpu_to_apicid(int cpu, unsigned int *apicid) -{ - if (likely((unsigned int)cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - return 0; - } else { - return -EINVAL; - } -} - -int default_cpu_mask_to_apicid(const struct cpumask *cpumask, - unsigned int *apicid) -{ - int cpu = cpumask_first_and(cpumask, cpu_online_mask); - return __default_cpu_to_apicid(cpu, apicid); -} - int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid) @@ -2151,7 +2134,12 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - return __default_cpu_to_apicid(cpu, apicid); + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + return 0; + } else { + return -EINVAL; + } } /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c 
b/arch/x86/kernel/apic/apic_flat_64.c index bddc92566d0..00c77cf78e9 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -191,7 +191,6 @@ static struct apic apic_flat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = flat_send_IPI_mask, @@ -308,7 +307,6 @@ static struct apic apic_physflat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = physflat_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index ac9edf247b1..65c07fc630a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -159,7 +159,6 @@ struct apic apic_noop = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = noop_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c028132ad35..bc552cff257 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -234,7 +234,6 @@ static struct apic apic_numachip __refconst = { .set_apic_id = set_apic_id, .apic_id_mask = 0xffU << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = numachip_send_IPI_mask, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index df342fe4d6a..d50e3640d5a 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -188,7 +188,6 @@ static struct apic apic_bigsmp = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = 
default_cpu_mask_to_apicid_and, .send_IPI_mask = bigsmp_send_IPI_mask, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index b35cfb9b696..2c5317ea1b8 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -525,7 +525,7 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid) return 1; } -static int +static inline int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; @@ -643,7 +643,6 @@ static struct apic __refdata apic_es7000_cluster = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, @@ -710,7 +709,6 @@ static struct apic __refdata apic_es7000 = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0deb773404e..0540f083f45 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1492,7 +1492,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, * We use logical delivery to get the timer IRQ * to the first CPU. 
*/ - if (unlikely(apic->cpu_mask_to_apicid(apic->target_cpus(), &dest))) + if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), + apic->target_cpus(), &dest))) dest = BAD_APICID; entry.dest_mode = apic->irq_dest_mode; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 2b55514c328..d661ee95cab 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -406,13 +406,6 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static int -numaq_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) -{ - *apicid = 0x0F; - return 0; -} - static int numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, @@ -499,7 +492,6 @@ static struct apic __refdata apic_numaq = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, .send_IPI_mask = numaq_send_IPI_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 2c6f003b2e4..eef6bcd1bf1 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -108,7 +108,6 @@ static struct apic apic_default = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = default_send_IPI_mask_logical, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 79d360f6729..bbad180f289 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -263,7 +263,7 @@ static int summit_check_phys_apicid_present(int physical_apicid) return 1; } -static int +static inline int summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int 
round = 0; @@ -516,7 +516,6 @@ static struct apic apic_summit = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = summit_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, .send_IPI_mask = summit_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 1885a73b7f3..943d03fc6fc 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -96,22 +96,6 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static int -x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) -{ - int cpu = cpumask_first_and(cpumask, cpu_online_mask); - int i; - - if (cpu >= nr_cpu_ids) - return -EINVAL; - - *apicid = 0; - for_each_cpu_and(i, cpumask, per_cpu(cpus_in_cluster, cpu)) - *apicid |= per_cpu(x86_cpu_to_logical_apicid, i); - - return 0; -} - static int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, @@ -270,7 +254,6 @@ static struct apic apic_x2apic_cluster = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f109388a0e8..e03a1e180e8 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -123,7 +123,6 @@ static struct apic apic_x2apic_phys = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 307aa076bd6..026de0114d1 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ 
b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -269,27 +269,6 @@ static void uv_init_apic_ldr(void) { } -static inline int __uv_cpu_to_apicid(int cpu, unsigned int *apicid) -{ - if (likely((unsigned int)cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - return 0; - } else { - return -EINVAL; - } -} - -static int -uv_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first_and(cpumask, cpu_online_mask); - return __uv_cpu_to_apicid(cpu, apicid); -} - static int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, @@ -306,7 +285,12 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - return __uv_cpu_to_apicid(cpu, apicid); + if (likely((unsigned int)cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + return 0; + } else { + return -EINVAL; + } } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -384,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .send_IPI_mask = uv_send_IPI_mask, diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index dd1ff39a464..a67c7a6bac7 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -144,7 +144,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; - err = apic->cpu_mask_to_apicid(eligible_cpu, &dest); + err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); if (err != 0) return err; -- cgit v1.2.3-70-g09d2 From ea3807ea52a53f2cdfd60c89d8491fc9a8208d1c Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:49:55 +0200 Subject: x86/apic: Fix ugly casting and 
branching in cpu_mask_to_apicid_and() Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614074954.GF3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++++---- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 048a4f806d4..c421512ca5e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2127,19 +2127,19 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid) { - int cpu; + unsigned int cpu; for_each_cpu_and(cpu, cpumask, andmask) { if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (likely((unsigned int)cpu < nr_cpu_ids)) { + if (likely(cpu < nr_cpu_ids)) { *apicid = per_cpu(x86_cpu_to_apicid, cpu); return 0; - } else { - return -EINVAL; } + + return -EINVAL; } /* diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 2c5317ea1b8..effece2ea0d 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -529,7 +529,7 @@ static inline int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, uninitialized_var(apicid); + unsigned int cpu, uninitialized_var(apicid); /* * The cpus in the mask must all be on the apic cluster. 
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index bbad180f289..b53fd6c9993 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -267,7 +267,7 @@ static inline int summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, apicid = 0; + unsigned int cpu, apicid = 0; /* * The cpus in the mask must all be on the apic cluster. diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 026de0114d1..8cfade9510a 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -274,7 +274,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid) { - int cpu; + int unsigned cpu; /* * We're using fixed IRQ delivery, can only return one phys APIC ID. @@ -285,12 +285,12 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (likely((unsigned int)cpu < nr_cpu_ids)) { + if (likely(cpu < nr_cpu_ids)) { *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; return 0; - } else { - return -EINVAL; } + + return -EINVAL; } static unsigned int x2apic_get_apic_id(unsigned long x) -- cgit v1.2.3-70-g09d2 From 49ad3fd4834182cce9725abb98e080b479fed464 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:50:11 +0200 Subject: x86/apic/es7000+summit: Fix compile warning in cpu_mask_to_apicid() Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614075010.GG3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index effece2ea0d..0c1347df3ad 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ 
b/arch/x86/kernel/apic/es7000_32.c @@ -554,8 +554,8 @@ es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask, unsigned int *apicid) { - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) return 0; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index b53fd6c9993..e6cc1829f7c 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -291,8 +291,8 @@ summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask, unsigned int *apicid) { - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) return 0; -- cgit v1.2.3-70-g09d2 From 214e270b5f5f6a85400a817d5305c797b2b7467a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:50:27 +0200 Subject: x86/apic/es7000+summit: Always make valid apicid from a cpumask In case of invalid parameters cpu_mask_to_apicid_and() might return apicid value of 0 (on Summit) or a uninitialized value (on ES7000), although it is supposed to return apicid of cpu-0 at least. Fix the operation to always return a valid apicid. 
Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614075026.GH3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 ++ arch/x86/kernel/apic/summit_32.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 0c1347df3ad..9882093f26e 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -545,6 +545,8 @@ es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) apicid = new_apicid; round++; } + if (!round) + return -EINVAL; *dest_id = apicid; return 0; } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index e6cc1829f7c..b6e61857c29 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -282,6 +282,8 @@ summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) apicid |= new_apicid; round++; } + if (!round) + return -EINVAL; *dest_id = apicid; return 0; } -- cgit v1.2.3-70-g09d2 From 5a0a2a308113086cc800a203d903271c9caa1611 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 14 Jun 2012 09:50:44 +0200 Subject: x86/apic/es7000: Make apicid of a cluster (not CPU) from a cpumask cpu_mask_to_apicid_and() always returns apicid of a single CPU, even in case multiple CPUs were requested. This update fixes a typo and forces apicid of a cluster to be returned. 
Signed-off-by: Alexander Gordeev Cc: Suresh Siddha Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20120614075043.GI3383@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 9882093f26e..0874799a98c 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -542,7 +542,7 @@ es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) return -EINVAL; } - apicid = new_apicid; + apicid |= new_apicid; round++; } if (!round) -- cgit v1.2.3-70-g09d2 From d48daf37a3d2e2b28a61e615c0fc538301edb0dd Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Thu, 14 Jun 2012 18:43:08 +0300 Subject: x86/vsmp: Fix linker error when CONFIG_PROC_FS is not set set_vsmp_pv_ops() references no_irq_affinity which is undeclared if CONFIG_PROC_FS isn't set. Fix this by adding an #ifdef around this variable's access. 
Reported-by: Fengguang Wu Signed-off-by: Ido Yariv Acked-by: Shai Fultheim Link: http://lkml.kernel.org/r/1339688588-12674-1-git-send-email-ido@wizery.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 6b96a7374f9..3f0285ac00f 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -101,7 +101,10 @@ static void __init set_vsmp_pv_ops(void) #ifdef CONFIG_SMP if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); +#ifdef CONFIG_PROC_FS + /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; +#endif } #endif -- cgit v1.2.3-70-g09d2 From 7eb9ae0799b1e9f0b77733b432bc5f6f055b020b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 14 Jun 2012 18:28:49 -0700 Subject: irq/apic: Use config_enabled(CONFIG_SMP) checks to clean up irq_set_affinity() for UP Move the ->irq_set_affinity() routines out of the #ifdef CONFIG_SMP sections and use config_enabled(CONFIG_SMP) checks inside those routines. Thus making those routines simple null stubs for !CONFIG_SMP and retaining those routines with no additional runtime overhead for CONFIG_SMP kernels. Cleans up the ifdef CONFIG_SMP in and around routines related to irq_set_affinity in io_apic and irq_remapping subsystems. 
Signed-off-by: Suresh Siddha Cc: torvalds@linux-foundation.org Cc: joerg.roedel@amd.com Cc: Sam Ravnborg Cc: Paul Gortmaker Link: http://lkml.kernel.org/r/1339723729.3475.63.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 180 ++++++++++++++++-------------------- drivers/iommu/intel_irq_remapping.c | 7 +- drivers/iommu/irq_remapping.c | 5 +- drivers/iommu/irq_remapping.h | 2 - include/linux/irq.h | 2 - 5 files changed, 86 insertions(+), 110 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7cbd397884f..a951ef7decb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2224,81 +2224,6 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(cfg)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. 
- */ -int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int irq = data->irq; - int err; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - cpumask_copy(data->affinity, mask); - - return 0; -} - -static int -ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = __ioapic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, data->chip_data); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2386,6 +2311,87 @@ void irq_force_complete_move(int irq) static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. 
+ */ + if (!irq_remapped(cfg)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + } +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int irq = data->irq; + int err; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + cpumask_copy(data->affinity, mask); + + return 0; +} + +static int +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + unsigned int dest, irq = data->irq; + unsigned long flags; + int ret; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ret = __ioapic_set_affinity(data, mask, &dest); + if (!ret) { + /* Only the high 8 bits are valid. 
*/ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, data->chip_data); + ret = IRQ_SET_MASK_OK_NOCOPY; + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return ret; +} + static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); @@ -2565,9 +2571,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_ack = ir_ack_apic_edge; chip->irq_eoi = ir_ack_apic_level; -#ifdef CONFIG_SMP chip->irq_set_affinity = set_remapped_irq_affinity; -#endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2578,9 +2582,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_unmask = unmask_ioapic_irq, .irq_ack = ack_apic_edge, .irq_eoi = ack_apic_level, -#ifdef CONFIG_SMP .irq_set_affinity = ioapic_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3099,7 +3101,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, return err; } -#ifdef CONFIG_SMP static int msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3121,7 +3122,6 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, @@ -3132,9 +3132,7 @@ static struct irq_chip msi_chip = { .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3219,7 +3217,6 @@ void native_teardown_msi_irq(unsigned int irq) } #ifdef CONFIG_DMAR_TABLE -#ifdef CONFIG_SMP static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) @@ -3244,16 +3241,12 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = 
ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = dmar_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3274,7 +3267,6 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER -#ifdef CONFIG_SMP static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3297,16 +3289,12 @@ static int hpet_msi_set_affinity(struct irq_data *data, return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = hpet_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3341,8 +3329,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) */ #ifdef CONFIG_HT_IRQ -#ifdef CONFIG_SMP - static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) { struct ht_irq_msg msg; @@ -3370,16 +3356,12 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return IRQ_SET_MASK_OK_NOCOPY; } -#endif - static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = ht_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 853902a1b7d..e0b18f3ae9a 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -902,7 +902,6 @@ static int intel_setup_ioapic_entry(int irq, return 0; } -#ifdef CONFIG_SMP /* * Migrate the IO-APIC irq in the presence of intr-remapping. 
* @@ -926,6 +925,9 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irte irte; int err; + if (!config_enabled(CONFIG_SMP)) + return -EINVAL; + if (!cpumask_intersects(mask, cpu_online_mask)) return -EINVAL; @@ -963,7 +965,6 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, cpumask_copy(data->affinity, mask); return 0; } -#endif static void intel_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, @@ -1065,9 +1066,7 @@ struct irq_remap_ops intel_irq_remap_ops = { .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, .setup_ioapic_entry = intel_setup_ioapic_entry, -#ifdef CONFIG_SMP .set_affinity = intel_ioapic_set_affinity, -#endif .free_irq = free_irte, .compose_msi_msg = intel_compose_msi_msg, .msi_alloc_irq = intel_msi_alloc_irq, diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 40cda8e98d8..1d29b1c66e7 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -111,16 +111,15 @@ int setup_ioapic_remapped_entry(int irq, vector, attr); } -#ifdef CONFIG_SMP int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - if (!remap_ops || !remap_ops->set_affinity) + if (!config_enabled(CONFIG_SMP) || !remap_ops || + !remap_ops->set_affinity) return 0; return remap_ops->set_affinity(data, mask, force); } -#endif void free_remapped_irq(int irq) { diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index be9d72950c5..b12974cc1df 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -59,11 +59,9 @@ struct irq_remap_ops { unsigned int, int, struct io_apic_irq_attr *); -#ifdef CONFIG_SMP /* Set the CPU affinity of a remapped interrupt */ int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, bool force); -#endif /* Free an IRQ */ int (*free_irq)(int); diff --git a/include/linux/irq.h 
b/include/linux/irq.h index 61f5cec031e..47a937cd84a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -150,9 +150,7 @@ struct irq_data { void *handler_data; void *chip_data; struct msi_desc *msi_desc; -#ifdef CONFIG_SMP cpumask_var_t affinity; -#endif }; /* -- cgit v1.2.3-70-g09d2 From 650513979a437c32d7a0a84f0ed952a55bbb5583 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 16 Jun 2012 21:47:37 -0700 Subject: x86-64, reboot: Allow reboot=bios and reboot-cpu override on x86-64 With the revamped realmode trampoline code, it is trivial to extend support for reboot=bios to x86-64. Furthermore, while we are at it, remove the restriction that we can only override the reboot CPU on 32 bits. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-jopx7y6g6dbcx4tpal8q0jlr@git.kernel.org --- arch/x86/include/asm/emergency-restart.h | 2 - arch/x86/include/asm/realmode.h | 3 +- arch/x86/include/asm/reboot.h | 4 +- arch/x86/kernel/reboot.c | 52 +++++------ arch/x86/realmode/rm/Makefile | 2 +- arch/x86/realmode/rm/header.S | 4 +- arch/x86/realmode/rm/reboot.S | 152 +++++++++++++++++++++++++++++++ arch/x86/realmode/rm/reboot_32.S | 132 --------------------------- 8 files changed, 184 insertions(+), 167 deletions(-) create mode 100644 arch/x86/realmode/rm/reboot.S delete mode 100644 arch/x86/realmode/rm/reboot_32.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index cc70c1c78ca..75ce3f47d20 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -4,9 +4,7 @@ enum reboot_type { BOOT_TRIPLE = 't', BOOT_KBD = 'k', -#ifdef CONFIG_X86_32 BOOT_BIOS = 'b', -#endif BOOT_ACPI = 'a', BOOT_EFI = 'e', BOOT_CF9 = 'p', diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fce3f4ae5bd..fe1ec5bcd84 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -21,8 +21,9 @@ struct
real_mode_header { u32 wakeup_header; #endif /* APM/BIOS reboot */ -#ifdef CONFIG_X86_32 u32 machine_real_restart_asm; +#ifdef CONFIG_X86_64 + u32 machine_real_restart_seg; #endif }; diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 92f297069e8..a82c4f1b4d8 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -18,8 +18,8 @@ extern struct machine_ops machine_ops; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); -void machine_real_restart(unsigned int type); -/* These must match dispatch_table in reboot_32.S */ +void __noreturn machine_real_restart(unsigned int type); +/* These must match dispatch in arch/x86/realmode/rm/reboot.S */ #define MRR_BIOS 0 #define MRR_APM 1 diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 25b48edb847..6ddb9cd0ced 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -20,14 +20,12 @@ #include #include #include +#include -#ifdef CONFIG_X86_32 -# include -# include -# include -#else -# include -#endif +#include +#include +#include +#include /* * Power off function, if any @@ -49,7 +47,7 @@ int reboot_force; */ static int reboot_default = 1; -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP static int reboot_cpu = -1; #endif @@ -67,8 +65,8 @@ bool port_cf9_safe = false; * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] * warm Don't set the cold reboot flag * cold Set the cold reboot flag - * bios Reboot by jumping through the BIOS (only for X86_32) - * smp Reboot by executing reset on BSP or other CPU (only for X86_32) + * bios Reboot by jumping through the BIOS + * smp Reboot by executing reset on BSP or other CPU * triple Force a triple fault (init) * kbd Use the keyboard controller.
cold reset (default) * acpi Use the RESET_REG in the FADT @@ -95,7 +93,6 @@ static int __init reboot_setup(char *str) reboot_mode = 0; break; -#ifdef CONFIG_X86_32 #ifdef CONFIG_SMP case 's': if (isdigit(*(str+1))) { @@ -112,7 +109,6 @@ static int __init reboot_setup(char *str) #endif /* CONFIG_SMP */ case 'b': -#endif case 'a': case 'k': case 't': @@ -138,7 +134,6 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_X86_32 /* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". :-) @@ -157,11 +152,8 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } -void machine_real_restart(unsigned int type) +void __noreturn machine_real_restart(unsigned int type) { - void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) - real_mode_header->machine_real_restart_asm; - local_irq_disable(); /* @@ -181,7 +173,11 @@ void machine_real_restart(unsigned int type) /* * Switch back to the initial page table. */ +#ifdef CONFIG_X86_32 load_cr3(initial_page_table); +#else + write_cr3(real_mode_header->trampoline_pgd); +#endif /* * Write 0x1234 to absolute memory location 0x472. The BIOS reads @@ -192,14 +188,21 @@ void machine_real_restart(unsigned int type) *((unsigned short *)0x472) = reboot_mode; /* Jump to the identity-mapped low memory code */ - restart_lowmem(type); +#ifdef CONFIG_X86_32 + asm volatile("jmpl *%0" : : + "rm" (real_mode_header->machine_real_restart_asm), + "a" (type)); +#else + asm volatile("ljmpl *%0" : : + "m" (real_mode_header->machine_real_restart_asm), + "D" (type)); +#endif + unreachable(); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif -#endif /* CONFIG_X86_32 */ - /* * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot */ @@ -223,11 +226,9 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) } /* - * This is a single dmi_table handling all reboot quirks. 
Note that - * REBOOT_BIOS is only available for 32bit + * This is a single dmi_table handling all reboot quirks. */ static struct dmi_system_id __initdata reboot_dmi_table[] = { -#ifdef CONFIG_X86_32 { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, .ident = "Dell E520", @@ -377,7 +378,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, -#endif /* CONFIG_X86_32 */ { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, @@ -576,13 +576,11 @@ static void native_machine_emergency_restart(void) reboot_type = BOOT_KBD; break; -#ifdef CONFIG_X86_32 case BOOT_BIOS: machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; -#endif case BOOT_ACPI: acpi_reboot(); @@ -624,12 +622,10 @@ void native_machine_shutdown(void) /* The boot cpu is always logical cpu 0 */ int reboot_cpu_id = 0; -#ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && cpu_online(reboot_cpu)) reboot_cpu_id = reboot_cpu; -#endif /* Make certain the cpu I'm about to reboot on is online */ if (!cpu_online(reboot_cpu_id)) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 5b84a2d3088..b2d534cab25 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -22,7 +22,7 @@ wakeup-objs += video-bios.o realmode-y += header.o realmode-y += trampoline_$(BITS).o realmode-y += stack.o -realmode-$(CONFIG_X86_32) += reboot_32.o +realmode-y += reboot.o realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) targets += $(realmode-y) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index fadf48378ad..a28221d94e6 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -6,6 +6,7 @@ #include #include +#include #include "realmode.h" @@ -28,8 +29,9 @@ GLOBAL(real_mode_header) .long pa_wakeup_header #endif /* APM/BIOS reboot */ -#ifdef 
CONFIG_X86_32 .long pa_machine_real_restart_asm +#ifdef CONFIG_X86_64 + .long __KERNEL32_CS #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S new file mode 100644 index 00000000000..6bf8feac555 --- /dev/null +++ b/arch/x86/realmode/rm/reboot.S @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include "realmode.h" + +/* + * The following code and data reboots the machine by switching to real + * mode and jumping to the BIOS reset entry point, as if the CPU has + * really been reset. The previous version asked the keyboard + * controller to pulse the CPU reset line, which is more thorough, but + * doesn't work with at least one type of 486 motherboard. It is easy + * to stop this code working; hence the copious comments. + * + * This code is called with the restart type (0 = BIOS, 1 = APM) in + * the primary argument register (%eax for 32 bit, %edi for 64 bit). + */ + .section ".text32", "ax" + .code32 +ENTRY(machine_real_restart_asm) + +#ifdef CONFIG_X86_64 + + /* Disable paging to drop us out of long mode */ + movl %cr0, %eax + andl $~X86_CR0_PG, %eax + movl %eax, %cr0 + jmp 1f /* "A branch" may be needed here, assume near is OK */ + +1: + xorl %eax, %eax + xorl %edx, %edx + movl $MSR_EFER, %ecx + wrmsr + + movl %edi, %eax + +#endif /* CONFIG_X86_64 */ + + /* Set up the IDT for real mode. */ + lidtl pa_machine_real_restart_idt + + /* + * Set up a GDT from which we can load segment descriptors for real + * mode. The GDT is not used in real mode; it is just needed here to + * prepare the descriptors. + */ + lgdtl pa_machine_real_restart_gdt + + /* + * Load the data segment registers with 16-bit compatible values + */ + movl $16, %ecx + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + ljmpw $8, $1f + +/* + * This is 16-bit protected mode code to disable paging and the cache, + * switch to real mode and jump to the BIOS reset code. 
+ * + * The instruction that switches to real mode by writing to CR0 must be + * followed immediately by a far jump instruction, which set CS to a + * valid value for real mode, and flushes the prefetch queue to avoid + * running instructions that have already been decoded in protected + * mode. + * + * Clears all the flags except ET, especially PG (paging), PE + * (protected-mode enable) and TS (task switch for coprocessor state + * save). Flushes the TLB after paging has been disabled. Sets CD and + * NW, to disable the cache on a 486, and invalidates the cache. This + * is more like the state of a 486 after reset. I don't know if + * something else should be done for other chips. + * + * More could be done here to set up the registers as if a CPU reset had + * occurred; hopefully real BIOSs don't assume much. This is not the + * actual BIOS entry point, anyway (that is at 0xfffffff0). + * + * Most of this work is probably excessive, but it is what is tested. + */ + .text + .code16 + + .balign 16 +machine_real_restart_asm16: +1: + xorl %ecx, %ecx + movl %cr0, %edx + andl $0x00000011, %edx + orl $0x60000000, %edx + movl %edx, %cr0 + movl %ecx, %cr3 + movl %cr0, %edx + testl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + jz 2f + wbinvd +2: + andb $0x10, %dl + movl %edx, %cr0 + LJMPW_RM(3f) +3: + andw %ax, %ax + jz bios + +apm: + movw $0x1000, %ax + movw %ax, %ss + movw $0xf000, %sp + movw $0x5307, %ax + movw $0x0001, %bx + movw $0x0003, %cx + int $0x15 + /* This should never return... 
*/ + +bios: + ljmpw $0xf000, $0xfff0 + + .section ".rodata", "a" + + .balign 16 +GLOBAL(machine_real_restart_idt) + .word 0xffff /* Length - real mode default value */ + .long 0 /* Base - real mode default value */ +END(machine_real_restart_idt) + + .balign 16 +GLOBAL(machine_real_restart_gdt) + /* Self-pointer */ + .word 0xffff /* Length - real mode default value */ + .long pa_machine_real_restart_gdt + .word 0 + + /* + * 16-bit code segment pointing to real_mode_seg + * Selector value 8 + */ + .word 0xffff /* Limit */ + .long 0x9b000000 + pa_real_mode_base + .word 0 + + /* + * 16-bit data segment with the selector value 16 = 0x10 and + * base value 0x100; since this is consistent with real mode + * semantics we don't have to reload the segments once CR0.PE = 0. + */ + .quad GDT_ENTRY(0x0093, 0x100, 0xffff) +END(machine_real_restart_gdt) diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S deleted file mode 100644 index 114044876b3..00000000000 --- a/arch/x86/realmode/rm/reboot_32.S +++ /dev/null @@ -1,132 +0,0 @@ -#include -#include -#include -#include -#include "realmode.h" - -/* - * The following code and data reboots the machine by switching to real - * mode and jumping to the BIOS reset entry point, as if the CPU has - * really been reset. The previous version asked the keyboard - * controller to pulse the CPU reset line, which is more thorough, but - * doesn't work with at least one type of 486 motherboard. It is easy - * to stop this code working; hence the copious comments. - * - * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. - */ - .section ".text32", "ax" - .code32 - - .balign 16 -ENTRY(machine_real_restart_asm) - /* Set up the IDT for real mode. */ - lidtl pa_machine_real_restart_idt - - /* - * Set up a GDT from which we can load segment descriptors for real - * mode. The GDT is not used in real mode; it is just needed here to - * prepare the descriptors. 
- */ - lgdtl pa_machine_real_restart_gdt - - /* - * Load the data segment registers with 16-bit compatible values - */ - movl $16, %ecx - movl %ecx, %ds - movl %ecx, %es - movl %ecx, %fs - movl %ecx, %gs - movl %ecx, %ss - ljmpw $8, $1f - -/* - * This is 16-bit protected mode code to disable paging and the cache, - * switch to real mode and jump to the BIOS reset code. - * - * The instruction that switches to real mode by writing to CR0 must be - * followed immediately by a far jump instruction, which set CS to a - * valid value for real mode, and flushes the prefetch queue to avoid - * running instructions that have already been decoded in protected - * mode. - * - * Clears all the flags except ET, especially PG (paging), PE - * (protected-mode enable) and TS (task switch for coprocessor state - * save). Flushes the TLB after paging has been disabled. Sets CD and - * NW, to disable the cache on a 486, and invalidates the cache. This - * is more like the state of a 486 after reset. I don't know if - * something else should be done for other chips. - * - * More could be done here to set up the registers as if a CPU reset had - * occurred; hopefully real BIOSs don't assume much. This is not the - * actual BIOS entry point, anyway (that is at 0xfffffff0). - * - * Most of this work is probably excessive, but it is what is tested. - */ - .text - .code16 - - .balign 16 -machine_real_restart_asm16: -1: - xorl %ecx, %ecx - movl %cr0, %edx - andl $0x00000011, %edx - orl $0x60000000, %edx - movl %edx, %cr0 - movl %ecx, %cr3 - movl %cr0, %edx - testl $0x60000000, %edx /* If no cache bits -> no wbinvd */ - jz 2f - wbinvd -2: - andb $0x10, %dl - movl %edx, %cr0 - LJMPW_RM(3f) -3: - andw %ax, %ax - jz bios - -apm: - movw $0x1000, %ax - movw %ax, %ss - movw $0xf000, %sp - movw $0x5307, %ax - movw $0x0001, %bx - movw $0x0003, %cx - int $0x15 - /* This should never return... 
*/ - -bios: - ljmpw $0xf000, $0xfff0 - - .section ".rodata", "a" - - .balign 16 -GLOBAL(machine_real_restart_idt) - .word 0xffff /* Length - real mode default value */ - .long 0 /* Base - real mode default value */ -END(machine_real_restart_idt) - - .balign 16 -GLOBAL(machine_real_restart_gdt) - /* Self-pointer */ - .word 0xffff /* Length - real mode default value */ - .long pa_machine_real_restart_gdt - .word 0 - - /* - * 16-bit code segment pointing to real_mode_seg - * Selector value 8 - */ - .word 0xffff /* Limit */ - .long 0x9b000000 + pa_real_mode_base - .word 0 - - /* - * 16-bit data segment with the selector value 16 = 0x10 and - * base value 0x100; since this is consistent with real mode - * semantics we don't have to reload the segments once CR0.PE = 0. - */ - .quad GDT_ENTRY(0x0093, 0x100, 0xffff) -END(machine_real_restart_gdt) -- cgit v1.2.3-70-g09d2 From abf71f3066740f3b59c3f731b4b68ed335f7b24d Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Fri, 15 Jun 2012 18:10:55 +0300 Subject: x86/vsmp: Fix vector_allocation_domain's return value Commit 8637e38a ("x86/apic: Avoid useless scanning thru a cpumask in assign_irq_vector()") modified vector_allocation_domain() to return a boolean indicating if cpumask is dynamic or static. Adjust vSMP's callback implementation accordingly. Signed-off-by: Ido Yariv Acked-by: Shai Fultheim Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/1339773055-27397-1-git-send-email-ido@wizery.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 3f0285ac00f..fa5adb7c228 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -208,9 +208,10 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) * In vSMP, all cpus should be capable of handling interrupts, regardless of * the APIC used. 
*/ -static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +static bool fill_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_setall(retmask); + return false; } static void vsmp_apic_post_init(void) -- cgit v1.2.3-70-g09d2 From 76958a61e42fb6277a8431eb17e4bdb24176f1b7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 15 Jun 2012 19:06:44 +0200 Subject: perf/x86/amd: Fix RDPMC index calculation for AMD family 15h The RDPMC index calculation is wrong for AMD family 15h (X86_FEATURE_PERFCTR_CORE set). This leads to a #GP when accessing the counter: Pid: 2237, comm: syslog-ng Not tainted 3.5.0-rc1-perf-x86_64-standard-g130ff90 #135 AMD Pike/Pike RIP: 0010:[] [] x86_perf_event_update+0x27/0x66 While the msr address offset is (index << 1) we must use index to select the correct rdpmc. Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 766c76d5ec4..d1f38c9509d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -823,7 +823,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); - hwc->event_base_rdpmc = x86_pmu_addr_offset(hwc->idx); + hwc->event_base_rdpmc = hwc->idx; } } -- cgit v1.2.3-70-g09d2 From 4b4969b14490a4f65b572b8f180164181104b5e1 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:30 +0800 Subject: perf: Export perf_assign_events() Export perf_assign_events() so the uncore code can use it to schedule events.
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++--- arch/x86/kernel/cpu/perf_event.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d1f38c9509d..6d32aefc9db 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -626,7 +626,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ - if (x86_pmu.num_counters_fixed) { + if (c->idxmsk64 & (~0ULL << X86_PMC_IDX_FIXED)) { idx = X86_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) @@ -693,8 +693,8 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. 
*/ -static int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) { struct perf_sched sched; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3df3de9452a..83238f2a12b 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -481,6 +481,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); -- cgit v1.2.3-70-g09d2 From 087bfbb032691262f2f7d52b910652450c5554b8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:34 +0800 Subject: perf/x86: Add generic Intel uncore PMU support This patch adds the generic Intel uncore PMU support, including helper functions that add/delete uncore events, a hrtimer that periodically polls the counters to avoid overflow and code that places all events for a particular socket onto a single cpu. The code design is based on the structure of Sandy Bridge-EP's uncore subsystem, which consists of a variety of components, each component contains one or more "boxes". (Tooling support follows in the next patches.) 
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339741902-8449-6-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 4 +- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 878 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_uncore.h | 204 ++++++ 3 files changed, 1085 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.c create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.h (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2fdfd..bac4c3804cc 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o endif obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 00000000000..fe76a07dfdb --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,878 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); + +static void uncore_assign_hw_event(struct 
intel_uncore_box *box, + struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_msr_fixed_ctr(box); + hwc->config_base = uncore_msr_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_msr_event_ctl(box, hwc->idx); + hwc->event_base = uncore_msr_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. 
+ */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +struct intel_uncore_box *uncore_alloc_box(int cpu) +{ + struct intel_uncore_box *box; + + box = kmalloc_node(sizeof(*box), GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + if (!box) + return NULL; + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + return box; +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical 
package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), + smp_processor_id()); +} + +static int uncore_collect_events(struct intel_uncore_box *box, + struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_event_constraint(struct intel_uncore_type *type, + struct perf_event *event) +{ + struct event_constraint *c; + + if (event->hw.config == ~0ULL) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static int uncore_assign_events(struct intel_uncore_box *box, + int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; + int i, ret, wmin, wmax; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_event_constraint(box->pmu->type, + box->event_list[i]); + constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + 
break; + + __set_bit(hwc->idx, used_mask); + assign[i] = hwc->idx; + } + if (i == n) + return 0; + + /* slow path */ + ret = perf_assign_events(constraints, n, wmin, wmax, assign); + return ret ? -EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & 
PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. 
+ */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int assign[UNCORE_PMC_IDX_MAX]; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(smp_processor_id()); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, assign, n); +out: + kfree(fake_box); + return ret; +} + +int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. 
+ */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + hwc->config = ~0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->attr_groups[1]); + 
type->attr_groups[1] = NULL; +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *events_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + events_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*events_group), GFP_KERNEL); + if (!events_group) + goto fail; + + attrs = (struct attribute **)(events_group + 1); + events_group->name = "events"; + events_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->attr_groups[1] = events_group; + } + + type->pmus = pmus; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static void __cpuinit uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + kfree(box); + } + } +} + +static int __cpuinit uncore_cpu_starting(int cpu) +{ + struct 
intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + kfree(box); + box = NULL; + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(cpu); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores, + int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, 
new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void __cpuinit uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); +} + +static void __cpuinit uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); +} + +static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb __cpuinitdata = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. 
+ */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, cpu; + + switch (boot_cpu_data.x86_model) { + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + register_cpu_notifier(&uncore_cpu_nb); + + put_online_cpus(); + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + ret = uncore_cpu_init(); + if (ret) + goto fail; + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 00000000000..49a6bfbba0d --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,204 @@ +#include +#include +#include +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_BOX_HASH_SIZE 8 + +#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xffff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED 
/*
 * Per-package state of one uncore PMU instance ("box").
 *
 * A box is referenced from the per-cpu pointer of its PMU; cpus in the
 * same physical package share one box and each take a reference on it.
 */
struct intel_uncore_box {
	/* physical package id; compared with topology_physical_package_id() */
	int phys_id;
	int n_active;	/* number of active events */
	/* number of valid entries in event_list[] */
	int n_events;
	int cpu;	/* cpu to collect events */
	/* bit flags; bit UNCORE_BOX_FLAG_INITIATED is set by uncore_box_init() */
	unsigned long flags;
	/* number of cpus pointing at this box; the box is freed at zero */
	atomic_t refcnt;
	/* started events, indexed by hardware counter index (hw.idx) */
	struct perf_event *events[UNCORE_PMC_IDX_MAX];
	/* events added to this box, in the order they were collected */
	struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
	/* one bit per counter index; set while that counter is started */
	unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
	/* per-counter tag, compared with hw.last_tag to detect reassignment */
	u64 tags[UNCORE_PMC_IDX_MAX];
	/* the PMU this box belongs to */
	struct intel_uncore_pmu *pmu;
	/*
	 * Timer started when the first event becomes active and cancelled
	 * when the last one stops.
	 * NOTE(review): presumably used to poll counters periodically;
	 * the handler is not visible here, so confirm.
	 */
	struct hrtimer hrtimer;
	/* list linkage for keeping boxes on a list */
	struct list_head list;
};
\ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline +unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->event_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->perf_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return 
box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} -- cgit v1.2.3-70-g09d2 From fcde10e916326545e8fec1807357c68ef08dc443 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:35 +0800 Subject: perf/x86: Add Intel Nehalem and Sandy Bridge uncore PMU support Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339741902-8449-7-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 195 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_uncore.h | 50 +++++++ 2 files changed, 245 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index fe76a07dfdb..3ed941ac374 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -10,6 +10,192 @@ static cpumask_t uncore_cpu_mask; static 
struct event_constraint constraint_fixed = EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 count; + rdmsrl(event->hw.event_base, count); + return count; +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 
0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, + NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), + /* full cache line writes to DRAM */ + INTEL_UNCORE_EVENT_DESC(QMC_WRITES_FULL_ANY, "event=0x2f,umask=0xf"), + /* Quickpath Memory Controller normal priority read requests */ + INTEL_UNCORE_EVENT_DESC(QMC_NORMAL_READS_ANY, 
"event=0x2c,umask=0xf"), + /* Quickpath Home Logic read requests from the IOH */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_READS, + "event=0x20,umask=0x1"), + /* Quickpath Home Logic write requests from the IOH */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_WRITES, + "event=0x20,umask=0x2"), + /* Quickpath Home Logic read requests from a remote socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_READS, + "event=0x20,umask=0x4"), + /* Quickpath Home Logic write requests from a remote socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_WRITES, + "event=0x20,umask=0x8"), + /* Quickpath Home Logic read requests from the local socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_READS, + "event=0x20,umask=0x10"), + /* Quickpath Home Logic write requests from the local socket */ + INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_WRITES, + "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) { @@ -808,6 +994,15 @@ static int __init uncore_cpu_init(void) int ret, cpu; switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem 
*/ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + msr_uncores = snb_msr_uncores; + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 49a6bfbba0d..eeb5ca5815a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -15,6 +15,56 @@ #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define 
NHM_UNC_UNCORE_PMC0 0x3b0 + + struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; -- cgit v1.2.3-70-g09d2 From 14371cce03c2fc393997e17f979e76674b7f392a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:36 +0800 Subject: perf: Add generic PCI uncore PMU device support This patch adds generic support for uncore PMUs presented as PCI devices. (These come in addition to the CPU/MSR based uncores.) Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-8-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 175 +++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 66 ++++++++++ 2 files changed, 236 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 3ed941ac374..e20c65a0e10 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2,6 +2,11 @@ static struct intel_uncore_type *empty_uncore[] = { NULL, }; static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 
255] = -1, }; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -205,13 +210,13 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box, hwc->last_tag = ++box->tags[idx]; if (hwc->idx == UNCORE_PMC_IDX_FIXED) { - hwc->event_base = uncore_msr_fixed_ctr(box); - hwc->config_base = uncore_msr_fixed_ctl(box); + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); return; } - hwc->config_base = uncore_msr_event_ctl(box, hwc->idx); - hwc->event_base = uncore_msr_perf_ctr(box, hwc->idx); + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); } static void uncore_perf_event_update(struct intel_uncore_box *box, @@ -305,6 +310,22 @@ struct intel_uncore_box *uncore_alloc_box(int cpu) static struct intel_uncore_box * uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { + static struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + return *per_cpu_ptr(pmu->box, cpu); } @@ -706,6 +727,13 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->attr_groups[1] = NULL; } +static void uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + static int __init uncore_type_init(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmus; @@ -725,6 +753,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) pmus[i].func_id = -1; pmus[i].pmu_idx = i; pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); pmus[i].box = alloc_percpu(struct intel_uncore_box *); if (!pmus[i].box) goto fail; @@ -773,6 
+802,127 @@ fail: return ret; } +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int __devinit uncore_pci_add(struct intel_uncore_type *type, + struct pci_dev *pdev) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + box = uncore_alloc_box(0); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + for (i = 0; i < type->num_boxes; i++) { + pmu = &type->pmus[i]; + if (pmu->func_id == pdev->devfn) + break; + if (pmu->func_id < 0) { + pmu->func_id = pdev->devfn; + break; + } + pmu = NULL; + } + + if (!pmu) { + kfree(box); + return -EINVAL; + } + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + return 0; +} + +static void __devexit uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu = box->pmu; + int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __devinit uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + + type = (struct intel_uncore_type *)id->driver_data; + return uncore_pci_add(type, pdev); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch 
(boot_cpu_data.x86_model) { + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + static void __cpuinit uncore_cpu_dying(int cpu) { struct intel_uncore_type *type; @@ -921,6 +1071,7 @@ static void __cpuinit uncore_event_exit_cpu(int cpu) cpumask_set_cpu(target, &uncore_cpu_mask); uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); } static void __cpuinit uncore_event_init_cpu(int cpu) @@ -936,6 +1087,7 @@ static void __cpuinit uncore_event_init_cpu(int cpu) cpumask_set_cpu(cpu, &uncore_cpu_mask); uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); } static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, @@ -1051,6 +1203,14 @@ static int __init uncore_pmus_register(void) } } + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + return 0; } @@ -1061,9 +1221,14 @@ static int __init intel_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; - ret = uncore_cpu_init(); + ret = uncore_pci_init(); if (ret) goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } uncore_pmus_register(); return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index eeb5ca5815a..aa01df87b8d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -1,5 +1,6 @@ #include #include +#include #include #include "perf_event.h" @@ -110,6 +111,7 @@ struct intel_uncore_pmu { int func_id; struct intel_uncore_type *type; struct intel_uncore_box ** __percpu box; + struct list_head box_list; }; struct intel_uncore_box { @@ -123,6 +125,7 @@ struct intel_uncore_box { struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; struct intel_uncore_pmu *pmu; struct hrtimer hrtimer; struct list_head list; @@ -161,6 +164,33 @@ static ssize_t uncore_event_show(struct kobject *kobj, return sprintf(buf, "%s", event->config); } +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) { @@ -200,6 +230,42 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) box->pmu->type->msr_offset * box->pmu->pmu_idx; } +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, 
int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) { return box->pmu->type->perf_ctr_bits; -- cgit v1.2.3-70-g09d2 From 7c94ee2e0917b2ea56498bff939c8aa55da27207 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:37 +0800 Subject: perf/x86: Add Intel Nehalem and Sandy Bridge-EP uncore support The uncore subsystem in Sandy Bridge-EP consists of 8 components: Ubox, Caching Agent, Home Agent, Memory controller, Power Control, QPI Link Layer, R2PCIe, R3QPI. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339741902-8449-9-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 484 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_uncore.h | 86 +++++ include/linux/pci_ids.h | 11 + 3 files changed, 581 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index e20c65a0e10..d34f68bf990 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -21,6 +21,482 @@ DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, 
"config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); + +/* Sandy Bridge-EP uncore support */ +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | + SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, + SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); 
+ config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 count; + + rdmsrl(hwc->event_base, count); + return count; +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), + /* read */ + INTEL_UNCORE_EVENT_DESC(CAS_COUNT_RD, 
"event=0x4,umask=0x3"), + /* write */ + INTEL_UNCORE_EVENT_DESC(CAS_COUNT_WR, "event=0x4,umask=0xc"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "event=0x14"), + /* outgoing data+nondata flits */ + INTEL_UNCORE_EVENT_DESC(TxL_FLITS_ACTIVE, "event=0x0,umask=0x6"), + /* DRS data received */ + INTEL_UNCORE_EVENT_DESC(DRS_DATA, "event=0x2,umask=0x8"), + /* NCB data received */ + INTEL_UNCORE_EVENT_DESC(NCB_DATA, "event=0x3,umask=0x4"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snbep_uncore_msr_enable_event, + .read_counter = snbep_uncore_msr_read_counter, +}; + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + 
UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1f, 0xe), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct 
intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + 
.num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_descs = snbep_uncore_qpi_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + &snbep_uncore_ha, + &snbep_uncore_imc, + &snbep_uncore_qpi, + &snbep_uncore_r2pcie, + &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = (unsigned long)&snbep_uncore_ha, + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = (unsigned 
long)&snbep_uncore_qpi, + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = (unsigned long)&snbep_uncore_r2pcie, + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static void snbep_pci2phy_map_init(void) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + u32 config; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX, + ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + pci_read_config_dword(ubox_dev, 0x40, &config); + nodeid = config; + /* get the Node ID mapping */ + pci_read_config_dword(ubox_dev, 0x54, &config); + /* + * every three bits in the Node ID mapping register maps + * to a particular node. 
+ */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + }; + return; +} +/* end of Sandy Bridge-EP uncore support */ + /* Sandy Bridge uncore support */ static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, @@ -894,6 +1370,11 @@ static int __init uncore_pci_init(void) int ret; switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + snbep_pci2phy_map_init(); + break; default: return 0; } @@ -1155,6 +1636,9 @@ static int __init uncore_cpu_init(void) case 42: /* Sandy Bridge */ msr_uncores = snb_msr_uncores; break; + case 45: /* Sandy Bridge-EP */ + msr_uncores = snbep_msr_uncores; + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index aa01df87b8d..4d52db0d1df 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -65,6 +65,92 @@ #define NHM_UNC_PERFEVTSEL0 0x3c0 #define NHM_UNC_UNCORE_PMC0 0x3b0 +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */ +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + 
SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define 
SNBEP_CBO_MSR_OFFSET 0x20 + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd struct intel_uncore_ops; struct intel_uncore_pmu; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ab741b0d007..5f187026b81 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2755,6 +2755,17 @@ #define PCI_DEVICE_ID_INTEL_IOAT_SNB7 0x3c27 #define PCI_DEVICE_ID_INTEL_IOAT_SNB8 0x3c2e #define PCI_DEVICE_ID_INTEL_IOAT_SNB9 0x3c2f +#define PCI_DEVICE_ID_INTEL_UNC_HA 0x3c46 +#define PCI_DEVICE_ID_INTEL_UNC_IMC0 0x3cb0 +#define PCI_DEVICE_ID_INTEL_UNC_IMC1 0x3cb1 +#define PCI_DEVICE_ID_INTEL_UNC_IMC2 0x3cb4 +#define PCI_DEVICE_ID_INTEL_UNC_IMC3 0x3cb5 +#define PCI_DEVICE_ID_INTEL_UNC_QPI0 0x3c41 +#define PCI_DEVICE_ID_INTEL_UNC_QPI1 0x3c42 +#define PCI_DEVICE_ID_INTEL_UNC_R2PCIE 0x3c43 +#define PCI_DEVICE_ID_INTEL_UNC_R3QPI0 0x3c44 +#define PCI_DEVICE_ID_INTEL_UNC_R3QPI1 0x3c45 +#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0 #define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f #define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 -- cgit v1.2.3-70-g09d2 From a1e4ccb990447df0fe83d164d9a7bc2e6c4b7db7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 15 Jun 2012 15:07:13 -0400 Subject: KVM: Introduce __KVM_HAVE_IRQ_LINE This is a preparatory patch for the KVM/ARM implementation. KVM/ARM will use the KVM_IRQ_LINE ioctl, which is currently conditional on __KVM_HAVE_IOAPIC, but ARM obviously doesn't have any IOAPIC support and we need a separate define. 
Signed-off-by: Christoffer Dall Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 1 + arch/x86/include/asm/kvm.h | 1 + include/trace/events/kvm.h | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index b9f82c84f09..ec6c6b30123 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -26,6 +26,7 @@ /* Select x86 specific features in */ #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_DEVICE_ASSIGNMENT /* Architectural interrupt line count. */ diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index e7d1c194d27..246617efd67 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -12,6 +12,7 @@ /* Select x86 specific features in */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 3df5925fe64..7ef9e759f49 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -37,7 +37,7 @@ TRACE_EVENT(kvm_userspace_exit, __entry->errno < 0 ? 
-__entry->errno : __entry->reason) ); -#if defined(__KVM_HAVE_IOAPIC) +#if defined(__KVM_HAVE_IRQ_LINE) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), TP_ARGS(gsi, level, irq_source_id), @@ -57,7 +57,9 @@ TRACE_EVENT(kvm_set_irq, TP_printk("gsi %u level %d source %d", __entry->gsi, __entry->level, __entry->irq_source_id) ); +#endif +#if defined(__KVM_HAVE_IOAPIC) #define kvm_deliver_mode \ {0x0, "Fixed"}, \ {0x1, "LowPrio"}, \ -- cgit v1.2.3-70-g09d2 From 2992c542fcd40777ed253f57362c65711fb8acaf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 16 Jun 2012 14:45:49 +0200 Subject: perf/x86: Lowercase uncore PMU event names Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ucnds8gkve4x3s4biuukyph3@git.kernel.org [ Trivial build fix ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 51 +++++++++------------------ 1 file changed, 16 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index d34f68bf990..28a8413ca19 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -179,22 +179,17 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { }; static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), - /* read */ - INTEL_UNCORE_EVENT_DESC(CAS_COUNT_RD, "event=0x4,umask=0x3"), - /* write */ - INTEL_UNCORE_EVENT_DESC(CAS_COUNT_WR, "event=0x4,umask=0xc"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), { /* end: all zeroes */ }, }; static struct uncore_event_desc snbep_uncore_qpi_events[] = { - INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "event=0x14"), - /* outgoing data+nondata flits */ - 
INTEL_UNCORE_EVENT_DESC(TxL_FLITS_ACTIVE, "event=0x0,umask=0x6"), - /* DRS data received */ - INTEL_UNCORE_EVENT_DESC(DRS_DATA, "event=0x2,umask=0x8"), - /* NCB data received */ - INTEL_UNCORE_EVENT_DESC(NCB_DATA, "event=0x3,umask=0x4"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), { /* end: all zeroes */ }, }; @@ -621,29 +616,15 @@ static struct attribute_group nhm_uncore_format_group = { }; static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(CLOCKTICKS, "config=0xffff"), - /* full cache line writes to DRAM */ - INTEL_UNCORE_EVENT_DESC(QMC_WRITES_FULL_ANY, "event=0x2f,umask=0xf"), - /* Quickpath Memory Controller normal priority read requests */ - INTEL_UNCORE_EVENT_DESC(QMC_NORMAL_READS_ANY, "event=0x2c,umask=0xf"), - /* Quickpath Home Logic read requests from the IOH */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_READS, - "event=0x20,umask=0x1"), - /* Quickpath Home Logic write requests from the IOH */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_IOH_WRITES, - "event=0x20,umask=0x2"), - /* Quickpath Home Logic read requests from a remote socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_READS, - "event=0x20,umask=0x4"), - /* Quickpath Home Logic write requests from a remote socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_REMOTE_WRITES, - "event=0x20,umask=0x8"), - /* Quickpath Home Logic read requests from the local socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_READS, - "event=0x20,umask=0x10"), - /* Quickpath Home Logic write requests from the local socket */ - INTEL_UNCORE_EVENT_DESC(QHL_REQUEST_LOCAL_WRITES, - "event=0x20,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, 
"event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), { /* end: all zeroes */ }, }; -- cgit v1.2.3-70-g09d2 From 9e40b67bf2bfaa40b28354c501a72fd001a1397a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 19 Jun 2012 22:04:56 +0900 Subject: KVM: Use kvm_kvfree() to free memory allocated by kvm_kvzalloc() The following commit did not care about the error handling path: commit c1a7b32a14138f908df52d7c53b5ce3415ec6b50 KVM: Avoid wasting pages for small lpage_info arrays If memory allocation fails, vfree() will be called with the address returned by kzalloc(). This patch fixes this issue. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a01a4241bc6..6ed5983f78f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6350,7 +6350,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) out_free: for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->arch.lpage_info[i]); + kvm_kvfree(slot->arch.lpage_info[i]); slot->arch.lpage_info[i] = NULL; } return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 0718467c859f5571dc48d294596f841096f6a47a Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Mon, 18 Jun 2012 15:56:33 -0400 Subject: x86/nmi: Clean up register_nmi_handler() usage Implement a cleaner and easier to maintain version for the section warning fixes implemented in commit eeaaa96a3a21 ("x86/nmi: Fix section mismatch warnings on 32-bit"). 
Signed-off-by: Li Zhong Signed-off-by: Don Zickus Cc: Jan Beulich Link: http://lkml.kernel.org/r/1340049393-17771-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 20 +++----------------- arch/x86/kernel/nmi_selftest.c | 7 ++++--- 2 files changed, 7 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index dc580c42851..c0fa356e90d 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -44,28 +44,14 @@ struct nmiaction { const char *name; }; -#define register_nmi_handler(t, fn, fg, n) \ +#define register_nmi_handler(t, fn, fg, n, init...) \ ({ \ - static struct nmiaction fn##_na = { \ + static struct nmiaction init fn##_na = { \ .handler = (fn), \ .name = (n), \ .flags = (fg), \ }; \ - __register_nmi_handler((t), &fn##_na); \ -}) - -/* - * For special handlers that register/unregister in the - * init section only. This should be considered rare. - */ -#define register_nmi_handler_initonly(t, fn, fg, n) \ -({ \ - static struct nmiaction fn##_na __initdata = { \ - .handler = (fn), \ - .name = (n), \ - .flags = (fg), \ - }; \ - __register_nmi_handler((t), &fn##_na); \ + __register_nmi_handler((t), &fn##_na); \ }) int __register_nmi_handler(unsigned int, struct nmiaction *); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 149b8d9c6ad..6d9582ec032 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); } static void __init cleanup_nmi_testsuite(void) @@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned 
long timeout; - if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, - NMI_FLAG_FIRST, "nmi_selftest")) { + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { nmi_fail = FAILURE; return; } -- cgit v1.2.3-70-g09d2 From e1b6fc55da40bc17e20795901cb786e3619f9be9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 18 Jun 2012 11:28:45 +0100 Subject: x86/microcode: Mark microcode_id[] as __initconst It's not being used for other than creating module aliases (i.e. no loadable section has any reference to it). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4FDF1EFD020000780008A65D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc691718..c383b3f8f39 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -504,7 +504,7 @@ static struct notifier_block __refdata mc_cpu_notifier = { #ifdef MODULE /* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id microcode_id[] = { +static const struct x86_cpu_id __initconst microcode_id[] = { #ifdef CONFIG_MICROCODE_INTEL { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, #endif -- cgit v1.2.3-70-g09d2 From 0d26d1d873a302828e064737746c53a2689e6c0f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 18 Jun 2012 11:30:20 +0100 Subject: x86/mm: Mark free_initrd_mem() as __init ... matching various other architectures. 
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4FDF1F5C020000780008A661@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bc4e9d84157..e0e6990723e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -385,7 +385,7 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, -- cgit v1.2.3-70-g09d2 From 0fa0e2f02e8edfbdb5f86d1cab0fa6dc0517489f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 18 Jun 2012 11:40:04 +0100 Subject: x86: Move call to print_modules() out of show_regs() Printing the list of loaded modules is really unrelated to what this function is about, and is particularly unnecessary in the context of the SysRQ key handling (gets printed so far over and over). It should really be the caller of the function to decide whether this piece of information is useful (and to avoid redundantly printing it). 
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4FDF21A4020000780008A67F@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/dumpstack_32.c | 1 - arch/x86/kernel/dumpstack_64.c | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 87d3b5d663c..ae42418bc50 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; + print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 3a8aced11ae..1038a417ea5 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -86,7 +86,6 @@ void show_regs(struct pt_regs *regs) { int i; - print_modules(); __show_regs(regs, !user_mode_vm(regs)); pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index c582e9c5bd1..b653675d528 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -254,7 +254,6 @@ void show_regs(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); - print_modules(); __show_regs(regs, 1); printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); -- cgit v1.2.3-70-g09d2 From e5a7286b5f1b0b5beb97275f2152bf10c4aa4204 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 21 Jun 2012 00:16:29 +0200 Subject: x86, boot: Remove ancient, unconditionally #ifdef'd out dead code Release v1.3.82 wrapped a few lines of code in an "#ifdef SAFE_RESET_DISK_CONTROLLER" and "#endif" pair. Since SAFE_RESET_DISK_CONTROLLER was never defined anywhere that was basically a verbose "#ifdef 0" and "#endif" pair. 
These dead lines have been in the tree for sixteen years but now the time has come to remove them. I guess the main lesson here is that if you want your dead code in the tree for a very long time you'd better be creative. A plain old "#ifdef 0" and "#endif" pair just doesn't cut it! See: http://lkml.kernel.org/r/199603301718.LAA00178@craie.inetnebr.com Signed-off-by: Paul Bolle Link: http://lkml.kernel.org/r/1340230589.1773.7.camel@x61.thuisdomein Acked-by: Jeff Epler Acked-by: Jesper Juhl Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8bbea6aa40d..fde5bde3b60 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -386,13 +386,6 @@ init_size: .long INIT_SIZE # kernel initialization size .section ".entrytext", "ax" start_of_setup: -#ifdef SAFE_RESET_DISK_CONTROLLER -# Reset the disk controller. - movw $0x0000, %ax # Reset disk controller - movb $0x80, %dl # All disks - int $0x13 -#endif - # Force %es = %ds movw %ds, %ax movw %ax, %es -- cgit v1.2.3-70-g09d2 From 2b1b712f050eaf0ac576591281446dc960c0afc5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 20 Jun 2012 21:18:14 -0700 Subject: x86, reboot: Drop redundant write of reboot_mode We write reboot_mode to BIOS location 0x472 in native_machine_emergency_restart() (reboot.c:542) already, there is no need to then write it again in machine_real_restart(). This means nothing gets written there for MRR_APM, but the APM call is a poweroff call and doesn't use this memory location. Link: http://lkml.kernel.org/n/tip-3i0pfh44c1e3jv5lab0cf7sc@git.kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/reboot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 6ddb9cd0ced..6ef559f09ac 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -179,14 +179,6 @@ void __noreturn machine_real_restart(unsigned int type) write_cr3(real_mode_header->trampoline_pgd); #endif - /* - * Write 0x1234 to absolute memory location 0x472. The BIOS reads - * this on booting to tell it to "Bypass memory test (also warm - * boot)". This seems like a fairly standard thing that gets set by - * REBOOT.COM programs, and the previous reset routine did this - * too. */ - *((unsigned short *)0x472) = reboot_mode; - /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 asm volatile("jmpl *%0" : : -- cgit v1.2.3-70-g09d2 From 9751d7627582fc1cc64625d63bde9528c14f1544 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 21 Jun 2012 10:25:03 -0700 Subject: x86-64, reboot: Be more paranoid in 64-bit reboot=bios Be a bit more paranoid in the transition back to 16-bit mode. In particular, in case the kernel is residing above the 4 GiB mark, switch to the trampoline GDT, and make the jump after turning off paging a far jump. In theory, none of this should matter, but it is exactly the kind of things that broken SMM or virtualization software could trip up on. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/tip-jopx7y6g6dbcx4tpal8q0jlr@git.kernel.org --- arch/x86/realmode/rm/reboot.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index 6bf8feac555..f932ea61d1c 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -22,14 +22,18 @@ ENTRY(machine_real_restart_asm) #ifdef CONFIG_X86_64 + /* Switch to trampoline GDT as it is guaranteed < 4 GiB */ + movl $__KERNEL_DS, %eax + movl %eax, %ds + lgdtl pa_tr_gdt /* Disable paging to drop us out of long mode */ movl %cr0, %eax andl $~X86_CR0_PG, %eax movl %eax, %cr0 - jmp 1f /* "A branch" may be needed here, assume near is OK */ + ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off -1: +GLOBAL(machine_real_restart_paging_off) xorl %eax, %eax xorl %edx, %edx movl $MSR_EFER, %ecx -- cgit v1.2.3-70-g09d2 From 2a76c450bd0377f715caf313ded530290d7dc7d7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:10 +0800 Subject: x86/PCI: split out pci_mmcfg_check_reserved() for code reuse Split out pci_mmcfg_check_reserved() for code reuse, which will be used when supporting PCI host bridge hotplug. 
Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mmconfig-shared.c | 51 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 301e325992f..f799949a08a 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -474,39 +474,38 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, return valid; } +static int __devinit pci_mmcfg_check_reserved(struct pci_mmcfg_region *cfg, + int early) +{ + if (!early && !acpi_disabled) { + if (is_mmconf_reserved(is_acpi_reserved, cfg, 0)) + return 1; + else + printk(KERN_ERR FW_BUG PREFIX + "MMCONFIG at %pR not reserved in " + "ACPI motherboard resources\n", + &cfg->res); + } + + /* Don't try to do this check unless configuration + type 1 is available. how about type 2 ?*/ + if (raw_pci_ops) + return is_mmconf_reserved(e820_all_mapped, cfg, 1); + + return 0; +} + static void __init pci_mmcfg_reject_broken(int early) { struct pci_mmcfg_region *cfg; list_for_each_entry(cfg, &pci_mmcfg_list, list) { - int valid = 0; - - if (!early && !acpi_disabled) { - valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); - - if (valid) - continue; - else - printk(KERN_ERR FW_BUG PREFIX - "MMCONFIG at %pR not reserved in " - "ACPI motherboard resources\n", - &cfg->res); + if (pci_mmcfg_check_reserved(cfg, early) == 0) { + printk(KERN_INFO PREFIX "not using MMCONFIG\n"); + free_all_mmcfg(); + return; } - - /* Don't try to do this check unless configuration - type 1 is available. 
how about type 2 ?*/ - if (raw_pci_ops) - valid = is_mmconf_reserved(e820_all_mapped, cfg, 1); - - if (!valid) - goto reject; } - - return; - -reject: - printk(KERN_INFO PREFIX "not using MMCONFIG\n"); - free_all_mmcfg(); } static int __initdata known_bridge; -- cgit v1.2.3-70-g09d2 From 846e402300ffa2131239dcf82265b5366cd755f4 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:11 +0800 Subject: x86/PCI: split out pci_mmconfig_alloc() for code reuse Split out pci_mmconfig_alloc() for code reuse, which will be used when supporting PCI root bridge hotplug. Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mmconfig-shared.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index f799949a08a..5e2cd2aa288 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -61,8 +61,9 @@ static __init void list_add_sorted(struct pci_mmcfg_region *new) list_add_tail(&new->list, &pci_mmcfg_list); } -static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, - int end, u64 addr) +static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, + int start, + int end, u64 addr) { struct pci_mmcfg_region *new; struct resource *res; @@ -79,8 +80,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, new->start_bus = start; new->end_bus = end; - list_add_sorted(new); - res = &new->res; res->start = addr + PCI_MMCFG_BUS_OFFSET(start); res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; @@ -96,6 +95,18 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, return new; } +static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, + int end, u64 addr) +{ + struct pci_mmcfg_region *new; + + new = pci_mmconfig_alloc(segment, start, end, addr); + if (new) + 
list_add_sorted(new); + + return new; +} + struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { struct pci_mmcfg_region *cfg; -- cgit v1.2.3-70-g09d2 From 376f70acfe4bd97493299cdfc00a8d235279d267 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:12 +0800 Subject: x86/PCI: use RCU list to protect mmconfig list Use RCU list to protect mmconfig list from dynamic change when supporting PCI host bridge hotplug. Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mmconfig-shared.c | 18 ++++++++++++------ arch/x86/pci/mmconfig_32.c | 13 +++++++++++-- arch/x86/pci/mmconfig_64.c | 13 +++++++++++-- 3 files changed, 34 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 5e2cd2aa288..0ac97d54bca 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -25,6 +27,7 @@ /* Indicate if the mmcfg resources have been placed into the resource table. 
*/ static int __initdata pci_mmcfg_resources_inserted; +static DEFINE_MUTEX(pci_mmcfg_lock); LIST_HEAD(pci_mmcfg_list); @@ -45,20 +48,20 @@ static __init void free_all_mmcfg(void) pci_mmconfig_remove(cfg); } -static __init void list_add_sorted(struct pci_mmcfg_region *new) +static __devinit void list_add_sorted(struct pci_mmcfg_region *new) { struct pci_mmcfg_region *cfg; /* keep list sorted by segment and starting bus number */ - list_for_each_entry(cfg, &pci_mmcfg_list, list) { + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) { if (cfg->segment > new->segment || (cfg->segment == new->segment && cfg->start_bus >= new->start_bus)) { - list_add_tail(&new->list, &cfg->list); + list_add_tail_rcu(&new->list, &cfg->list); return; } } - list_add_tail(&new->list, &pci_mmcfg_list); + list_add_tail_rcu(&new->list, &pci_mmcfg_list); } static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, @@ -101,8 +104,11 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, struct pci_mmcfg_region *new; new = pci_mmconfig_alloc(segment, start, end, addr); - if (new) + if (new) { + mutex_lock(&pci_mmcfg_lock); list_add_sorted(new); + mutex_unlock(&pci_mmcfg_lock); + } return new; } @@ -111,7 +117,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { struct pci_mmcfg_region *cfg; - list_for_each_entry(cfg, &pci_mmcfg_list, list) + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) if (cfg->segment == segment && cfg->start_bus <= bus && bus <= cfg->end_bus) return cfg; diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 5372e86834c..5dad04aa6b3 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -60,9 +61,12 @@ err: *value = -1; return -EINVAL; } + rcu_read_lock(); base = get_base_addr(seg, bus, devfn); - if (!base) + if (!base) { + rcu_read_unlock(); goto err; + } 
raw_spin_lock_irqsave(&pci_config_lock, flags); @@ -80,6 +84,7 @@ err: *value = -1; break; } raw_spin_unlock_irqrestore(&pci_config_lock, flags); + rcu_read_unlock(); return 0; } @@ -93,9 +98,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, if ((bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; + rcu_read_lock(); base = get_base_addr(seg, bus, devfn); - if (!base) + if (!base) { + rcu_read_unlock(); return -EINVAL; + } raw_spin_lock_irqsave(&pci_config_lock, flags); @@ -113,6 +121,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, break; } raw_spin_unlock_irqrestore(&pci_config_lock, flags); + rcu_read_unlock(); return 0; } diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 915a493502c..acc48c5b686 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -34,9 +35,12 @@ err: *value = -1; return -EINVAL; } + rcu_read_lock(); addr = pci_dev_base(seg, bus, devfn); - if (!addr) + if (!addr) { + rcu_read_unlock(); goto err; + } switch (len) { case 1: @@ -49,6 +53,7 @@ err: *value = -1; *value = mmio_config_readl(addr + reg); break; } + rcu_read_unlock(); return 0; } @@ -62,9 +67,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) return -EINVAL; + rcu_read_lock(); addr = pci_dev_base(seg, bus, devfn); - if (!addr) + if (!addr) { + rcu_read_unlock(); return -EINVAL; + } switch (len) { case 1: @@ -77,6 +85,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, mmio_config_writel(addr + reg, value); break; } + rcu_read_unlock(); return 0; } -- cgit v1.2.3-70-g09d2 From 9cf0105da5a315677d8f91043fb87fdade0d8b39 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:13 +0800 Subject: x86/PCI: introduce pci_mmcfg_arch_map()/pci_mmcfg_arch_unmap() Introduce pci_mmcfg_arch_map()/pci_mmcfg_arch_unmap(), which will be 
used when supporting PCI root bridge hotplug. Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/pci_x86.h | 2 ++ arch/x86/pci/mmconfig_32.c | 15 +++++++++++++++ arch/x86/pci/mmconfig_64.c | 38 ++++++++++++++++++++++++++------------ 3 files changed, 43 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b3a53174602..df898ceab4d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -135,6 +135,8 @@ struct pci_mmcfg_region { extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); +extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg); +extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg); extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); extern struct list_head pci_mmcfg_list; diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 5dad04aa6b3..a22785deb50 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -141,3 +141,18 @@ int __init pci_mmcfg_arch_init(void) void __init pci_mmcfg_arch_free(void) { } + +int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) +{ + return 0; +} + +void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg) +{ + unsigned long flags; + + /* Invalidate the cached mmcfg map entry. 
*/ + raw_spin_lock_irqsave(&pci_config_lock, flags); + mmcfg_last_accessed_device = 0; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index acc48c5b686..ebefea5107a 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -95,7 +95,7 @@ static const struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) +static void __iomem * __devinit mcfg_ioremap(struct pci_mmcfg_region *cfg) { void __iomem *addr; u64 start, size; @@ -114,16 +114,14 @@ int __init pci_mmcfg_arch_init(void) { struct pci_mmcfg_region *cfg; - list_for_each_entry(cfg, &pci_mmcfg_list, list) { - cfg->virt = mcfg_ioremap(cfg); - if (!cfg->virt) { - printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n", - &cfg->res); + list_for_each_entry(cfg, &pci_mmcfg_list, list) + if (pci_mmcfg_arch_map(cfg)) { pci_mmcfg_arch_free(); return 0; } - } + raw_pci_ext_ops = &pci_mmcfg; + return 1; } @@ -131,10 +129,26 @@ void __init pci_mmcfg_arch_free(void) { struct pci_mmcfg_region *cfg; - list_for_each_entry(cfg, &pci_mmcfg_list, list) { - if (cfg->virt) { - iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); - cfg->virt = NULL; - } + list_for_each_entry(cfg, &pci_mmcfg_list, list) + pci_mmcfg_arch_unmap(cfg); +} + +int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) +{ + cfg->virt = mcfg_ioremap(cfg); + if (!cfg->virt) { + printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n", + &cfg->res); + return -ENOMEM; + } + + return 0; +} + +void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg) +{ + if (cfg && cfg->virt) { + iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); + cfg->virt = NULL; } } -- cgit v1.2.3-70-g09d2 From 95c5e92f4f691bbaba40bbf3decfc8e13b6ea897 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:14 +0800 Subject: x86/PCI: prepare pci_mmcfg_check_reserved() to be called at runtime 
Prepare function pci_mmcfg_check_reserved() to be called at runtime for PCI host bridge hot-plugging Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mmconfig-shared.c | 104 +++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 0ac97d54bca..15a7abf5139 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -27,6 +27,7 @@ /* Indicate if the mmcfg resources have been placed into the resource table. */ static int __initdata pci_mmcfg_resources_inserted; +static bool pci_mmcfg_running_state; static DEFINE_MUTEX(pci_mmcfg_lock); LIST_HEAD(pci_mmcfg_list); @@ -375,14 +376,15 @@ static void __init pci_mmcfg_insert_resources(void) struct pci_mmcfg_region *cfg; list_for_each_entry(cfg, &pci_mmcfg_list, list) - insert_resource(&iomem_resource, &cfg->res); + if (!cfg->res.parent) + insert_resource(&iomem_resource, &cfg->res); /* Mark that the resources have been inserted. 
*/ pci_mmcfg_resources_inserted = 1; } -static acpi_status __init check_mcfg_resource(struct acpi_resource *res, - void *data) +static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res, + void *data) { struct resource *mcfg_res = data; struct acpi_resource_address64 address; @@ -418,8 +420,8 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res, return AE_OK; } -static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, - void *context, void **rv) +static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl, + void *context, void **rv) { struct resource *mcfg_res = context; @@ -432,7 +434,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, return AE_OK; } -static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) +static int __devinit is_acpi_reserved(u64 start, u64 end, unsigned not_used) { struct resource mcfg_res; @@ -451,13 +453,15 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); -static int __init is_mmconf_reserved(check_reserved_t is_reserved, - struct pci_mmcfg_region *cfg, int with_e820) +static int __ref is_mmconf_reserved(check_reserved_t is_reserved, + struct pci_mmcfg_region *cfg, + struct device *dev, int with_e820) { u64 addr = cfg->res.start; u64 size = resource_size(&cfg->res); u64 old_size = size; - int valid = 0, num_buses; + int num_buses; + char *method = with_e820 ? "E820" : "ACPI motherboard resources"; while (!is_reserved(addr, addr + size, E820_RESERVED)) { size >>= 1; @@ -465,49 +469,75 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, break; } - if (size >= (16UL<<20) || size == old_size) { - printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", - &cfg->res, - with_e820 ? 
"E820" : "ACPI motherboard resources"); - valid = 1; - - if (old_size != size) { - /* update end_bus */ - cfg->end_bus = cfg->start_bus + ((size>>20) - 1); - num_buses = cfg->end_bus - cfg->start_bus + 1; - cfg->res.end = cfg->res.start + - PCI_MMCFG_BUS_OFFSET(num_buses) - 1; - snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, - "PCI MMCONFIG %04x [bus %02x-%02x]", - cfg->segment, cfg->start_bus, cfg->end_bus); + if (size < (16UL<<20) && size != old_size) + return 0; + + if (dev) + dev_info(dev, "MMCONFIG at %pR reserved in %s\n", + &cfg->res, method); + else + printk(KERN_INFO PREFIX + "MMCONFIG at %pR reserved in %s\n", + &cfg->res, method); + + if (old_size != size) { + /* update end_bus */ + cfg->end_bus = cfg->start_bus + ((size>>20) - 1); + num_buses = cfg->end_bus - cfg->start_bus + 1; + cfg->res.end = cfg->res.start + + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %04x [bus %02x-%02x]", + cfg->segment, cfg->start_bus, cfg->end_bus); + + if (dev) + dev_info(dev, + "MMCONFIG " + "at %pR (base %#lx) (size reduced!)\n", + &cfg->res, (unsigned long) cfg->address); + else printk(KERN_INFO PREFIX - "MMCONFIG for %04x [bus%02x-%02x] " - "at %pR (base %#lx) (size reduced!)\n", - cfg->segment, cfg->start_bus, cfg->end_bus, - &cfg->res, (unsigned long) cfg->address); - } + "MMCONFIG for %04x [bus%02x-%02x] " + "at %pR (base %#lx) (size reduced!)\n", + cfg->segment, cfg->start_bus, cfg->end_bus, + &cfg->res, (unsigned long) cfg->address); } - return valid; + return 1; } -static int __devinit pci_mmcfg_check_reserved(struct pci_mmcfg_region *cfg, - int early) +static int __ref pci_mmcfg_check_reserved(struct device *dev, + struct pci_mmcfg_region *cfg, int early) { if (!early && !acpi_disabled) { - if (is_mmconf_reserved(is_acpi_reserved, cfg, 0)) + if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0)) return 1; + + if (dev) + dev_info(dev, FW_INFO + "MMCONFIG at %pR not reserved in " + "ACPI motherboard 
resources\n", + &cfg->res); else - printk(KERN_ERR FW_BUG PREFIX + printk(KERN_INFO FW_INFO PREFIX "MMCONFIG at %pR not reserved in " "ACPI motherboard resources\n", &cfg->res); } + /* + * e820_all_mapped() is marked as __init. + * All entries from ACPI MCFG table have been checked at boot time. + * For MCFG information constructed from hotpluggable host bridge's + * _CBA method, just assume it's reserved. + */ + if (pci_mmcfg_running_state) + return 1; + /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ if (raw_pci_ops) - return is_mmconf_reserved(e820_all_mapped, cfg, 1); + return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1); return 0; } @@ -517,7 +547,7 @@ static void __init pci_mmcfg_reject_broken(int early) struct pci_mmcfg_region *cfg; list_for_each_entry(cfg, &pci_mmcfg_list, list) { - if (pci_mmcfg_check_reserved(cfg, early) == 0) { + if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) { printk(KERN_INFO PREFIX "not using MMCONFIG\n"); free_all_mmcfg(); return; @@ -656,6 +686,8 @@ void __init pci_mmcfg_late_init(void) static int __init pci_mmcfg_late_insert_resources(void) { + pci_mmcfg_running_state = true; + /* * If resources are already inserted or we are not using MMCONFIG, * don't insert the resources. -- cgit v1.2.3-70-g09d2 From 9c95111b330d2ddf851444528a7608f267cbb50c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:15 +0800 Subject: x86/PCI: add pci_mmconfig_insert()/delete() for PCI root bridge hotplug Introduce pci_mmconfig_insert()/pci_mmconfig_delete(), which will be used to update MMCONFIG information when supporting PCI root bridge hotplug. 
Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/pci_x86.h | 4 ++ arch/x86/pci/mmconfig-shared.c | 109 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 109 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index df898ceab4d..af5018f3d7c 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -137,6 +137,10 @@ extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg); extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg); +extern int __devinit pci_mmconfig_insert(struct device *dev, + u16 seg, u8 start, + u8 end, phys_addr_t addr); +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end); extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); extern struct list_head pci_mmcfg_list; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 15a7abf5139..19fc42b9f82 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -28,6 +28,7 @@ /* Indicate if the mmcfg resources have been placed into the resource table. 
*/ static int __initdata pci_mmcfg_resources_inserted; static bool pci_mmcfg_running_state; +static bool pci_mmcfg_arch_init_failed; static DEFINE_MUTEX(pci_mmcfg_lock); LIST_HEAD(pci_mmcfg_list); @@ -92,10 +93,6 @@ static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); res->name = new->name; - printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " - "%pR (base %#lx)\n", segment, start, end, &new->res, - (unsigned long) addr); - return new; } @@ -109,6 +106,11 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, mutex_lock(&pci_mmcfg_lock); list_add_sorted(new); mutex_unlock(&pci_mmcfg_lock); + + printk(KERN_INFO PREFIX + "MMCONFIG for domain %04x [bus %02x-%02x] at %pR " + "(base %#lx)\n", + segment, start, end, &new->res, (unsigned long)addr); } return new; @@ -671,6 +673,7 @@ static void __init __pci_mmcfg_init(int early) * the architecture mmcfg setup could not initialize. */ pci_mmcfg_resources_inserted = 1; + pci_mmcfg_arch_init_failed = true; } } @@ -713,3 +716,101 @@ static int __init pci_mmcfg_late_insert_resources(void) * with other system resources. 
*/ late_initcall(pci_mmcfg_late_insert_resources); + +/* Add MMCFG information for host bridges */ +int __devinit pci_mmconfig_insert(struct device *dev, + u16 seg, u8 start, u8 end, + phys_addr_t addr) +{ + int rc; + struct resource *tmp = NULL; + struct pci_mmcfg_region *cfg; + + if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed) + return -ENODEV; + + if (start > end) + return -EINVAL; + + mutex_lock(&pci_mmcfg_lock); + cfg = pci_mmconfig_lookup(seg, start); + if (cfg) { + if (cfg->end_bus < end) + dev_info(dev, FW_INFO + "MMCONFIG for " + "domain %04x [bus %02x-%02x] " + "only partially covers this bridge\n", + cfg->segment, cfg->start_bus, cfg->end_bus); + mutex_unlock(&pci_mmcfg_lock); + return -EEXIST; + } + + if (!addr) { + mutex_unlock(&pci_mmcfg_lock); + return -EINVAL; + } + + rc = -EBUSY; + cfg = pci_mmconfig_alloc(seg, start, end, addr); + if (cfg == NULL) { + dev_warn(dev, "fail to add MMCONFIG (out of memory)\n"); + rc = -ENOMEM; + } else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) { + dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n", + &cfg->res); + } else { + /* Insert resource if it's not in boot stage */ + if (pci_mmcfg_running_state) + tmp = insert_resource_conflict(&iomem_resource, + &cfg->res); + + if (tmp) { + dev_warn(dev, + "MMCONFIG %pR conflicts with " + "%s %pR\n", + &cfg->res, tmp->name, tmp); + } else if (pci_mmcfg_arch_map(cfg)) { + dev_warn(dev, "fail to map MMCONFIG %pR.\n", + &cfg->res); + } else { + list_add_sorted(cfg); + dev_info(dev, "MMCONFIG at %pR (base %#lx)\n", + &cfg->res, (unsigned long)addr); + cfg = NULL; + rc = 0; + } + } + + if (cfg) { + if (cfg->res.parent) + release_resource(&cfg->res); + kfree(cfg); + } + + mutex_unlock(&pci_mmcfg_lock); + + return rc; +} + +/* Delete MMCFG information for host bridges */ +int pci_mmconfig_delete(u16 seg, u8 start, u8 end) +{ + struct pci_mmcfg_region *cfg; + + mutex_lock(&pci_mmcfg_lock); + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) + if (cfg->segment 
== seg && cfg->start_bus == start && + cfg->end_bus == end) { + list_del_rcu(&cfg->list); + synchronize_rcu(); + pci_mmcfg_arch_unmap(cfg); + if (cfg->res.parent) + release_resource(&cfg->res); + mutex_unlock(&pci_mmcfg_lock); + kfree(cfg); + return 0; + } + mutex_unlock(&pci_mmcfg_lock); + + return -ENOENT; +} -- cgit v1.2.3-70-g09d2 From c0fa40784cce9cc66b54499a3762cfe07e35353f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:17 +0800 Subject: x86/PCI: update MMCONFIG information when hot-plugging PCI host bridges This patch enhances x86 arch-specific code to update MMCONFIG information when PCI host bridge hotplug event happens. Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/acpi.c | 93 ++++++++++++++++++++++++++++++++++++++++-- arch/x86/pci/mmconfig_32.c | 2 +- arch/x86/pci/mmconfig_64.c | 2 +- 4 files changed, 93 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index af5018f3d7c..b2652e95b3d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -100,6 +100,7 @@ struct pci_raw_ops { extern const struct pci_raw_ops *raw_pci_ops; extern const struct pci_raw_ops *raw_pci_ext_ops; +extern const struct pci_raw_ops pci_mmcfg; extern const struct pci_raw_ops pci_direct_conf1; extern bool port_cf9_safe; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2bb885afe10..912b54b26d6 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -13,6 +13,12 @@ struct pci_root_info { unsigned int res_num; struct resource *res; struct pci_sysdata sd; +#ifdef CONFIG_PCI_MMCONFIG + bool mcfg_added; + u16 segment; + u8 start_bus; + u8 end_bus; +#endif }; static bool pci_use_crs = true; @@ -119,6 +125,81 @@ void __init pci_acpi_crs_quirks(void) pci_use_crs ? 
"nocrs" : "use_crs"); } +#ifdef CONFIG_PCI_MMCONFIG +static int __devinit check_segment(u16 seg, struct device *dev, char *estr) +{ + if (seg) { + dev_err(dev, + "%s can't access PCI configuration " + "space under this host bridge.\n", + estr); + return -EIO; + } + + /* + * Failure in adding MMCFG information is not fatal, + * just can't access extended configuration space of + * devices under this host bridge. + */ + dev_warn(dev, + "%s can't access extended PCI configuration " + "space under this bridge.\n", + estr); + + return 0; +} + +static int __devinit setup_mcfg_map(struct pci_root_info *info, + u16 seg, u8 start, u8 end, + phys_addr_t addr) +{ + int result; + struct device *dev = &info->bridge->dev; + + info->start_bus = start; + info->end_bus = end; + info->mcfg_added = false; + + /* return success if MMCFG is not in use */ + if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg) + return 0; + + if (!(pci_probe & PCI_PROBE_MMCONF)) + return check_segment(seg, dev, "MMCONFIG is disabled,"); + + result = pci_mmconfig_insert(dev, seg, start, end, addr); + if (result == 0) { + /* enable MMCFG if it hasn't been enabled yet */ + if (raw_pci_ext_ops == NULL) + raw_pci_ext_ops = &pci_mmcfg; + info->mcfg_added = true; + } else if (result != -EEXIST) + return check_segment(seg, dev, + "fail to add MMCONFIG information,"); + + return 0; +} + +static void teardown_mcfg_map(struct pci_root_info *info) +{ + if (info->mcfg_added) { + pci_mmconfig_delete(info->segment, info->start_bus, + info->end_bus); + info->mcfg_added = false; + } +} +#else +static int __devinit setup_mcfg_map(struct pci_root_info *info, + u16 seg, u8 start, u8 end, + phys_addr_t addr) +{ + return 0; +} +static void teardown_mcfg_map(struct pci_root_info *info) +{ +} +#endif + static acpi_status resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) @@ -331,8 +412,11 @@ static void __release_pci_root_info(struct pci_root_info *info) free_pci_root_info_res(info); + 
teardown_mcfg_map(info); + kfree(info); } + static void release_pci_root_info(struct pci_host_bridge *bridge) { struct pci_root_info *info = bridge->release_data; @@ -372,7 +456,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) int domain = root->segment; int busnum = root->secondary.start; LIST_HEAD(resources); - struct pci_bus *bus; + struct pci_bus *bus = NULL; struct pci_sysdata *sd; int node; #ifdef CONFIG_ACPI_NUMA @@ -438,8 +522,11 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) x86_pci_root_bus_resources(busnum, &resources); } - bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, - &resources); + if (!setup_mcfg_map(info, domain, (u8)root->secondary.start, + (u8)root->secondary.end, root->mcfg_addr)) + bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, + sd, &resources); + if (bus) { pci_scan_child_bus(bus); pci_set_host_bridge_release( diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index a22785deb50..db63ac23e3d 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -126,7 +126,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, return 0; } -static const struct pci_raw_ops pci_mmcfg = { +const struct pci_raw_ops pci_mmcfg = { .read = pci_mmcfg_read, .write = pci_mmcfg_write, }; diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index ebefea5107a..c206521fe98 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -90,7 +90,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, return 0; } -static const struct pci_raw_ops pci_mmcfg = { +const struct pci_raw_ops pci_mmcfg = { .read = pci_mmcfg_read, .write = pci_mmcfg_write, }; -- cgit v1.2.3-70-g09d2 From 66e8850a2a34e6c52105d92a0f0054b304cb7140 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:18 +0800 Subject: x86/PCI: simplify pci_mmcfg_late_insert_resources() Reduce redundant code to simplify 
pci_mmcfg_late_insert_resources(). Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mmconfig-shared.c | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 19fc42b9f82..332fabdeff4 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -26,7 +26,6 @@ #define PREFIX "PCI: " /* Indicate if the mmcfg resources have been placed into the resource table. */ -static int __initdata pci_mmcfg_resources_inserted; static bool pci_mmcfg_running_state; static bool pci_mmcfg_arch_init_failed; static DEFINE_MUTEX(pci_mmcfg_lock); @@ -373,18 +372,6 @@ static int __init pci_mmcfg_check_hostbridge(void) return !list_empty(&pci_mmcfg_list); } -static void __init pci_mmcfg_insert_resources(void) -{ - struct pci_mmcfg_region *cfg; - - list_for_each_entry(cfg, &pci_mmcfg_list, list) - if (!cfg->res.parent) - insert_resource(&iomem_resource, &cfg->res); - - /* Mark that the resources have been inserted. */ - pci_mmcfg_resources_inserted = 1; -} - static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res, void *data) { @@ -668,11 +655,7 @@ static void __init __pci_mmcfg_init(int early) if (pci_mmcfg_arch_init()) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; else { - /* - * Signal not to attempt to insert mmcfg resources because - * the architecture mmcfg setup could not initialize. - */ - pci_mmcfg_resources_inserted = 1; + free_all_mmcfg(); pci_mmcfg_arch_init_failed = true; } } @@ -689,15 +672,12 @@ void __init pci_mmcfg_late_init(void) static int __init pci_mmcfg_late_insert_resources(void) { + struct pci_mmcfg_region *cfg; + pci_mmcfg_running_state = true; - /* - * If resources are already inserted or we are not using MMCONFIG, - * don't insert the resources. 
- */ - if ((pci_mmcfg_resources_inserted == 1) || - (pci_probe & PCI_PROBE_MMCONF) == 0 || - list_empty(&pci_mmcfg_list)) + /* If we are not using MMCONFIG, don't insert the resources. */ + if ((pci_probe & PCI_PROBE_MMCONF) == 0) return 1; /* @@ -705,7 +685,9 @@ static int __init pci_mmcfg_late_insert_resources(void) * marked so it won't cause request errors when __request_region is * called. */ - pci_mmcfg_insert_resources(); + list_for_each_entry(cfg, &pci_mmcfg_list, list) + if (!cfg->res.parent) + insert_resource(&iomem_resource, &cfg->res); return 0; } -- cgit v1.2.3-70-g09d2 From 8503562fd4e8e261bd7ca442705c6e8f0fd88228 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:19 +0800 Subject: x86/PCI: get rid of redundant log messages For each resource of a PCI host bridge, the arch code and PCI code log following messages. We don't need both, so drop the arch-specific printing. pci_root PNP0A08:00: host bridge window [io 0x0000-0x03af] pci_bus 0000:00: root bus resource [io 0x0000-0x03af] Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 912b54b26d6..505acdd6d60 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -314,13 +314,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } info->res_num++; - if (addr.translation_offset) - dev_info(&info->bridge->dev, "host bridge window %pR " - "(PCI address [%#llx-%#llx])\n", - res, res->start - addr.translation_offset, - res->end - addr.translation_offset); - else - dev_info(&info->bridge->dev, "host bridge window %pR\n", res); return AE_OK; } -- cgit v1.2.3-70-g09d2 From 574a59414083df3911e5a1514742959b412b6947 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:20 +0800 Subject: x86/PCI: refine __pci_mmcfg_init() for better code readability Refine __pci_mmcfg_init() for better code 
readability. Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mmconfig-shared.c | 46 +++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 332fabdeff4..ffe72b9d686 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -544,8 +544,6 @@ static void __init pci_mmcfg_reject_broken(int early) } } -static int __initdata known_bridge; - static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, struct acpi_mcfg_allocation *cfg) { @@ -617,28 +615,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) static void __init __pci_mmcfg_init(int early) { - /* MMCONFIG disabled */ - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return; - - /* MMCONFIG already enabled */ - if (!early && !(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF)) - return; - - /* for late to exit */ - if (known_bridge) - return; - - if (early) { - if (pci_mmcfg_check_hostbridge()) - known_bridge = 1; - } - - if (!known_bridge) - acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); - pci_mmcfg_reject_broken(early); - if (list_empty(&pci_mmcfg_list)) return; @@ -660,14 +637,33 @@ static void __init __pci_mmcfg_init(int early) } } +static int __initdata known_bridge; + void __init pci_mmcfg_early_init(void) { - __pci_mmcfg_init(1); + if (pci_probe & PCI_PROBE_MMCONF) { + if (pci_mmcfg_check_hostbridge()) + known_bridge = 1; + else + acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + __pci_mmcfg_init(1); + } } void __init pci_mmcfg_late_init(void) { - __pci_mmcfg_init(0); + /* MMCONFIG disabled */ + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return; + + if (known_bridge) + return; + + /* MMCONFIG hasn't been enabled yet, try again */ + if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) { + acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + __pci_mmcfg_init(0); 
+ } } static int __init pci_mmcfg_late_insert_resources(void) -- cgit v1.2.3-70-g09d2 From 24c97f04c4570e02c5cf4b97c73ab9dc27bacdbe Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Jun 2012 14:55:22 +0800 Subject: x86/PCI: use pr_level() to replace printk(KERN_LEVEL) Script checkpatch.pl recommends to replace printk(KERN_LVL) with pr_lvl(), so do it. Reviewed-by: Yinghai Lu Signed-off-by: Jiang Liu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mmconfig-shared.c | 21 +++++++++------------ arch/x86/pci/mmconfig_64.c | 3 +-- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index ffe72b9d686..937bcece700 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -106,7 +106,7 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, list_add_sorted(new); mutex_unlock(&pci_mmcfg_lock); - printk(KERN_INFO PREFIX + pr_info(PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at %pR " "(base %#lx)\n", segment, start, end, &new->res, (unsigned long)addr); @@ -362,8 +362,7 @@ static int __init pci_mmcfg_check_hostbridge(void) name = pci_mmcfg_probes[i].probe(); if (name) - printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", - name); + pr_info(PREFIX "%s with MMCONFIG support\n", name); } /* some end_bus_number is crazy, fix it */ @@ -465,8 +464,7 @@ static int __ref is_mmconf_reserved(check_reserved_t is_reserved, dev_info(dev, "MMCONFIG at %pR reserved in %s\n", &cfg->res, method); else - printk(KERN_INFO PREFIX - "MMCONFIG at %pR reserved in %s\n", + pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n", &cfg->res, method); if (old_size != size) { @@ -485,7 +483,7 @@ static int __ref is_mmconf_reserved(check_reserved_t is_reserved, "at %pR (base %#lx) (size reduced!)\n", &cfg->res, (unsigned long) cfg->address); else - printk(KERN_INFO PREFIX + pr_info(PREFIX "MMCONFIG for %04x [bus%02x-%02x] " "at %pR (base %#lx) (size 
reduced!)\n", cfg->segment, cfg->start_bus, cfg->end_bus, @@ -508,7 +506,7 @@ static int __ref pci_mmcfg_check_reserved(struct device *dev, "ACPI motherboard resources\n", &cfg->res); else - printk(KERN_INFO FW_INFO PREFIX + pr_info(FW_INFO PREFIX "MMCONFIG at %pR not reserved in " "ACPI motherboard resources\n", &cfg->res); @@ -537,7 +535,7 @@ static void __init pci_mmcfg_reject_broken(int early) list_for_each_entry(cfg, &pci_mmcfg_list, list) { if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) { - printk(KERN_INFO PREFIX "not using MMCONFIG\n"); + pr_info(PREFIX "not using MMCONFIG\n"); free_all_mmcfg(); return; } @@ -562,7 +560,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, return 0; } - printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " + pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " "is above 4GB, ignored\n", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number, cfg->address); return -EINVAL; @@ -589,7 +587,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) i -= sizeof(struct acpi_mcfg_allocation); }; if (entries == 0) { - printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); + pr_err(PREFIX "MMCONFIG has no entries\n"); return -ENODEV; } @@ -603,8 +601,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number, cfg->address) == NULL) { - printk(KERN_WARNING PREFIX - "no memory for MCFG entries\n"); + pr_warn(PREFIX "no memory for MCFG entries\n"); free_all_mmcfg(); return -ENOMEM; } diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index c206521fe98..d4ebd07c306 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -137,8 +137,7 @@ int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) { cfg->virt = mcfg_ioremap(cfg); if (!cfg->virt) { - printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n", - &cfg->res); + pr_err(PREFIX 
"can't map MMCONFIG at %pR\n", &cfg->res); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 8497f696686ae1ab3f01e5956046d59844b9f500 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 23 Jun 2012 10:23:50 +0800 Subject: PCI: do not call pci_set_power_state with PCI_D3cold PCI subsystem has not been ready for D3cold support yet. So PCI_D3cold should not be used as parameter for pci_set_power_state. This patch is needed for upcoming PCI_D3cold support. This patch has no functionality change, because pci_set_power_state will bound the parameter to PCI_D3hot too. CC: Michal Miroslaw CC: Jesse Barnes Reviewed-by: Rafael J. Wysocki Signed-off-by: Huang Ying Signed-off-by: Bjorn Helgaas --- arch/x86/pci/mrst.c | 2 +- drivers/misc/cb710/core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 140942f66b3..e14a2ff708b 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -264,7 +264,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev) { - pci_set_power_state(dev, PCI_D3cold); + pci_set_power_state(dev, PCI_D3hot); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); diff --git a/drivers/misc/cb710/core.c b/drivers/misc/cb710/core.c index 85cc7710193..9d5eed75466 100644 --- a/drivers/misc/cb710/core.c +++ b/drivers/misc/cb710/core.c @@ -180,7 +180,7 @@ static int cb710_suspend(struct pci_dev *pdev, pm_message_t state) pci_save_state(pdev); pci_disable_device(pdev); if (state.event & PM_EVENT_SLEEP) - pci_set_power_state(pdev, PCI_D3cold); + pci_set_power_state(pdev, PCI_D3hot); return 0; } -- cgit v1.2.3-70-g09d2 From d9b0cde91c60da0ed5f92cdc3ac878142e6b5f27 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Tue, 29 May 2012 14:31:23 -0700 Subject: x86-64, gcc: Use -mpreferred-stack-boundary=3 if supported On x86-64, the standard ABI requires alignment to 16 bytes. However, this is not actually necessary in the kernel (we don't do SSE except in very controlled ways); and furthermore, the standard kernel entry on x86-64 actually leaves the stack on an odd 8-byte boundary, which means that gcc will generate extra instructions to keep the stack *mis*aligned! gcc 4.8 adds an -mpreferred-stack-boundary=3 option to override this and lets us save some stack space and a handful of instructions. Note that this causes us to pass -mno-sse twice; this is redundant, but necessary since the cc-option test will fail unless -mno-sse is passed on the same command line. [ hpa: rewrote the patch description ] Signed-off-by: H.J. Lu Link: http://lkml.kernel.org/r/CAMe9rOqPfy3JcZRLaUeCjBe9BVY-P6e0uaSbMi5hvS-6WwQueg@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1f252143455..b0c5276861e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -49,6 +49,9 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 + # Use -mpreferred-stack-boundary=3 if supported. 
+ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3) + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) -- cgit v1.2.3-70-g09d2 From 357398e96d8c883b010379a7669df43ed0e2e32b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 18:39:27 +0200 Subject: perf/x86: Fix section mismatch in uncore_pci_init() Fix section mismatch in uncore_pci_init(): WARNING: vmlinux.o(.init.text+0x9246): Section mismatch in reference from the function uncore_pci_init() to the function .devexit.text:uncore_pci_remove() The function __init uncore_pci_init() references a function __devexit uncore_pci_remove(). [...] Signed-off-by: Robert Richter Cc: Cc: Link: http://lkml.kernel.org/r/20120620163927.GI5046@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 28a8413ca19..6f43f9584e3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1313,7 +1313,7 @@ static int __devinit uncore_pci_add(struct intel_uncore_type *type, return 0; } -static void __devexit uncore_pci_remove(struct pci_dev *pdev) +static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu = box->pmu; -- cgit v1.2.3-70-g09d2 From 5eadf916dfa04c3c51397dbcb803ce8735bf191a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 24 Jun 2012 19:24:19 +0300 Subject: KVM: document lapic regs field Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d0..d29da25ea52 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,6 +13,11 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; + /** + * APIC register page. The layout matches the register layout seen by + * the guest 1:1, because it is accessed by the vmx microcode. + * Note: Only one register, the TPR, is used by the microcode. + */ void *regs; gpa_t vapic_addr; struct page *vapic_page; -- cgit v1.2.3-70-g09d2 From 8680b94b0e6046af2644c17313287ec0cb5843dc Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 24 Jun 2012 19:24:26 +0300 Subject: KVM: optimize ISR lookups We perform ISR lookups twice: during interrupt injection and on EOI. Typical workloads only have a single bit set there. So we can avoid ISR scans by 1. counting bits as we set/clear them in ISR 2. on set, caching the injected vector number 3. on clear, invalidating the cache The real purpose of this is enabling PV EOI which needs to quickly validate the vector. But non PV guests also benefit: with this patch, and without interrupt nesting, apic_find_highest_isr will always return immediately without scanning ISR. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/lapic.h | 4 ++++ 2 files changed, 55 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c15743f1e..805d887784f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap) clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int __apic_test_and_set_vector(int vec, void *bitmap) +{ + return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int __apic_test_and_clear_vector(int vec, void *bitmap) +{ + return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline int apic_hw_enabled(struct kvm_lapic *apic) { return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; @@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap) return fls(word[word_offset << 2]) - 1 + (word_offset << 5); } +static u8 count_vectors(void *bitmap) +{ + u32 *word = bitmap; + int word_offset; + u8 count = 0; + for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) + count += hweight32(word[word_offset << 2]); + return count; +} + static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; @@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } +static inline void apic_set_isr(int vec, struct kvm_lapic *apic) +{ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. 
+ */ + apic->highest_isr_cache = vec; +} + +static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) +{ + if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + --apic->isr_count; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -273,6 +314,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; result = find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); @@ -492,7 +537,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) if (vector == -1) return; - apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_clear_isr(vector, apic); apic_update_ppr(apic); if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && @@ -1081,6 +1126,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = false; + apic->isr_count = 0; + apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) @@ -1248,7 +1295,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) if (vector == -1) return -1; - apic_set_vector(vector, apic->regs + APIC_ISR); + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); return vector; @@ -1267,6 +1314,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->highest_isr_cache = -1; kvm_make_request(KVM_REQ_EVENT, vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d29da25ea52..5ac9e5e2fed 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,6 +13,10 
@@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; + /* Number of bits set in ISR. */ + s16 isr_count; + /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ + int highest_isr_cache; /** * APIC register page. The layout matches the register layout seen by * the guest 1:1, because it is accessed by the vmx microcode. -- cgit v1.2.3-70-g09d2 From ab9cf4996bb989983e73da894b8dd0239aa2c3c2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 24 Jun 2012 19:24:34 +0300 Subject: KVM guest: guest side for eoi avoidance The idea is simple: there's a bit, per APIC, in guest memory, that tells the guest that it does not need EOI. Guest tests it using a single test and clear operation - this is necessary so that host can detect interrupt nesting - and if set, it can skip the EOI MSR. I run a simple microbenchmark to show exit reduction (note: for testing, need to apply follow-up patch 'kvm: host side for eoi optimization' + a qemu patch I posted separately, on host): Before: Performance counter stats for 'sleep 1s': 47,357 kvm:kvm_entry [99.98%] 0 kvm:kvm_hypercall [99.98%] 0 kvm:kvm_hv_hypercall [99.98%] 5,001 kvm:kvm_pio [99.98%] 0 kvm:kvm_cpuid [99.98%] 22,124 kvm:kvm_apic [99.98%] 49,849 kvm:kvm_exit [99.98%] 21,115 kvm:kvm_inj_virq [99.98%] 0 kvm:kvm_inj_exception [99.98%] 0 kvm:kvm_page_fault [99.98%] 22,937 kvm:kvm_msr [99.98%] 0 kvm:kvm_cr [99.98%] 0 kvm:kvm_pic_set_irq [99.98%] 0 kvm:kvm_apic_ipi [99.98%] 22,207 kvm:kvm_apic_accept_irq [99.98%] 22,421 kvm:kvm_eoi [99.98%] 0 kvm:kvm_pv_eoi [99.99%] 0 kvm:kvm_nested_vmrun [99.99%] 0 kvm:kvm_nested_intercepts [99.99%] 0 kvm:kvm_nested_vmexit [99.99%] 0 kvm:kvm_nested_vmexit_inject [99.99%] 0 kvm:kvm_nested_intr_vmexit [99.99%] 0 kvm:kvm_invlpga [99.99%] 0 kvm:kvm_skinit [99.99%] 57 kvm:kvm_emulate_insn [99.99%] 0 kvm:vcpu_match_mmio [99.99%] 0 kvm:kvm_userspace_exit [99.99%] 2 kvm:kvm_set_irq [99.99%] 2 kvm:kvm_ioapic_set_irq [99.99%] 23,609 kvm:kvm_msi_set_irq 
[99.99%] 1 kvm:kvm_ack_irq [99.99%] 131 kvm:kvm_mmio [99.99%] 226 kvm:kvm_fpu [100.00%] 0 kvm:kvm_age_page [100.00%] 0 kvm:kvm_try_async_get_page [100.00%] 0 kvm:kvm_async_pf_doublefault [100.00%] 0 kvm:kvm_async_pf_not_present [100.00%] 0 kvm:kvm_async_pf_ready [100.00%] 0 kvm:kvm_async_pf_completed 1.002100578 seconds time elapsed After: Performance counter stats for 'sleep 1s': 28,354 kvm:kvm_entry [99.98%] 0 kvm:kvm_hypercall [99.98%] 0 kvm:kvm_hv_hypercall [99.98%] 1,347 kvm:kvm_pio [99.98%] 0 kvm:kvm_cpuid [99.98%] 1,931 kvm:kvm_apic [99.98%] 29,595 kvm:kvm_exit [99.98%] 24,884 kvm:kvm_inj_virq [99.98%] 0 kvm:kvm_inj_exception [99.98%] 0 kvm:kvm_page_fault [99.98%] 1,986 kvm:kvm_msr [99.98%] 0 kvm:kvm_cr [99.98%] 0 kvm:kvm_pic_set_irq [99.98%] 0 kvm:kvm_apic_ipi [99.99%] 25,953 kvm:kvm_apic_accept_irq [99.99%] 26,132 kvm:kvm_eoi [99.99%] 26,593 kvm:kvm_pv_eoi [99.99%] 0 kvm:kvm_nested_vmrun [99.99%] 0 kvm:kvm_nested_intercepts [99.99%] 0 kvm:kvm_nested_vmexit [99.99%] 0 kvm:kvm_nested_vmexit_inject [99.99%] 0 kvm:kvm_nested_intr_vmexit [99.99%] 0 kvm:kvm_invlpga [99.99%] 0 kvm:kvm_skinit [99.99%] 284 kvm:kvm_emulate_insn [99.99%] 68 kvm:vcpu_match_mmio [99.99%] 68 kvm:kvm_userspace_exit [99.99%] 2 kvm:kvm_set_irq [99.99%] 2 kvm:kvm_ioapic_set_irq [99.99%] 28,288 kvm:kvm_msi_set_irq [99.99%] 1 kvm:kvm_ack_irq [99.99%] 131 kvm:kvm_mmio [100.00%] 588 kvm:kvm_fpu [100.00%] 0 kvm:kvm_age_page [100.00%] 0 kvm:kvm_try_async_get_page [100.00%] 0 kvm:kvm_async_pf_doublefault [100.00%] 0 kvm:kvm_async_pf_not_present [100.00%] 0 kvm:kvm_async_pf_ready [100.00%] 0 kvm:kvm_async_pf_completed 1.002039622 seconds time elapsed We see that # of exits is almost halved. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 7 +++++ arch/x86/kernel/kvm.c | 57 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00..2f7712e08b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -22,6 +22,7 @@ #define KVM_FEATURE_CLOCKSOURCE2 3 #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -37,6 +38,7 @@ #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 struct kvm_steal_time { __u64 steal; @@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data { __u32 enabled; }; +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + #ifdef __KERNEL__ #include diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe..75ab94c75c7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include static int kvmapf = 1; @@ -283,6 +285,22 @@ static void kvm_register_steal_time(void) cpu, __pa(st)); } +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. 
+ */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic->write(APIC_EOI, APIC_EOI_ACK); +} + void __cpuinit kvm_guest_cpu_init(void) { if (!kvm_para_available()) @@ -300,11 +318,20 @@ void __cpuinit kvm_guest_cpu_init(void) smp_processor_id()); } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + if (has_steal_clock) kvm_register_steal_time(); } -static void kvm_pv_disable_apf(void *unused) +static void kvm_pv_disable_apf(void) { if (!__get_cpu_var(apf_reason).enabled) return; @@ -316,11 +343,23 @@ static void kvm_pv_disable_apf(void *unused) smp_processor_id()); } +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. 
+ */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); +} + static int kvm_pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { if (code == SYS_RESTART) - on_each_cpu(kvm_pv_disable_apf, NULL, 1); + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); return NOTIFY_DONE; } @@ -371,7 +410,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { kvm_disable_steal_time(); - kvm_pv_disable_apf(NULL); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); apf_task_wake_all(); } @@ -424,6 +465,16 @@ void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == kvm_guest_apic_eoi_write); + (*drv)->eoi_write = kvm_guest_apic_eoi_write; + } + } + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); -- cgit v1.2.3-70-g09d2 From d0a69d6321ca759bb8d47803d06ba8571ab42d07 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 24 Jun 2012 19:24:42 +0300 Subject: x86, bitops: note on __test_and_clear_bit atomicity __test_and_clear_bit is actually atomic with respect to the local CPU. Add a note saying that KVM on x86 relies on this behaviour so people don't accidentally break it. Also warn not to rely on this in portable code. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/include/asm/bitops.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a6983b27722..72f5009deb5 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. + * + * Note: the operation is performed atomically with respect to + * the local CPU, but not other CPUs. Portable code should not + * rely on this behaviour. + * KVM relies on this behaviour on x86 for modifying memory that is also + * accessed from a hypervisor on the same CPU if running in a VM: don't change + * this without also updating arch/x86/kernel/kvm.c */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { -- cgit v1.2.3-70-g09d2 From 5cfb1d5a65dd96d2d3a0751a1e4e81dc84c1f08f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 24 Jun 2012 19:24:54 +0300 Subject: KVM: only sync when attention bits set Commit eb0dc6d0368072236dcd086d7fdc17fd3c4574d4 introduced apic attention bitmask but kvm still syncs lapic unconditionally. As that commit suggested and in anticipation of adding more attention bits, only sync lapic if(apic_attention). Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6ed5983f78f..c1f870690a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5388,7 +5388,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.tsc_always_catchup)) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_lapic_sync_from_vapic(vcpu); + if (vcpu->arch.apic_attention) + kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(vcpu); out: -- cgit v1.2.3-70-g09d2 From d905c0693514e6f713b207377b67c9972c5d7d49 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 24 Jun 2012 19:25:00 +0300 Subject: KVM: rearrange injection cancelling code Each time we need to cancel injection we invoke same code (cancel_injection callback). Move it towards the end of function using the familiar goto on error pattern. Will make it easier to do more cleanups for PV EOI. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1f870690a6..7ea0f611bc8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5296,8 +5296,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = kvm_mmu_reload(vcpu); if (unlikely(r)) { - kvm_x86_ops->cancel_injection(vcpu); - goto out; + goto cancel_injection; } preempt_disable(); @@ -5322,9 +5321,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); r = 1; - goto out; + goto cancel_injection; } srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -5392,6 +5390,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(vcpu); + return r; + +cancel_injection: + kvm_x86_ops->cancel_injection(vcpu); out: return r; } -- cgit v1.2.3-70-g09d2 From ae7a2a3fb6f8b784c2752863f4f1f20c656f76fb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 24 Jun 2012 19:25:07 +0300 Subject: KVM: host side for eoi optimization Implementation of PV EOI using shared memory. This reduces the number of exits an interrupt causes as much as by half. The idea is simple: there's a bit, per APIC, in guest memory, that tells the guest that it does not need EOI. We set it before injecting an interrupt and clear before injecting a nested one. Guest tests it using a test and clear operation - this is necessary so that host can detect interrupt nesting - and if set, it can skip the EOI MSR. There's a new MSR to set the address of said register in guest memory. Otherwise not much changed: - Guest EOI is not required - Register is tested & ISR is automatically cleared on exit For testing results see description of previous patch 'kvm_para: guest side for eoi avoidance'. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 ++++ arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/lapic.c | 141 ++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/lapic.h | 2 + arch/x86/kvm/trace.h | 34 ++++++++++ arch/x86/kvm/x86.c | 7 ++ 6 files changed, 193 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db7c1f2709a..24b76474d9d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -175,6 +175,13 @@ enum { /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 +/* + * The following bit is set with PV-EOI, unset on EOI. + * We detect PV-EOI changes by guest by comparing + * this bit with PV-EOI in guest memory. + * See the implementation in apic_update_pv_eoi. + */ +#define KVM_APIC_PV_EOI_PENDING 1 /* * We don't want allocation failures within the mmu code, so we preallocate @@ -484,6 +491,11 @@ struct kvm_vcpu_arch { u64 length; u64 status; } osvw; + + struct { + u64 msr_val; + struct gfn_to_hva_cache data; + } pv_eoi; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7df1c6d839f..61ccbdf3d0a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -409,6 +409,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); if (sched_info_on()) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 805d887784f..ce878788a39 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -311,6 +311,54 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) irq->level, irq->trig_mode); } +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} + +static int 
pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ + u8 val; + if (pv_eoi_get_user(vcpu, &val) < 0) + apic_debug("Can't read EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { + apic_debug("Can't set EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { + apic_debug("Can't clear EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; @@ -527,15 +575,18 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } -static void apic_set_eoi(struct kvm_lapic *apic) +static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); + + trace_kvm_eoi(apic, vector); + /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) - return; + return vector; apic_clear_isr(vector, apic); apic_update_ppr(apic); @@ -550,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + return vector; } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1132,6 +1184,7 @@ 
void kvm_lapic_reset(struct kvm_vcpu *vcpu) atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; @@ -1332,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + * Clear PV EOI in guest memory in any case. + */ +static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + bool pending; + int vector; + /* + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host + * and KVM_PV_EOI_ENABLED in guest memory as follows: + * + * KVM_APIC_PV_EOI_PENDING is unset: + * -> host disabled PV EOI. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: + * -> host enabled PV EOI, guest did not execute EOI yet. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: + * -> host enabled PV EOI, guest executed EOI. + */ + BUG_ON(!pv_eoi_enabled(vcpu)); + pending = pv_eoi_get_pending(vcpu); + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. 
+ */ + pv_eoi_clr_pending(vcpu); + if (pending) + return; + vector = apic_set_eoi(apic); + trace_kvm_pv_eoi(apic, vector); +} + void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; void *vapic; + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) + apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; @@ -1347,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) apic_set_tpr(vcpu->arch.apic, data & 0xff); } +/* + * apic_sync_pv_eoi_to_guest - called before vmentry + * + * Detect whether it's safe to enable PV EOI and + * if yes do so. + */ +static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + if (!pv_eoi_enabled(vcpu) || + /* IRR set or many bits in ISR: could be nested. */ + apic->irr_pending || + /* Cache not set: could be safe but we don't bother. */ + apic->highest_isr_cache == -1 || + /* Need EOI to update ioapic. */ + kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + /* + * PV EOI was disabled by apic_sync_pv_eoi_from_guest + * so we need not do anything here. 
+ */ + return; + } + + pv_eoi_set_pending(apic->vcpu); +} + void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; - struct kvm_lapic *apic; + struct kvm_lapic *apic = vcpu->arch.apic; void *vapic; + apic_sync_pv_eoi_to_guest(vcpu, apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - apic = vcpu->arch.apic; tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) @@ -1443,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ + u64 addr = data & ~KVM_MSR_ENABLED; + if (!IS_ALIGNED(addr, 4)) + return 1; + + vcpu->arch.pv_eoi.msr_val = data; + if (!pv_eoi_enabled(vcpu)) + return 0; + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + addr); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 5ac9e5e2fed..4af5405ae1e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -69,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); #endif diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14..851914e207f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? 
" (coalesced)" : "") ); +TRACE_EVENT(kvm_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + /* * Tracepoint for nested VMRUN */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ea0f611bc8..8eacb2e6456 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -795,6 +795,7 @@ static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1653,6 +1654,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; + case MSR_KVM_PV_EOI_EN: + if (kvm_lapic_enable_pv_eoi(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: @@ -5394,6 +5399,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) cancel_injection: kvm_x86_ops->cancel_injection(vcpu); + if (unlikely(vcpu->arch.apic_attention)) + kvm_lapic_sync_from_vapic(vcpu); out: return r; } -- cgit v1.2.3-70-g09d2 From 7d43c2e42cb1e436f97c1763150e4e1122ae0d57 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 30 May 2012 14:19:55 -0600 Subject: iommu: Remove group_mf The iommu=group_mf is really no longer needed with the addition of ACS support 
in IOMMU drivers creating groups. Most multifunction devices will now be grouped already. If a device has gone to the trouble of exposing ACS, trust that it works. We can use the device specific ACS function for fixing devices we trust individually. This largely reverts bcb71abe. Signed-off-by: Alex Williamson Signed-off-by: Joerg Roedel --- Documentation/kernel-parameters.txt | 1 - arch/ia64/include/asm/iommu.h | 2 -- arch/ia64/kernel/pci-dma.c | 1 - arch/x86/include/asm/iommu.h | 1 - arch/x86/kernel/pci-dma.c | 11 ----------- 5 files changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a92c5ebf373..d2f4f7acc43 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1134,7 +1134,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. forcesac soft pt [x86, IA-64] - group_mf [x86, IA-64] io7= [HW] IO7 for Marvel based alpha systems diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index b6a809fa299..105c93b00b1 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -11,12 +11,10 @@ extern void no_iommu_init(void); extern int force_iommu, no_iommu; extern int iommu_pass_through; extern int iommu_detected; -extern int iommu_group_mf; #else #define iommu_pass_through (0) #define no_iommu (1) #define iommu_detected (0) -#define iommu_group_mf (0) #endif extern void iommu_dma_init(void); extern void machvec_init(const char *name); diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index 7cdc89b2483..1ddcfe5ef35 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -32,7 +32,6 @@ int force_iommu __read_mostly; #endif int iommu_pass_through; -int iommu_group_mf; /* Dummy device used for NULL arguments (normally ISA). 
Better would be probably a smaller DMA mask, but this is bug-to-bug compatible diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index dffc38ee625..345c99cef15 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -5,7 +5,6 @@ extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; -extern int iommu_group_mf; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c0f420f76cd..de2b7ad7027 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; -/* - * Group multi-function PCI devices into a single device-group for the - * iommu_device_group interface. This tells the iommu driver to pretend - * it cannot distinguish between functions of a device, exposing only one - * group for the device. Useful for disallowing use of individual PCI - * functions from userspace drivers. - */ -int iommu_group_mf __read_mostly; - extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; /* Dummy device used for NULL arguments (normally ISA). */ @@ -194,8 +185,6 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - if (!strncmp(p, "group_mf", 8)) - iommu_group_mf = 1; gart_parse_options(p); -- cgit v1.2.3-70-g09d2 From 11cab711f686893f2696a061dfca30454a624784 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 22 Jun 2012 08:12:12 -0500 Subject: x86/uv: Fix the UV BAU destination timeout period Correct the calculation of a destination timeout period, which is used to distinguish between a destination timeout and the situation where all the target software ack resources are full and a request is returned immediately. 
The problem is that integer arithmetic was overflowing, yielding a very large result. Without this fix destination timeouts are identified as resource 'plugged' events and an ipi method of resource releasing is unnecessarily employed. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120622131212.GA31884@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 59880afa851..0c48d438cbb 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1811,8 +1811,8 @@ static int calculate_destination_timeout(void) index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; + ts_ns = timeout_base_ns[index]; + ts_ns *= (mult1 * mult2); ret = ts_ns / 1000; } else { /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ -- cgit v1.2.3-70-g09d2 From 26ef85770c765bb8b6b6922f8a413872dd8e3979 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 22 Jun 2012 08:13:30 -0500 Subject: x86/uv: Implement UV BAU runtime enable and disable control via /proc/sgi_uv/ This patch enables the BAU to be turned on or off dynamically. echo "on" > /proc/sgi_uv/ptc_statistics echo "off" > /proc/sgi_uv/ptc_statistics The system may be booted with or without the nobau option. Whether the system currently has the BAU off can be seen in the /proc file -- normally with the baustats script. Each cpu will have a 1 in the bauoff field if the BAU was turned off, so baustats will give a count of cpus that have it off. 
Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120622131330.GB31884@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 2 ++ arch/x86/platform/uv/tlb_uv.c | 76 ++++++++++++++++++++++++++++++++-------- 2 files changed, 63 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 6149b476d9d..847c00b721b 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -520,6 +520,7 @@ struct ptc_stats { unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ + unsigned long s_enters; /* entries to the driver */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -586,6 +587,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; + short nobau; int baudisabled; int set_bau_off; short cpu; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 0c48d438cbb..1492170cbb5 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -38,6 +38,7 @@ static int timeout_base_ns[] = { static int timeout_us; static int nobau; +static int nobau_perm; static int baudisabled; static spinlock_t disable_lock; static cycles_t congested_cycles; @@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); +static void +set_bau_on(void) +{ + int cpu; + struct bau_control *bcp; + + if (nobau_perm) { + pr_info("BAU not initialized; cannot be turned on\n"); + return; + } + nobau = 0; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 0; + } + pr_info("BAU turned on\n"); + return; +} + +static void +set_bau_off(void) +{ + int cpu; + struct bau_control 
*bcp; + + nobau = 1; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 1; + } + pr_info("BAU turned off\n"); + return; +} + /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. @@ -1079,12 +1114,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct ptc_stats *stat; struct bau_control *bcp; - /* kernel was booted 'nobau' */ - if (nobau) - return cpumask; - bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; + stat->s_enters++; + + if (bcp->nobau) + return cpumask; /* bau was disabled due to slow response */ if (bcp->baudisabled) { @@ -1338,29 +1373,32 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec) static int ptc_seq_show(struct seq_file *file, void *data) { struct ptc_stats *stat; + struct bau_control *bcp; int cpu; cpu = *(loff_t *)data; if (!cpu) { seq_printf(file, - "# cpu sent stime self locals remotes ncpus localhub "); + "# cpu bauoff sent stime self locals remotes ncpus localhub "); seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); seq_printf(file, - "resetp resett giveup sto bz throt swack recv rtime "); + "resetp resett giveup sto bz throt enters swack recv rtime "); seq_printf(file, "all one mult none retry canc nocan reset rcan "); seq_printf(file, "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { - stat = &per_cpu(ptcstats, cpu); + bcp = &per_cpu(bau_control, cpu); + stat = bcp->statp; /* source side statistics */ seq_printf(file, - "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->s_requestor, cycles_2_us(stat->s_time), + "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + cpu, bcp->nobau, stat->s_requestor, + cycles_2_us(stat->s_time), stat->s_ntargself, stat->s_ntarglocals, stat->s_ntargremotes, stat->s_ntargcpu, stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, @@ 
-1369,11 +1407,11 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, stat->s_dtimeout, stat->s_strongnacks); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles); + stat->s_busy, stat->s_throttles, stat->s_enters); /* destination side statistics */ seq_printf(file, @@ -1438,6 +1476,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, return -EFAULT; optstr[count - 1] = '\0'; + if (!strcmp(optstr, "on")) { + set_bau_on(); + return count; + } else if (!strcmp(optstr, "off")) { + set_bau_off(); + return count; + } + if (strict_strtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; @@ -1836,6 +1882,8 @@ static void __init init_per_cpu_tunables(void) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + if (nobau) + bcp->nobau = 1; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = usec_2_cycles(2*timeout_us); @@ -2069,9 +2117,6 @@ static int __init uv_bau_init(void) if (!is_uv_system()) return 0; - if (nobau) - return 0; - for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); @@ -2091,7 +2136,8 @@ static int __init uv_bau_init(void) enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { - nobau = 1; + set_bau_off(); + nobau_perm = 1; return 0; } -- cgit v1.2.3-70-g09d2 From 8b6e511e51f7e540c8e71022318ee4cc9a4567a7 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 22 Jun 2012 08:14:59 -0500 Subject: x86/uv: Work around UV2 BAU hangs On SGI's UV2 the BAU (Broadcast Assist Unit) driver can hang 
under a heavy load. To cure this: - Disable the UV2 extended status mode (see UV2_EXT_SHFT), as this mode changes BAU behavior in more ways than just delivering an extra bit of status. Revert status to just two meaningful bits, like UV1. - Use no IPI-style resets on UV2. Just give up the request for whatever the reason it failed and let it be accomplished with the legacy IPI method. - Use no alternate sending descriptor (the former UV2 workaround bcp->using_desc and handle_uv2_busy() stuff). Just disable the use of the BAU for a period of time in favor of the legacy IPI method when the h/w bug leaves a descriptor busy. -- new tunable: giveup_limit determines the threshold at which a hub is so plugged that it should do all requests with the legacy IPI method for a period of time -- generalize disable_for_congestion() (renamed disable_for_period()) for use whenever a hub should avoid using the BAU for a period of time Also: - Fix find_another_by_swack(), which is part of the UV2 bug workaround - Correct and clarify the statistics (new stats s_overipilimit, s_giveuplimit, s_enters, s_ipifordisabled, s_plugged, s_congested) Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120622131459.GC31884@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 28 ++- arch/x86/platform/uv/tlb_uv.c | 387 ++++++++++++++++++--------------------- 2 files changed, 200 insertions(+), 215 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 847c00b721b..a06983cdc12 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -140,6 +140,9 @@ #define IPI_RESET_LIMIT 1 /* after this # consecutive successes, bump up the throttle if it was lowered */ #define COMPLETE_THRESHOLD 5 +/* after this # of giveups (fall back to kernel IPI's) disable the use of + the BAU for a period of time */ +#define GIVEUP_LIMIT 100 #define UV_LB_SUBNODEID 0x10 @@ -166,7 +169,6 @@ #define
FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 -#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed @@ -175,7 +177,7 @@ microseconds */ #define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ -#define CONGESTED_PERIOD 30 /* time for the bau to be +#define DISABLED_PERIOD 10 /* time for the bau to be disabled, in seconds */ /* see msg_type: */ #define MSG_NOOP 0 @@ -520,7 +522,12 @@ struct ptc_stats { unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ - unsigned long s_enters; /* entries to the driver */ + unsigned long s_overipilimit; /* over the ipi reset limit */ + unsigned long s_giveuplimit; /* disables, over giveup limit*/ + unsigned long s_enters; /* entries to the driver */ + unsigned long s_ipifordisabled; /* fall back to IPI; disabled */ + unsigned long s_plugged; /* plugged by h/w bug*/ + unsigned long s_congested; /* giveup on long wait */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -588,8 +595,7 @@ struct bau_control { int ipi_attempts; int conseccompletes; short nobau; - int baudisabled; - int set_bau_off; + short baudisabled; short cpu; short osnode; short uvhub_cpu; @@ -598,14 +604,16 @@ struct bau_control { short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; - short using_desc; /* an index, like uvhub_cpu */ - unsigned int inuse_map; + short busy; /* all were busy (war) */ unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; cycles_t send_message; + cycles_t period_end; + cycles_t period_time; spinlock_t uvhub_lock; spinlock_t queue_lock; + spinlock_t disable_lock; /* tunables */ int max_concurr; int max_concurr_const; @@ -616,9 +624,9 @@ struct bau_control { int complete_threshold; int 
cong_response_us; int cong_reps; - int cong_period; - unsigned long clocks_per_100_usec; - cycles_t period_time; + cycles_t disabled_period; + int period_giveups; + int giveup_limit; long period_requests; struct hub_and_pnode *thp; }; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 1492170cbb5..71b5d5a07d7 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2011 Cliff Wickman , SGI. + * (c) 2008-2012 Cliff Wickman , SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -39,8 +39,6 @@ static int timeout_base_ns[] = { static int timeout_us; static int nobau; static int nobau_perm; -static int baudisabled; -static spinlock_t disable_lock; static cycles_t congested_cycles; /* tunables: */ @@ -48,12 +46,13 @@ static int max_concurr = MAX_BAU_CONCURRENT; static int max_concurr_const = MAX_BAU_CONCURRENT; static int plugged_delay = PLUGGED_DELAY; static int plugsb4reset = PLUGSB4RESET; +static int giveup_limit = GIVEUP_LIMIT; static int timeoutsb4reset = TIMEOUTSB4RESET; static int ipi_reset_limit = IPI_RESET_LIMIT; static int complete_threshold = COMPLETE_THRESHOLD; static int congested_respns_us = CONGESTED_RESPONSE_US; static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; +static int disabled_period = DISABLED_PERIOD; static struct tunables tunables[] = { {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ @@ -64,7 +63,8 @@ static struct tunables tunables[] = { {&complete_threshold, COMPLETE_THRESHOLD}, {&congested_respns_us, CONGESTED_RESPONSE_US}, {&congested_reps, CONGESTED_REPS}, - {&congested_period, CONGESTED_PERIOD} + {&disabled_period, DISABLED_PERIOD}, + {&giveup_limit, GIVEUP_LIMIT} }; static struct dentry *tunables_dir; @@ -313,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, * Both sockets dump their 
completed count total into * the message's count. */ - smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + *sp = 0; asp = (struct atomic_short *)&msg->acknowledge_count; msg_ack_count = atom_asr(socket_ack_count, asp); @@ -526,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, } /* - * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. + * But not currently used. */ static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; - unsigned long descriptor_status2; - descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; - descriptor_status = (descriptor_status << 1) | descriptor_status2; + descriptor_status = + ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; return descriptor_status; } @@ -566,87 +565,11 @@ int normal_busy(struct bau_control *bcp) */ int handle_uv2_busy(struct bau_control *bcp) { - int busy_one = bcp->using_desc; - int normal = bcp->uvhub_cpu; - int selected = -1; - int i; - unsigned long descriptor_status; - unsigned long status; - int mmr_offset; - struct bau_desc *bau_desc_old; - struct bau_desc *bau_desc_new; - struct bau_control *hmaster = bcp->uvhub_master; struct ptc_stats *stat = bcp->statp; - cycles_t ttm; stat->s_uv2_wars++; - spin_lock(&hmaster->uvhub_lock); - /* try for the original first */ - if (busy_one != normal) { - if (!normal_busy(bcp)) - selected = normal; - } - if (selected < 0) { - /* can't use the normal, select an alternate */ - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - descriptor_status = read_lmmr(mmr_offset); - - /* scan available descriptors 32-63 */ - for (i = 0; i < UV_CPUS_PER_AS; i++) { - if ((hmaster->inuse_map & (1 << i)) == 0) { - status = ((descriptor_status >> - (i * UV_ACT_STATUS_SIZE)) & - UV_ACT_STATUS_MASK) << 1; - if (status != 
UV2H_DESC_BUSY) { - selected = i + UV_CPUS_PER_AS; - break; - } - } - } - } - - if (busy_one != normal) - /* mark the busy alternate as not in-use */ - hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); - - if (selected >= 0) { - /* switch to the selected descriptor */ - if (selected != normal) { - /* set the selected alternate as in-use */ - hmaster->inuse_map |= - (1 << (selected - UV_CPUS_PER_AS)); - if (selected > stat->s_uv2_wars_hw) - stat->s_uv2_wars_hw = selected; - } - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * busy_one); - bcp->using_desc = selected; - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * selected); - *bau_desc_new = *bau_desc_old; - } else { - /* - * All are busy. Wait for the normal one for this cpu to - * free up. - */ - stat->s_uv2_war_waits++; - spin_unlock(&hmaster->uvhub_lock); - ttm = get_cycles(); - do { - cpu_relax(); - } while (normal_busy(bcp)); - spin_lock(&hmaster->uvhub_lock); - /* switch to the original descriptor */ - bcp->using_desc = normal; - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); - bcp->using_desc = (ITEMS_PER_DESC * normal); - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * normal); - *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ - } - spin_unlock(&hmaster->uvhub_lock); - return FLUSH_RETRY_BUSYBUG; + bcp->busy = 1; + return FLUSH_GIVEUP; } static int uv2_wait_completion(struct bau_desc *bau_desc, @@ -655,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, { unsigned long descriptor_stat; cycles_t ttm; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; long busy_reps = 0; struct ptc_stats *stat = bcp->statp; @@ -663,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { - /* - * Our software ack messages may be blocked because 
- * there are no swack resources available. As long - * as none of them has timed out hardware will NACK - * our message and its state will stay IDLE. - */ - if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || - (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { + if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) { + /* + * A h/w bug on the destination side may + * have prevented the message being marked + * pending, thus it doesn't get replied to + * and gets continually nacked until it times + * out with a SOURCE_TIMEOUT. + */ stat->s_stimeout++; return FLUSH_GIVEUP; - } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { - stat->s_strongnacks++; - bcp->conseccompletes = 0; - return FLUSH_GIVEUP; } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { + ttm = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + * Without using the extended status we have to + * deduce from the short time that this was a + * strong nack. 
+ */ + if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { + bcp->conseccompletes = 0; + stat->s_plugged++; + /* FLUSH_RETRY_PLUGGED causes hang on boot */ + return FLUSH_GIVEUP; + } stat->s_dtimeout++; bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; + /* FLUSH_RETRY_TIMEOUT causes hang on boot */ + return FLUSH_GIVEUP; } else { busy_reps++; if (busy_reps > 1000000) { @@ -688,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, busy_reps = 0; ttm = get_cycles(); if ((ttm - bcp->send_message) > - (bcp->clocks_per_100_usec)) { + bcp->timeout_interval) return handle_uv2_busy(bcp); - } } /* * descriptor_stat is still BUSY @@ -714,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; @@ -793,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc, } /* - * Completions are taking a very long time due to a congested numalink - * network. + * Stop all cpus on a uvhub from using the BAU for a period of time. + * This is reversed by check_enable. 
*/ -static void disable_for_congestion(struct bau_control *bcp, - struct ptc_stats *stat) +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat) { - /* let only one cpu do this disabling */ - spin_lock(&disable_lock); - - if (!baudisabled && bcp->period_requests && - ((bcp->period_time / bcp->period_requests) > congested_cycles)) { - int tcpu; - struct bau_control *tbcp; - /* it becomes this cpu's job to turn on the use of the - BAU again */ - baudisabled = 1; - bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles(); - bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); + int tcpu; + struct bau_control *tbcp; + struct bau_control *hmaster; + cycles_t tm1; + + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (!bcp->baudisabled) { stat->s_bau_disabled++; + tm1 = get_cycles(); for_each_present_cpu(tcpu) { tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; + if (tbcp->uvhub_master == hmaster) { + tbcp->baudisabled = 1; + tbcp->set_bau_on_time = + tm1 + bcp->disabled_period; + } } } - - spin_unlock(&disable_lock); + spin_unlock(&hmaster->disable_lock); } static void count_max_concurr(int stat, struct bau_control *bcp, @@ -850,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2, bcp->period_requests++; bcp->period_time += elapsed; if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->cong_reps)) - disable_for_congestion(bcp, stat); + (bcp->period_requests > bcp->cong_reps) && + ((bcp->period_time / bcp->period_requests) > + congested_cycles)) { + stat->s_congested++; + disable_for_period(bcp, stat); + } } } else stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) + else if (completion_status == FLUSH_GIVEUP) { stat->s_giveup++; + if (get_cycles() > bcp->period_end) + bcp->period_giveups = 0; + bcp->period_giveups++; + if (bcp->period_giveups == 1) + bcp->period_end = get_cycles() + 
bcp->disabled_period; + if (bcp->period_giveups > bcp->giveup_limit) { + disable_for_period(bcp, stat); + stat->s_giveuplimit++; + } + } } /* @@ -903,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, + struct bau_desc *bau_desc) { int seq_number = 0; int completion_stat = 0; @@ -916,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_bau_msg_header *uv2_hdr = NULL; - struct bau_desc *bau_desc; - if (bcp->uvhub_version == 1) + if (bcp->uvhub_version == 1) { + uv1 = 1; uv1_throttle(hmaster, stat); + } while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); + if (uv1) + uv1_hdr = &bau_desc->header.uv1_hdr; + else + uv2_hdr = &bau_desc->header.uv2_hdr; + do { - bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); - if (bcp->uvhub_version == 1) { - uv1 = 1; - uv1_hdr = &bau_desc->header.uv1_hdr; - } else - uv2_hdr = &bau_desc->header.uv2_hdr; - if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { + if (try == 0) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else @@ -951,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) uv1_hdr->sequence = seq_number; else uv2_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; + index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); - /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); if 
(bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; + stat->s_overipilimit++; completion_stat = FLUSH_GIVEUP; break; } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || - (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -990,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) } /* - * The BAU is disabled. When the disabled time period has expired, the cpu - * that disabled it must re-enable it. - * Return 0 if it is re-enabled for all cpus. + * The BAU is disabled for this uvhub. When the disabled time period has + * expired re-enable it. + * Return 0 if it is re-enabled for all cpus on this uvhub. */ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) { int tcpu; struct bau_control *tbcp; + struct bau_control *hmaster; - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { + stat->s_bau_reenabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + if (tbcp->uvhub_master == hmaster) { tbcp->baudisabled = 0; tbcp->period_requests = 0; tbcp->period_time = 0; + tbcp->period_giveups = 0; } - return 0; } + spin_unlock(&hmaster->disable_lock); + return 0; } + spin_unlock(&hmaster->disable_lock); return -1; } @@ -1113,6 +1065,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; + unsigned long descriptor_status; + unsigned long status; bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; @@ -1121,10 +1075,22 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, if (bcp->nobau) return cpumask; + if (bcp->busy) 
{ + descriptor_status = + read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0); + status = ((descriptor_status >> (bcp->uvhub_cpu * + UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1; + if (status == UV2H_DESC_BUSY) + return cpumask; + bcp->busy = 0; + } + /* bau was disabled due to slow response */ if (bcp->baudisabled) { - if (check_enable(bcp, stat)) + if (check_enable(bcp, stat)) { + stat->s_ipifordisabled++; return cpumask; + } } /* @@ -1140,7 +1106,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -1153,25 +1119,27 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc)) return NULL; else return cpumask; } /* - * Search the message queue for any 'other' message with the same software - * acknowledge resource bit vector. + * Search the message queue for any 'other' unprocessed message with the + * same software acknowledge resource bit vector as the 'msg' message. 
*/ struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp, unsigned char swack_vec) + struct bau_control *bcp) { struct bau_pq_entry *msg_next = msg + 1; + unsigned char swack_vec = msg->swack_vec; if (msg_next > bcp->queue_last) msg_next = bcp->queue_first; - while ((msg_next->swack_vec != 0) && (msg_next != msg)) { - if (msg_next->swack_vec == swack_vec) + while (msg_next != msg) { + if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) && + (msg_next->swack_vec == swack_vec)) return msg_next; msg_next++; if (msg_next > bcp->queue_last) @@ -1200,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) * This message was assigned a swack resource, but no * reserved acknowlegment is pending. * The bug has prevented this message from setting the MMR. - * And no other message has used the same sw_ack resource. - * Do the requested shootdown but do not reply to the msg. - * (the 0 means make no acknowledge) */ - bau_process_message(mdp, bcp, 0); - return; - } - - /* - * Some message has set the MMR 'pending' bit; it might have been - * another message. Look for that message. - */ - other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); - if (other_msg) { - /* There is another. Do not ack the current one. */ - bau_process_message(mdp, bcp, 0); /* - * Let the natural processing of that message acknowledge - * it. Don't get the processing of sw_ack's out of order. + * Some message has set the MMR 'pending' bit; it might have + * been another message. Look for that message. */ - return; + other_msg = find_another_by_swack(msg, bcp); + if (other_msg) { + /* + * There is another. Process this one but do not + * ack it. + */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that other message + * acknowledge it. Don't get the processing of sw_ack's + * out of order. 
+ */ + return; + } } /* - * There is no other message using this sw_ack, so it is safe to - * acknowledge it. + * Either the MMR shows this one pending a reply or there is no + * other message using this sw_ack, so it is safe to acknowledge it. */ bau_process_message(mdp, bcp, 1); @@ -1330,7 +1296,8 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image |= (1L << UV2_EXT_SHFT); + /* hw bug workaround; do not use extended status */ + mmr_image &= ~(1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); } @@ -1379,24 +1346,26 @@ static int ptc_seq_show(struct seq_file *file, void *data) cpu = *(loff_t *)data; if (!cpu) { seq_printf(file, - "# cpu bauoff sent stime self locals remotes ncpus localhub "); + "# cpu bauoff sent stime self locals remotes ncpus localhub "); seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); + seq_printf(file, + "rok resetp resett giveup sto bz throt disable "); seq_printf(file, - "resetp resett giveup sto bz throt enters swack recv rtime "); + "enable wars warshw warwaits enters ipidis plugged "); seq_printf(file, - "all one mult none retry canc nocan reset rcan "); + "ipiover glim cong swack recv rtime all one mult "); seq_printf(file, - "disable enable wars warshw warwaits\n"); + "none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; /* source side statistics */ seq_printf(file, - "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", cpu, bcp->nobau, stat->s_requestor, cycles_2_us(stat->s_time), stat->s_ntargself, stat->s_ntarglocals, @@ -1407,25 +1376,28 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_ntarguvhub8, stat->s_ntarguvhub4, 
stat->s_ntarguvhub2, stat->s_ntarguvhub1, stat->s_dtimeout, stat->s_strongnacks); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld ", + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles, stat->s_enters); + stat->s_busy, stat->s_throttles); + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits, stat->s_enters, + stat->s_ipifordisabled, stat->s_plugged, + stat->s_overipilimit, stat->s_giveuplimit, + stat->s_congested); /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), stat->d_requestee, cycles_2_us(stat->d_time), stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld %ld %ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled, - stat->s_uv2_wars, stat->s_uv2_wars_hw, - stat->s_uv2_war_waits); } return 0; } @@ -1439,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, char *buf; int ret; - buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_concur plugged_delay plugsb4reset", - "timeoutsb4reset ipi_reset_limit complete_threshold", - "congested_response_us congested_reps congested_period", + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n", + "max_concur plugged_delay plugsb4reset timeoutsb4reset", + "ipi_reset_limit complete_threshold congested_response_us", + "congested_reps disabled_period giveup_limit", max_concurr, plugged_delay, plugsb4reset, timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_respns_us, 
congested_reps, congested_period); + congested_respns_us, congested_reps, disabled_period, + giveup_limit); if (!buf) return -ENOMEM; @@ -1616,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user, bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; } return count; } @@ -1745,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) * fairness chaining multilevel count replied_to */ } else { + /* + * BIOS uses legacy mode, but UV2 hardware always + * uses native mode for selective broadcasts. + */ uv2_hdr = &bd2->header.uv2_hdr; uv2_hdr->swack_flag = 1; uv2_hdr->base_dest_nasid = @@ -1896,10 +1874,11 @@ static void __init init_per_cpu_tunables(void) bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; - bcp->clocks_per_100_usec = usec_2_cycles(100); + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; spin_lock_init(&bcp->queue_lock); spin_lock_init(&bcp->uvhub_lock); + spin_lock_init(&bcp->disable_lock); } } @@ -2020,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; - bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); @@ -2123,7 +2101,6 @@ static int __init uv_bau_init(void) } nuvhubs = uv_num_possible_blades(); - spin_lock_init(&disable_lock); congested_cycles = usec_2_cycles(congested_respns_us); uv_base_pnode = 0x7fffffff; -- cgit v1.2.3-70-g09d2 From ffaf9156320a077ebf9c5b9a5cf987689dc1a6b3 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna 
Date: Mon, 18 Jun 2012 14:06:58 +0300 Subject: crypto: ablk_helper - move ablk_* functions from serpent-sse2/avx glue code to shared module Move ablk-* functions to separate module to share common code between cipher implementations. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/ablk_helper.c | 150 ++++++++++++++++++++++++++++++ arch/x86/crypto/serpent_avx_glue.c | 115 ++--------------------- arch/x86/crypto/serpent_sse2_glue.c | 115 ++--------------------- arch/x86/include/asm/crypto/ablk_helper.h | 29 ++++++ crypto/Kconfig | 8 ++ 6 files changed, 201 insertions(+), 218 deletions(-) create mode 100644 arch/x86/crypto/ablk_helper.c create mode 100644 arch/x86/include/asm/crypto/ablk_helper.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 83caa4b948c..ad746916f91 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,8 @@ # Arch-specific CryptoAPI modules. # +obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o + obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c new file mode 100644 index 00000000000..284ca3bc238 --- /dev/null +++ b/arch/x86/crypto/ablk_helper.c @@ -0,0 +1,150 @@ +/* + * Shared async block cipher helpers + * + * Copyright (c) 2012 Jussi Kivilinna + * + * Based on aesni-intel_glue.c by: + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; +} +EXPORT_SYMBOL_GPL(ablk_set_key); + +int __ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->encrypt( + &desc, req->dst, req->src, req->nbytes); +} +EXPORT_SYMBOL_GPL(__ablk_encrypt); + +int ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return 
crypto_ablkcipher_encrypt(cryptd_req); + } else { + return __ablk_encrypt(req); + } +} +EXPORT_SYMBOL_GPL(ablk_encrypt); + +int ablk_decrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_decrypt(cryptd_req); + } else { + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->decrypt( + &desc, req->dst, req->src, req->nbytes); + } +} +EXPORT_SYMBOL_GPL(ablk_decrypt); + +void ablk_exit(struct crypto_tfm *tfm) +{ + struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ablkcipher(ctx->cryptd_tfm); +} +EXPORT_SYMBOL_GPL(ablk_exit); + +void ablk_init_common(struct crypto_tfm *tfm, + struct cryptd_ablkcipher *cryptd_tfm) +{ + struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); +} +EXPORT_SYMBOL_GPL(ablk_init_common); + +int ablk_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + char drv_name[CRYPTO_MAX_ALG_NAME]; + + snprintf(drv_name, sizeof(drv_name), "__driver-%s", + crypto_tfm_alg_driver_name(tfm)); + + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + ablk_init_common(tfm, cryptd_tfm); + + return 0; +} +EXPORT_SYMBOL_GPL(ablk_init); + +MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index dd81bab4f11..31eb567cc89 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ 
b/arch/x86/crypto/serpent_avx_glue.c @@ -40,14 +40,11 @@ #include #include #include +#include #include #include #include -struct async_serpent_ctx { - struct cryptd_ablkcipher *cryptd_tfm; -}; - static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) { if (fpu_enabled) @@ -593,106 +590,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) -{ - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; - int err; - - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); - return err; -} - -static int __ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - return __ablk_encrypt(req); - } -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = 
crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, req->dst, req->src, req->nbytes); - } -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ - struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init(struct crypto_tfm *tfm) -{ - struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - struct cryptd_ablkcipher *cryptd_tfm; - char drv_name[CRYPTO_MAX_ALG_NAME]; - - snprintf(drv_name, sizeof(drv_name), "__driver-%s", - crypto_tfm_alg_driver_name(tfm)); - - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); - - return 0; -} - static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ecb-serpent-avx", .cra_driver_name = "__driver-ecb-serpent-avx", @@ -805,7 +702,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 500, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -827,7 +724,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 500, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - 
.cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -850,7 +747,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 500, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -874,7 +771,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 500, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -899,7 +796,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 500, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index deecd25c129..805c91fda7a 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -43,14 +43,11 @@ #include #include #include +#include #include #include #include -struct async_serpent_ctx { - struct cryptd_ablkcipher *cryptd_tfm; -}; - static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) { if (fpu_enabled) @@ -596,106 +593,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) -{ - struct async_serpent_ctx *ctx = 
crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; - int err; - - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); - return err; -} - -static int __ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - return __ablk_encrypt(req); - } -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, 
req->dst, req->src, req->nbytes); - } -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ - struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init(struct crypto_tfm *tfm) -{ - struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - struct cryptd_ablkcipher *cryptd_tfm; - char drv_name[CRYPTO_MAX_ALG_NAME]; - - snprintf(drv_name, sizeof(drv_name), "__driver-%s", - crypto_tfm_alg_driver_name(tfm)); - - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); - - return 0; -} - static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ecb-serpent-sse2", .cra_driver_name = "__driver-ecb-serpent-sse2", @@ -808,7 +705,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -830,7 +727,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -853,7 +750,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -877,7 +774,7 
@@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -902,7 +799,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h new file mode 100644 index 00000000000..6d6b37c6f5d --- /dev/null +++ b/arch/x86/include/asm/crypto/ablk_helper.h @@ -0,0 +1,29 @@ +/* + * Shared async block cipher helpers + */ + +#ifndef _CRYPTO_ABLK_HELPER_H +#define _CRYPTO_ABLK_HELPER_H + +#include +#include +#include + +struct async_helper_ctx { + struct cryptd_ablkcipher *cryptd_tfm; +}; + +extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len); + +extern int __ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_decrypt(struct ablkcipher_request *req); + +extern void ablk_exit(struct crypto_tfm *tfm); + +extern int ablk_init(struct crypto_tfm *tfm); + +#endif /* _CRYPTO_ABLK_HELPER_H */ diff --git a/crypto/Kconfig b/crypto/Kconfig index 2c1c2dfcc02..8e9145c07d8 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -174,6 +174,11 @@ config CRYPTO_TEST help Quick & dirty crypto test module. 
+config CRYPTO_ABLK_HELPER_X86 + tristate + depends on X86 + select CRYPTO_CRYPTD + comment "Authenticated Encryption with Associated Data" config CRYPTO_CCM @@ -786,6 +791,7 @@ config CRYPTO_SERPENT_SSE2_X86_64 depends on X86 && 64BIT select CRYPTO_ALGAPI select CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 select CRYPTO_SERPENT select CRYPTO_LRW select CRYPTO_XTS @@ -806,6 +812,7 @@ config CRYPTO_SERPENT_SSE2_586 depends on X86 && !64BIT select CRYPTO_ALGAPI select CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 select CRYPTO_SERPENT select CRYPTO_LRW select CRYPTO_XTS @@ -826,6 +833,7 @@ config CRYPTO_SERPENT_AVX_X86_64 depends on X86 && 64BIT select CRYPTO_ALGAPI select CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 select CRYPTO_SERPENT select CRYPTO_LRW select CRYPTO_XTS -- cgit v1.2.3-70-g09d2 From 30a04008827b58c4aafbd1d6a27d6b6ed239e993 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:03 +0300 Subject: crypto: twofish-avx - change to use shared ablk_* functions Remove duplicate ablk_* functions and make use of ablk_helper module instead. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_avx_glue.c | 116 ++----------------------------------- crypto/Kconfig | 1 + 2 files changed, 7 insertions(+), 110 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 599f19e4bef..cabe058eba1 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -95,11 +96,6 @@ static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, } - -struct async_twofish_ctx { - struct cryptd_ablkcipher *cryptd_tfm; -}; - static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) { if (fpu_enabled) @@ -730,106 +726,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) -{ - struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; - int err; - - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); - return err; -} - -static int __ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = 
crypto_ablkcipher_reqtfm(req); - struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - return __ablk_encrypt(req); - } -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_twofish_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, req->dst, req->src, req->nbytes); - } -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ - struct async_twofish_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init(struct crypto_tfm *tfm) -{ - struct async_twofish_ctx *ctx = crypto_tfm_ctx(tfm); - struct cryptd_ablkcipher *cryptd_tfm; - char drv_name[CRYPTO_MAX_ALG_NAME]; - - snprintf(drv_name, sizeof(drv_name), "__driver-%s", - crypto_tfm_alg_driver_name(tfm)); - - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); - - return 0; -} - static struct crypto_alg twofish_algs[10] = { { .cra_name = "__ecb-twofish-avx", .cra_driver_name = "__driver-ecb-twofish-avx", @@ -942,7 +838,7 @@ static struct crypto_alg twofish_algs[10] 
= { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -964,7 +860,7 @@ static struct crypto_alg twofish_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -987,7 +883,7 @@ static struct crypto_alg twofish_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1011,7 +907,7 @@ static struct crypto_alg twofish_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1036,7 +932,7 @@ static struct crypto_alg twofish_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_twofish_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, diff --git a/crypto/Kconfig b/crypto/Kconfig index 8e9145c07d8..24b929e30bf 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -946,6 +946,7 @@ config CRYPTO_TWOFISH_AVX_X86_64 depends on X86 && 64BIT select CRYPTO_ALGAPI select 
CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 select CRYPTO_TWOFISH_X86_64_3WAY -- cgit v1.2.3-70-g09d2 From a9629d7142ea22567eaa999232d8a31a7493665a Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:08 +0300 Subject: crypto: aes_ni - change to use shared ablk_* functions Remove duplicate ablk_* functions and make use of ablk_helper module instead. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/ablk_helper.c | 19 +++--- arch/x86/crypto/aesni-intel_glue.c | 100 +++--------------------------- arch/x86/include/asm/crypto/ablk_helper.h | 2 + crypto/Kconfig | 1 + 4 files changed, 20 insertions(+), 102 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c index 284ca3bc238..43282fe04a8 100644 --- a/arch/x86/crypto/ablk_helper.c +++ b/arch/x86/crypto/ablk_helper.c @@ -118,32 +118,31 @@ void ablk_exit(struct crypto_tfm *tfm) } EXPORT_SYMBOL_GPL(ablk_exit); -void ablk_init_common(struct crypto_tfm *tfm, - struct cryptd_ablkcipher *cryptd_tfm) +int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) { struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + crypto_ablkcipher_reqsize(&cryptd_tfm->base); + + return 0; } EXPORT_SYMBOL_GPL(ablk_init_common); int ablk_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; char drv_name[CRYPTO_MAX_ALG_NAME]; snprintf(drv_name, sizeof(drv_name), "__driver-%s", crypto_tfm_alg_driver_name(tfm)); - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ablk_init_common(tfm, cryptd_tfm); - - return 0; + return ablk_init_common(tfm, drv_name); 
} EXPORT_SYMBOL_GPL(ablk_init); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ac7f5cd019e..7c9d54d8dc4 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -52,10 +53,6 @@ #define HAS_XTS #endif -struct async_aes_ctx { - struct cryptd_ablkcipher *cryptd_tfm; -}; - /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. @@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc, } #endif -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) -{ - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; - int err; - - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); - return err; -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); - } -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = 
crypto_ablkcipher_reqtfm(req); - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, req->dst, req->src, req->nbytes); - } -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ - struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) -{ - struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); - - return 0; -} - static int ablk_ecb_init(struct crypto_tfm *tfm) { return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); @@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module 
= THIS_MODULE, @@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h index 6d6b37c6f5d..4f93df50c23 100644 --- a/arch/x86/include/asm/crypto/ablk_helper.h +++ 
b/arch/x86/include/asm/crypto/ablk_helper.h @@ -24,6 +24,8 @@ extern int ablk_decrypt(struct ablkcipher_request *req); extern void ablk_exit(struct crypto_tfm *tfm); +extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name); + extern int ablk_init(struct crypto_tfm *tfm); #endif /* _CRYPTO_ABLK_HELPER_H */ diff --git a/crypto/Kconfig b/crypto/Kconfig index 24b929e30bf..471cc468e18 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -557,6 +557,7 @@ config CRYPTO_AES_NI_INTEL select CRYPTO_AES_X86_64 if 64BIT select CRYPTO_AES_586 if !64BIT select CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 select CRYPTO_ALGAPI help Use Intel AES-NI instructions for AES algorithm. -- cgit v1.2.3-70-g09d2 From e81792fbc2a6fa4969f724b959829667fb2d4f01 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:14 +0300 Subject: crypto: serpent-sse2 - prepare serpent-sse2 glue code into generic x86 glue code for 128bit block ciphers Block cipher implementations in arch/x86/crypto/ contain common glue code that is currently duplicated in each module (camellia-x86_64, twofish-x86_64-3way, twofish-avx, serpent-sse2 and serpent-avx). This patch prepares serpent-sse2 glue into generic glue code for all 128bit block ciphers to use in arch/x86/crypto. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 466 +++++++++++++++++++++++------------- 1 file changed, 303 insertions(+), 163 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 805c91fda7a..8c86239010a 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -48,105 +48,129 @@ #include #include -static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); +typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); +typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, + u128 *iv); + +#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) +#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) +#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) + +struct common_glue_func_entry { + unsigned int num_blocks; /* number of blocks that @fn will process */ + union { + common_glue_func_t ecb; + common_glue_cbc_func_t cbc; + common_glue_ctr_func_t ctr; + } fn_u; +}; + +struct common_glue_ctx { + unsigned int num_funcs; + int fpu_blocks_limit; /* -1 means fpu not needed at all */ + + /* + * First funcs entry must have largest num_blocks and last funcs entry + * must have num_blocks == 1! + */ + struct common_glue_func_entry funcs[]; +}; + +static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, + struct blkcipher_desc *desc, + bool fpu_enabled, unsigned int nbytes) { + if (likely(fpu_blocks_limit < 0)) + return false; + if (fpu_enabled) return true; - /* SSE2 is only used when chunk to be processed is large enough, so - * do not enable FPU until it is necessary. + /* + * Vector-registers are only used when chunk to be processed is large + * enough, so do not enable FPU until it is necessary. 
*/ - if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS) + if (nbytes < bsize * (unsigned int)fpu_blocks_limit) return false; + if (desc) { + /* prevent sleeping if FPU is in use */ + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + } + kernel_fpu_begin(); return true; } -static inline void serpent_fpu_end(bool fpu_enabled) +static inline void glue_fpu_end(bool fpu_enabled) { if (fpu_enabled) kernel_fpu_end(); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) +static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) { + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes, i, func_bytes; bool fpu_enabled = false; - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes; int err; err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk->nbytes)) { u8 *wsrc = walk->src.virt.addr; u8 *wdst = walk->dst.virt.addr; - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - if (enc) - serpent_enc_blk_xway(ctx, wdst, wsrc); - else - serpent_dec_blk_xway(ctx, wdst, wsrc); + for (i = 0; i < gctx->num_funcs; i++) { + func_bytes = bsize * gctx->funcs[i].num_blocks; - wsrc += bsize * SERPENT_PARALLEL_BLOCKS; - wdst += bsize * SERPENT_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ecb(ctx, wdst, + wsrc); - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (enc) - __serpent_encrypt(ctx, wdst, wsrc); - else - 
__serpent_decrypt(ctx, wdst, wsrc); + wsrc += func_bytes; + wdst += func_bytes; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); + if (nbytes < bsize) + goto done; + } + } done: err = blkcipher_walk_done(desc, walk, nbytes); } - serpent_fpu_end(fpu_enabled); + glue_fpu_end(fpu_enabled); return err; } -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); + return __glue_ecb_crypt_128bit(gctx, desc, &walk); } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); -} - -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; unsigned int nbytes = walk->nbytes; u128 *src = (u128 *)walk->src.virt.addr; u128 *dst = (u128 *)walk->dst.virt.addr; @@ -154,7 +178,7 @@ static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, do { u128_xor(dst, src, iv); - __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst); + fn(ctx, (u8 *)dst, (u8 *)dst); iv = dst; src += 1; @@ -166,8 +190,10 @@ static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, return nbytes; } -static int 
cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +int glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { struct blkcipher_walk walk; int err; @@ -176,24 +202,26 @@ static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, err = blkcipher_walk_virt(desc, &walk); while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); + nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk); err = blkcipher_walk_done(desc, &walk, nbytes); } return err; } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int +__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; unsigned int nbytes = walk->nbytes; u128 *src = (u128 *)walk->src.virt.addr; u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; u128 last_iv; - int i; + unsigned int num_blocks, func_bytes; + unsigned int i; /* Start of the last block. 
*/ src += nbytes / bsize - 1; @@ -201,45 +229,31 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, last_iv = *src; - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1); - src -= SERPENT_PARALLEL_BLOCKS - 1; - dst -= SERPENT_PARALLEL_BLOCKS - 1; + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; - for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + nbytes -= func_bytes - bsize; + src -= num_blocks - 1; + dst -= num_blocks - 1; + + gctx->funcs[i].fn_u.cbc(ctx, dst, src); - serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + nbytes -= bsize; + if (nbytes < bsize) + goto done; - for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) - u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= func_bytes); - nbytes -= bsize; if (nbytes < bsize) goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; + } } done: @@ -249,24 +263,27 @@ done: return nbytes; } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { + const unsigned int bsize = 128 / 8; bool fpu_enabled = false; struct blkcipher_walk walk; int err; blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); - desc->flags &= 
~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk.nbytes)) { - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); err = blkcipher_walk_done(desc, &walk, nbytes); } - serpent_fpu_end(fpu_enabled); + glue_fpu_end(fpu_enabled); return err; } @@ -289,109 +306,232 @@ static inline void u128_inc(u128 *i) i->a++; } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[SERPENT_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; + void *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *src = (u8 *)walk->src.virt.addr; + u8 *dst = (u8 *)walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; + u128 ctrblk; + u128 tmp; - __serpent_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + be128_to_u128(&ctrblk, (be128 *)walk->iv); - crypto_inc(ctrblk, SERPENT_BLOCK_SIZE); + memcpy(&tmp, src, nbytes); + fn_ctr(ctx, &tmp, &tmp, &ctrblk); + memcpy(dst, &tmp, nbytes); + + u128_to_be128((be128 *)walk->iv, &ctrblk); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; + const unsigned int bsize = 128 / 8; + void *ctx = crypto_blkcipher_ctx(desc->tfm); unsigned int nbytes = walk->nbytes; u128 *src = (u128 *)walk->src.virt.addr; u128 *dst = (u128 
*)walk->dst.virt.addr; u128 ctrblk; - be128 ctrblocks[SERPENT_PARALLEL_BLOCKS]; - int i; + unsigned int num_blocks, func_bytes; + unsigned int i; be128_to_u128(&ctrblk, (be128 *)walk->iv); /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - u128_to_be128(&ctrblocks[i], &ctrblk); - u128_inc(&ctrblk); - } + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; - serpent_enc_blk_xway_xor(ctx, (u8 *)dst, - (u8 *)ctrblocks); + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); - src += SERPENT_PARALLEL_BLOCKS; - dst += SERPENT_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); - if (nbytes < bsize) - goto done; + if (nbytes < bsize) + goto done; + } } - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - u128_xor(dst, dst, (u128 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - done: u128_to_be128((be128 *)walk->iv, &ctrblk); return nbytes; } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { + const unsigned int bsize = 128 / 8; bool fpu_enabled = false; struct blkcipher_walk walk; int err; blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE); - desc->flags &= 
~CRYPTO_TFM_REQ_MAY_SLEEP; + err = blkcipher_walk_virt_block(desc, &walk, bsize); - while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) { - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); + while ((nbytes = walk.nbytes) >= bsize) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); err = blkcipher_walk_done(desc, &walk, nbytes); } - serpent_fpu_end(fpu_enabled); + glue_fpu_end(fpu_enabled); if (walk.nbytes) { - ctr_crypt_final(desc, &walk); + glue_ctr_crypt_final_128bit( + gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); err = blkcipher_walk_done(desc, &walk, 0); } return err; } +static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) +{ + u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + unsigned int j; + + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; + + serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +} + +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +{ + be128 ctrblk; + + u128_to_be128(&ctrblk, iv); + u128_inc(iv); + + __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, src, (u128 *)&ctrblk); +} + +static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) +{ + be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; + unsigned int i; + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); + } + + serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); +} + +static const struct common_glue_ctx serpent_enc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = 
GLUE_FUNC_CAST(__serpent_encrypt) } + } } +}; + +static const struct common_glue_ctx serpent_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx serpent_dec = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + } } +}; + +static const struct common_glue_ctx serpent_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct 
scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); +} + +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, + NULL, fpu_enabled, nbytes); +} + +static inline void serpent_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + struct crypt_priv { struct serpent_ctx *ctx; bool fpu_enabled; -- cgit v1.2.3-70-g09d2 From 596d875052dac6bf084f0c3a3e946fb4709b727b Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:19 +0300 Subject: crypto: serpent-sse2 - split generic glue code to new helper module Now that serpent-sse2 glue code has been made generic, it can be split to separate module. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 1 + arch/x86/crypto/glue_helper.c | 307 ++++++++++++++++++++++++++ arch/x86/crypto/serpent_sse2_glue.c | 352 +----------------------------- arch/x86/include/asm/crypto/glue_helper.h | 115 ++++++++++ crypto/Kconfig | 7 + 5 files changed, 431 insertions(+), 351 deletions(-) create mode 100644 arch/x86/crypto/glue_helper.c create mode 100644 arch/x86/include/asm/crypto/glue_helper.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index ad746916f91..e908e5de82d 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o +obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c new file mode 100644 index 00000000000..4854f0f31e4 --- /dev/null +++ b/arch/x86/crypto/glue_helper.c @@ -0,0 +1,307 @@ +/* + * Shared glue code for 128bit block ciphers + * + * Copyright (c) 2012 Jussi Kivilinna + * + * CBC & ECB parts based on code 
(crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include + +static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes, i, func_bytes; + bool fpu_enabled = false; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + + for (i = 0; i < gctx->num_funcs; i++) { + func_bytes = bsize * gctx->funcs[i].num_blocks; + + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ecb(ctx, wdst, + wsrc); + + wsrc += func_bytes; + wdst += func_bytes; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + return err; +} + +int 
glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return __glue_ecb_crypt_128bit(gctx, desc, &walk); +} +EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit); + +static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + fn(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +int glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit); + +static unsigned int +__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 last_iv; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Start of the last block. 
*/ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + nbytes -= func_bytes - bsize; + src -= num_blocks - 1; + dst -= num_blocks - 1; + + gctx->funcs[i].fn_u.cbc(ctx, dst, src); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); + +static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *src = (u8 *)walk->src.virt.addr; + u8 *dst = (u8 *)walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + u128 ctrblk; + u128 tmp; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + memcpy(&tmp, src, nbytes); + fn_ctr(ctx, &tmp, &tmp, &ctrblk); + memcpy(dst, &tmp, nbytes); + + u128_to_be128((be128 *)walk->iv, &ctrblk); +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); + +static unsigned 
int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + void *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + unsigned int num_blocks, func_bytes; + unsigned int i; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, bsize); + + while ((nbytes = walk.nbytes) >= bsize) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + + if (walk.nbytes) { + glue_ctr_crypt_final_128bit( + gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); + +MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 8c86239010a..49a32eedf0c 100644 --- 
a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -41,359 +41,9 @@ #include #include #include -#include #include #include -#include -#include -#include - -typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); -typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); -typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, - u128 *iv); - -#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) -#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) -#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) - -struct common_glue_func_entry { - unsigned int num_blocks; /* number of blocks that @fn will process */ - union { - common_glue_func_t ecb; - common_glue_cbc_func_t cbc; - common_glue_ctr_func_t ctr; - } fn_u; -}; - -struct common_glue_ctx { - unsigned int num_funcs; - int fpu_blocks_limit; /* -1 means fpu not needed at all */ - - /* - * First funcs entry must have largest num_blocks and last funcs entry - * must have num_blocks == 1! - */ - struct common_glue_func_entry funcs[]; -}; - -static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, - struct blkcipher_desc *desc, - bool fpu_enabled, unsigned int nbytes) -{ - if (likely(fpu_blocks_limit < 0)) - return false; - - if (fpu_enabled) - return true; - - /* - * Vector-registers are only used when chunk to be processed is large - * enough, so do not enable FPU until it is necessary. 
- */ - if (nbytes < bsize * (unsigned int)fpu_blocks_limit) - return false; - - if (desc) { - /* prevent sleeping if FPU is in use */ - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - } - - kernel_fpu_begin(); - return true; -} - -static inline void glue_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); -} - -static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - void *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = 128 / 8; - unsigned int nbytes, i, func_bytes; - bool fpu_enabled = false; - int err; - - err = blkcipher_walk_virt(desc, walk); - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); - - for (i = 0; i < gctx->num_funcs; i++) { - func_bytes = bsize * gctx->funcs[i].num_blocks; - - /* Process multi-block batch */ - if (nbytes >= func_bytes) { - do { - gctx->funcs[i].fn_u.ecb(ctx, wdst, - wsrc); - - wsrc += func_bytes; - wdst += func_bytes; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; - } - } - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} - -int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return __glue_ecb_crypt_128bit(gctx, desc, &walk); -} - -static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - void *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - 
u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - fn(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; -} - -int glue_cbc_encrypt_128bit(const common_glue_func_t fn, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - return err; -} - -static unsigned int -__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - void *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 last_iv; - unsigned int num_blocks, func_bytes; - unsigned int i; - - /* Start of the last block. 
*/ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - /* Process multi-block batch */ - if (nbytes >= func_bytes) { - do { - nbytes -= func_bytes - bsize; - src -= num_blocks - 1; - dst -= num_blocks - 1; - - gctx->funcs[i].fn_u.cbc(ctx, dst, src); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; - } - } - -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; - - return nbytes; -} - -int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); - nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} - -static inline void u128_to_be128(be128 *dst, const u128 *src) -{ - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); -} - -static inline void be128_to_u128(u128 *dst, const be128 *src) -{ - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); -} - -static inline void u128_inc(u128 *i) -{ - i->b++; - if (!i->b) - i->a++; -} - -static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - void *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *src = (u8 *)walk->src.virt.addr; - u8 *dst = (u8 *)walk->dst.virt.addr; - unsigned int nbytes 
= walk->nbytes; - u128 ctrblk; - u128 tmp; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - memcpy(&tmp, src, nbytes); - fn_ctr(ctx, &tmp, &tmp, &ctrblk); - memcpy(dst, &tmp, nbytes); - - u128_to_be128((be128 *)walk->iv, &ctrblk); -} - -static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - const unsigned int bsize = 128 / 8; - void *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - unsigned int num_blocks, func_bytes; - unsigned int i; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process multi-block batch */ - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes >= func_bytes) { - do { - gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); - - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; - } - } - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; -} - -int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, bsize); - - while ((nbytes = walk.nbytes) >= bsize) { - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); - nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - - if (walk.nbytes) { - glue_ctr_crypt_final_128bit( - gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - 
} - - return err; -} +#include static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) { diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h new file mode 100644 index 00000000000..3e408bddc96 --- /dev/null +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -0,0 +1,115 @@ +/* + * Shared glue code for 128bit block ciphers + */ + +#ifndef _CRYPTO_GLUE_HELPER_H +#define _CRYPTO_GLUE_HELPER_H + +#include +#include +#include +#include + +typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); +typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); +typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, + u128 *iv); + +#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) +#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) +#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) + +struct common_glue_func_entry { + unsigned int num_blocks; /* number of blocks that @fn will process */ + union { + common_glue_func_t ecb; + common_glue_cbc_func_t cbc; + common_glue_ctr_func_t ctr; + } fn_u; +}; + +struct common_glue_ctx { + unsigned int num_funcs; + int fpu_blocks_limit; /* -1 means fpu not needed at all */ + + /* + * First funcs entry must have largest num_blocks and last funcs entry + * must have num_blocks == 1! + */ + struct common_glue_func_entry funcs[]; +}; + +static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, + struct blkcipher_desc *desc, + bool fpu_enabled, unsigned int nbytes) +{ + if (likely(fpu_blocks_limit < 0)) + return false; + + if (fpu_enabled) + return true; + + /* + * Vector-registers are only used when chunk to be processed is large + * enough, so do not enable FPU until it is necessary. 
+ */ + if (nbytes < bsize * (unsigned int)fpu_blocks_limit) + return false; + + if (desc) { + /* prevent sleeping if FPU is in use */ + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + } + + kernel_fpu_begin(); + return true; +} + +static inline void glue_fpu_end(bool fpu_enabled) +{ + if (fpu_enabled) + kernel_fpu_end(); +} + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes); + +extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, + unsigned int nbytes); + +extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, + unsigned int nbytes); + +extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes); + +#endif /* _CRYPTO_GLUE_HELPER_H */ diff --git a/crypto/Kconfig b/crypto/Kconfig index 471cc468e18..92b46970c85 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -179,6 +179,11 @@ config CRYPTO_ABLK_HELPER_X86 depends on X86 select CRYPTO_CRYPTD +config CRYPTO_GLUE_HELPER_X86 + tristate + depends on X86 + select CRYPTO_ALGAPI + comment "Authenticated Encryption with Associated Data" config CRYPTO_CCM @@ -793,6 +798,7 @@ config CRYPTO_SERPENT_SSE2_X86_64 select CRYPTO_ALGAPI select CRYPTO_CRYPTD select CRYPTO_ABLK_HELPER_X86 + select CRYPTO_GLUE_HELPER_X86 select CRYPTO_SERPENT select CRYPTO_LRW 
select CRYPTO_XTS @@ -814,6 +820,7 @@ config CRYPTO_SERPENT_SSE2_586 select CRYPTO_ALGAPI select CRYPTO_CRYPTD select CRYPTO_ABLK_HELPER_X86 + select CRYPTO_GLUE_HELPER_X86 select CRYPTO_SERPENT select CRYPTO_LRW select CRYPTO_XTS -- cgit v1.2.3-70-g09d2 From 1d0debbd4671a8d302a11837a126d5f87db16bdc Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:24 +0300 Subject: crypto: serpent-avx: remove duplicated glue code and use shared glue code from glue_helper Now that shared glue code is available, convert serpent-avx to use it. Cc: Johannes Goetzfried Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_avx_glue.c | 398 +++++++++---------------------------- crypto/Kconfig | 1 + 2 files changed, 95 insertions(+), 304 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 31eb567cc89..c61b91aa42a 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -36,357 +36,147 @@ #include #include #include -#include #include #include #include #include -#include -#include -#include +#include -static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) { - if (fpu_enabled) - return true; - - /* AVX is only used when chunk to be processed is large enough, so - * do not enable FPU until it is necessary. 
- */ - if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS) - return false; - - kernel_fpu_begin(); - return true; -} - -static inline void serpent_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); -} - -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) -{ - bool fpu_enabled = false; - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes; - int err; + u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + unsigned int j; - err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - if (enc) - serpent_enc_blk_xway(ctx, wdst, wsrc); - else - serpent_dec_blk_xway(ctx, wdst, wsrc); - - wsrc += bsize * SERPENT_PARALLEL_BLOCKS; - wdst += bsize * SERPENT_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (enc) - __serpent_encrypt(ctx, wdst, wsrc); - else - __serpent_decrypt(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } + serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - serpent_fpu_end(fpu_enabled); - return err; + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); } -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) { - struct 
blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); -} + be128 ctrblk; -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; + u128_to_be128(&ctrblk, iv); + u128_inc(iv); - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); + __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, src, (u128 *)&ctrblk); } -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; -} + be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; + unsigned int i; -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); } - return err; + serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct serpent_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; - u128 last_iv; - int i; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1); - src -= SERPENT_PARALLEL_BLOCKS - 1; - dst -= SERPENT_PARALLEL_BLOCKS - 1; - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; - - serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) - u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; +static const struct common_glue_ctx serpent_enc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + } } +}; - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } +static const struct common_glue_ctx serpent_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } + } } +}; -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; +static const 
struct common_glue_ctx serpent_dec = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + } } +}; - return nbytes; -} +static const struct common_glue_ctx serpent_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + } } +}; -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes)) { - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - serpent_fpu_end(fpu_enabled); - return err; + return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); } -static inline void u128_to_be128(be128 *dst, const u128 *src) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); + return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); } -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); + return 
glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, + dst, src, nbytes); } -static inline void u128_inc(u128 *i) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - i->b++; - if (!i->b) - i->a++; + return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, + nbytes); } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[SERPENT_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - __serpent_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - crypto_inc(ctrblk, SERPENT_BLOCK_SIZE); + return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - be128 ctrblocks[SERPENT_PARALLEL_BLOCKS]; - int i; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - u128_to_be128(&ctrblocks[i], &ctrblk); - u128_inc(&ctrblk); - } - - serpent_enc_blk_xway_xor(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += SERPENT_PARALLEL_BLOCKS; - dst += SERPENT_PARALLEL_BLOCKS; - 
nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - u128_xor(dst, dst, (u128 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; + return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, + NULL, fpu_enabled, nbytes); } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static inline void serpent_fpu_end(bool fpu_enabled) { - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) { - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - serpent_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; + glue_fpu_end(fpu_enabled); } struct crypt_priv { diff --git a/crypto/Kconfig b/crypto/Kconfig index 92b46970c85..a86c2fb92ea 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -842,6 +842,7 @@ config CRYPTO_SERPENT_AVX_X86_64 select CRYPTO_ALGAPI select CRYPTO_CRYPTD select CRYPTO_ABLK_HELPER_X86 + select CRYPTO_GLUE_HELPER_X86 select CRYPTO_SERPENT select CRYPTO_LRW select CRYPTO_XTS -- cgit v1.2.3-70-g09d2 From 964263afdcbf9d1e85c021acfff0cc68dd168475 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:29 +0300 Subject: crypto: camellia-x86_64 - remove duplicated glue code 
and use shared glue code from glue_helper Now that shared glue code is available, convert camellia-x86_64 to use it. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia_glue.c | 355 ++++++++++------------------------------ crypto/Kconfig | 1 + 2 files changed, 87 insertions(+), 269 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 3306dc0b139..eeb2b3b743e 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -5,10 +5,6 @@ * * Camellia parts based on code by: * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,9 +30,9 @@ #include #include #include -#include #include #include +#include #define CAMELLIA_MIN_KEY_SIZE 16 #define CAMELLIA_MAX_KEY_SIZE 32 @@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, &tfm->crt_flags); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - void (*fn)(struct camellia_ctx *, u8 *, const u8 *), - void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *)) +static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - /* Process two block batch */ - if (nbytes >= bsize * 2) { - do { - fn_2way(ctx, wdst, wsrc); - - wsrc += bsize * 2; - wdst += bsize * 2; - nbytes -= bsize * 2; - } 
while (nbytes >= bsize * 2); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } - - return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way); -} + u128 iv = *src; -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way); -} + camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; + u128_xor(&dst[1], &dst[1], &iv); } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) { - struct blkcipher_walk walk; - int err; + be128 ctrblk; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + if (dst != src) + *dst = *src; - while ((nbytes = walk.nbytes)) { - 
nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } + u128_to_be128(&ctrblk, iv); + u128_inc(iv); - return err; + camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, + u128 *iv) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[2 - 1]; - u128 last_iv; + be128 ctrblks[2]; - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process two block batch */ - if (nbytes >= bsize * 2) { - do { - nbytes -= bsize * (2 - 1); - src -= 2 - 1; - dst -= 2 - 1; - - ivs[0] = src[0]; - - camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); - - u128_xor(dst + 1, dst + 1, ivs + 0); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * 2); - - if (nbytes < bsize) - goto done; + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; } - /* Handle leftovers */ - for (;;) { - camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; + u128_to_be128(&ctrblks[0], iv); + u128_inc(iv); + u128_to_be128(&ctrblks[1], iv); + u128_inc(iv); - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } - -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; - - return nbytes; + camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); 
+static const struct common_glue_ctx camellia_enc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + } } +}; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } +static const struct common_glue_ctx camellia_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + } } +}; - return err; -} +static const struct common_glue_ctx camellia_dec = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + } } +}; -static inline void u128_to_be128(be128 *dst, const u128 *src) -{ - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); -} +static const struct common_glue_ctx camellia_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + } } +}; -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); + return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); } -static inline void u128_inc(u128 *i) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - i->b++; - if (!i->b) - i->a++; + return 
glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 keystream[CAMELLIA_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - u128 ctrblk; - - memcpy(keystream, src, nbytes); - camellia_enc_blk_xor(ctx, keystream, walk->iv); - memcpy(dst, keystream, nbytes); - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - u128_inc(&ctrblk); - u128_to_be128((be128 *)walk->iv, &ctrblk); + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, + dst, src, nbytes); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - be128 ctrblocks[2]; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process two block batch */ - if (nbytes >= bsize * 2) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - } - - /* create ctrblks for parallel encrypt */ - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[1], &ctrblk); - u128_inc(&ctrblk); - - camellia_enc_blk_xor_2way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 2; - dst += 2; - nbytes -= bsize * 2; - } while (nbytes >= bsize * 2); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - 
camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; + return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, + nbytes); } static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE); - - while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) { - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; + return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); } static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) diff --git a/crypto/Kconfig b/crypto/Kconfig index a86c2fb92ea..72828fafe09 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -671,6 +671,7 @@ config CRYPTO_CAMELLIA_X86_64 depends on X86 && 64BIT depends on CRYPTO select CRYPTO_ALGAPI + select CRYPTO_GLUE_HELPER_X86 select CRYPTO_LRW select CRYPTO_XTS help -- cgit v1.2.3-70-g09d2 From 414cb5e7cc6e258fe36e2c3cc3ef1ff2e246c0e3 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:34 +0300 Subject: crypto: twofish-x86_64-3way - remove duplicated glue code and use shared glue code from glue_helper Now that shared glue code is available, convert twofish-x86_64-3way to use it. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 365 +++++++++--------------------------- crypto/Kconfig | 1 + 2 files changed, 94 insertions(+), 272 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 77e4e55a266..25bf5e9b006 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -3,11 +3,6 @@ * * Copyright (c) 2011 Jussi Kivilinna * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -33,6 +28,7 @@ #include #include #include +#include #include #include @@ -62,311 +58,136 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, __twofish_enc_blk_3way(ctx, dst, src, true); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - void (*fn)(struct twofish_ctx *, u8 *, const u8 *), - void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *)) +static void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) { - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - fn_3way(ctx, wdst, wsrc); + u128 ivs[2]; - wsrc += bsize * 3; - wdst += bsize * 3; - nbytes -= bsize * 3; - } while (nbytes >= bsize * 3); + ivs[0] = src[0]; + ivs[1] = src[1]; - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do 
{ - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } + twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); - return err; + u128_xor(&dst[1], &dst[1], &ivs[0]); + u128_xor(&dst[2], &dst[2], &ivs[1]); } -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) { - struct blkcipher_walk walk; + be128 ctrblk; - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); -} + if (dst != src) + *dst = *src; -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; + u128_to_be128(&ctrblk, iv); + u128_inc(iv); - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); + twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, dst, (u128 *)&ctrblk); } -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, + u128 *iv) { - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; -} + be128 ctrblks[3]; -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct 
blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[3 - 1]; - u128 last_iv; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - nbytes -= bsize * (3 - 1); - src -= 3 - 1; - dst -= 3 - 1; - - ivs[0] = src[0]; - ivs[1] = src[1]; - - twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); - - u128_xor(dst + 1, dst + 1, ivs + 0); - u128_xor(dst + 2, dst + 2, ivs + 1); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; } -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; + u128_to_be128(&ctrblks[0], iv); + u128_inc(iv); + u128_to_be128(&ctrblks[1], iv); + u128_inc(iv); + u128_to_be128(&ctrblks[2], iv); + u128_inc(iv); - return nbytes; + twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - 
blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); +static const struct common_glue_ctx twofish_enc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + } } +}; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } +static const struct common_glue_ctx twofish_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) } + } } +}; - return err; -} +static const struct common_glue_ctx twofish_dec = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + } } +}; -static inline void u128_to_be128(be128 *dst, const u128 *src) -{ - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); -} +static const struct common_glue_ctx twofish_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + } } +}; -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); + return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); } -static inline void u128_inc(u128 *i) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int 
nbytes) { - i->b++; - if (!i->b) - i->a++; + return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[TF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - twofish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - crypto_inc(ctrblk, TF_BLOCK_SIZE); + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, + dst, src, nbytes); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - be128 ctrblocks[3]; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - } - - /* create ctrblks for parallel encrypt */ - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[1], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[2], &ctrblk); - u128_inc(&ctrblk); - - twofish_enc_blk_xor_3way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 3; - dst += 3; - nbytes -= bsize * 3; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - 
u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - u128_xor(dst, dst, (u128 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; + return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, + nbytes); } static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); - - while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; + return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); } static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) diff --git a/crypto/Kconfig b/crypto/Kconfig index 72828fafe09..fc559caaa9a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -935,6 +935,7 @@ config CRYPTO_TWOFISH_X86_64_3WAY select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 + select CRYPTO_GLUE_HELPER_X86 select CRYPTO_LRW select CRYPTO_XTS help -- cgit v1.2.3-70-g09d2 From a7378d4e552ac139ae1cbbdfebfeaa9b18c948d0 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:39 +0300 Subject: crypto: twofish-avx - remove duplicated glue code and use shared glue code from glue_helper Now that shared glue code is available, convert twofish-avx to use it. 
Cc: Johannes Goetzfried Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_avx_glue.c | 556 ++++++---------------------------- arch/x86/crypto/twofish_glue_3way.c | 46 +-- arch/x86/include/asm/crypto/twofish.h | 46 +++ crypto/Kconfig | 1 + 4 files changed, 162 insertions(+), 487 deletions(-) create mode 100644 arch/x86/include/asm/crypto/twofish.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index cabe058eba1..782b67ddaf6 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -4,9 +4,6 @@ * Copyright (C) 2012 Johannes Goetzfried * * - * Glue code based on serpent_sse2_glue.c by: - * Copyright (C) 2011 Jussi Kivilinna - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -39,38 +36,21 @@ #include #include #include +#include #include +#include #include #include #include - #define TWOFISH_PARALLEL_BLOCKS 8 -/* regular block cipher functions from twofish_x86_64 module */ -asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); - -/* 3-way parallel cipher functions from twofish_x86_64-3way module */ -asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); - static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, false); } -static inline void twofish_enc_blk_3way_xor(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, true); -} - /* 8-way parallel cipher functions */ asmlinkage void __twofish_enc_blk_8way(struct 
twofish_ctx *ctx, u8 *dst, const u8 *src, bool xor); @@ -95,423 +75,142 @@ static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, twofish_dec_blk_8way(ctx, dst, src); } - -static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src) { - if (fpu_enabled) - return true; + u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; + unsigned int j; - /* AVX is only used when chunk to be processed is large enough, so - * do not enable FPU until it is necessary. - */ - if (nbytes < TF_BLOCK_SIZE * TWOFISH_PARALLEL_BLOCKS) - return false; + for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; - kernel_fpu_begin(); - return true; -} + twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); -static inline void twofish_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); + for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) +static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) { - bool fpu_enabled = false; - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes; - int err; + be128 ctrblks[TWOFISH_PARALLEL_BLOCKS]; + unsigned int i; - err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - fpu_enabled = twofish_fpu_begin(fpu_enabled, nbytes); - - /* Process multi-block batch */ - if (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS) { - do { - if (enc) - twofish_enc_blk_xway(ctx, wdst, wsrc); - else - twofish_dec_blk_xway(ctx, wdst, wsrc); - - wsrc += bsize * TWOFISH_PARALLEL_BLOCKS; - wdst += bsize * 
TWOFISH_PARALLEL_BLOCKS; - nbytes -= bsize * TWOFISH_PARALLEL_BLOCKS; - } while (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - if (enc) - twofish_enc_blk_3way(ctx, wdst, wsrc); - else - twofish_dec_blk_3way(ctx, wdst, wsrc); - - wsrc += bsize * 3; - wdst += bsize * 3; - nbytes -= bsize * 3; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (enc) - twofish_enc_blk(ctx, wdst, wsrc); - else - twofish_dec_blk(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); } - twofish_fpu_end(fpu_enabled); - return err; + twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); } +static const struct common_glue_ctx twofish_enc = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } + }, { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + } } +}; + +static const struct common_glue_ctx twofish_ctr = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } + }, { + .num_blocks = 3, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } + } } +}; + +static const struct common_glue_ctx twofish_dec = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } + }, { + 
.num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + } } +}; + +static const struct common_glue_ctx twofish_dec_cbc = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } + }, { + .num_blocks = 3, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + } } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); + return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); } static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); -} - -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; + return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); } static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct blkcipher_walk walk; - int err; 
- - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; - u128 last_iv; - int i; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block batch */ - if (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (TWOFISH_PARALLEL_BLOCKS - 1); - src -= TWOFISH_PARALLEL_BLOCKS - 1; - dst -= TWOFISH_PARALLEL_BLOCKS - 1; - - for (i = 0; i < TWOFISH_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; - - twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (i = 0; i < TWOFISH_PARALLEL_BLOCKS - 1; i++) - u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - nbytes -= bsize * (3 - 1); - src -= 3 - 1; - dst -= 3 - 1; - - ivs[0] = src[0]; - ivs[1] = src[1]; - - twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); - - u128_xor(dst + 1, dst + 1, ivs + 0); - u128_xor(dst + 2, dst + 2, ivs + 1); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes 
-= bsize; - if (nbytes < bsize) - break; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } - -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; - - return nbytes; + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, + dst, src, nbytes); } static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes)) { - fpu_enabled = twofish_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - twofish_fpu_end(fpu_enabled); - return err; -} - -static inline void u128_to_be128(be128 *dst, const u128 *src) -{ - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); + return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, + nbytes); } -static inline void be128_to_u128(u128 *dst, const be128 *src) -{ - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); -} - -static inline void u128_inc(u128 *i) +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - i->b++; - if (!i->b) - i->a++; + return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) { - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[TF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - twofish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - 
crypto_inc(ctrblk, TF_BLOCK_SIZE); + return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL, + fpu_enabled, nbytes); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - be128 ctrblocks[TWOFISH_PARALLEL_BLOCKS]; - int i; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process multi-block batch */ - if (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS) { - do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - u128_to_be128(&ctrblocks[i], &ctrblk); - u128_inc(&ctrblk); - } - - twofish_enc_blk_xway_xor(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += TWOFISH_PARALLEL_BLOCKS; - dst += TWOFISH_PARALLEL_BLOCKS; - nbytes -= bsize * TWOFISH_PARALLEL_BLOCKS; - } while (nbytes >= bsize * TWOFISH_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - } - - /* create ctrblks for parallel encrypt */ - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[1], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[2], &ctrblk); - u128_inc(&ctrblk); - - twofish_enc_blk_3way_xor(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 3; - dst += 3; - nbytes -= bsize * 3; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - u128_xor(dst, dst, (u128 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while 
(nbytes >= bsize); - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static inline void twofish_fpu_end(bool fpu_enabled) { - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { - fpu_enabled = twofish_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - twofish_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; + glue_fpu_end(fpu_enabled); } struct crypt_priv { @@ -563,26 +262,6 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) twofish_dec_blk(ctx->ctx, srcdst, srcdst); } -struct twofish_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct twofish_ctx twofish_ctx; -}; - -static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = __twofish_setkey(&ctx->twofish_ctx, key, - keylen - TF_BLOCK_SIZE, &tfm->crt_flags); - if (err) - return err; - - return lrw_init_table(&ctx->lrw_table, key + keylen - - TF_BLOCK_SIZE); -} - static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -635,43 +314,6 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } -static void lrw_exit_tfm(struct crypto_tfm *tfm) -{ - struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); -} - -struct twofish_xts_ctx { - struct twofish_ctx tweak_ctx; - struct twofish_ctx crypt_ctx; -}; - -static 
int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; - int err; - - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - - /* first half of xts-key is for crypt */ - err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __twofish_setkey(&ctx->tweak_ctx, - key + keylen / 2, keylen / 2, flags); -} - static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -798,7 +440,7 @@ static struct crypto_alg twofish_algs[10] = { { .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(twofish_algs[3].cra_list), - .cra_exit = lrw_exit_tfm, + .cra_exit = lrw_twofish_exit_tfm, .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE + diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 25bf5e9b006..15f9347316c 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -28,22 +28,12 @@ #include #include #include +#include #include #include #include -/* regular block cipher functions from twofish_x86_64 module */ -asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); - -/* 3-way parallel cipher functions */ -asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, bool xor); EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); -asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); EXPORT_SYMBOL_GPL(twofish_dec_blk_3way); static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, @@ -58,7 
+48,7 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, __twofish_enc_blk_3way(ctx, dst, src, true); } -static void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) +void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) { u128 ivs[2]; @@ -70,8 +60,9 @@ static void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) u128_xor(&dst[1], &dst[1], &ivs[0]); u128_xor(&dst[2], &dst[2], &ivs[1]); } +EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -static void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) { be128 ctrblk; @@ -84,8 +75,9 @@ static void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); u128_xor(dst, dst, (u128 *)&ctrblk); } +EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); -static void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, +void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, u128 *iv) { be128 ctrblks[3]; @@ -105,6 +97,7 @@ static void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); } +EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way); static const struct common_glue_ctx twofish_enc = { .num_funcs = 2, @@ -220,13 +213,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) twofish_dec_blk(ctx, srcdst, srcdst); } -struct twofish_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct twofish_ctx twofish_ctx; -}; - -static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); int err; @@ -238,6 +226,7 @@ static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE); } 
+EXPORT_SYMBOL_GPL(lrw_twofish_setkey); static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -273,20 +262,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return lrw_crypt(desc, dst, src, nbytes, &req); } -static void lrw_exit_tfm(struct crypto_tfm *tfm) +void lrw_twofish_exit_tfm(struct crypto_tfm *tfm) { struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); lrw_free_table(&ctx->lrw_table); } +EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm); -struct twofish_xts_ctx { - struct twofish_ctx tweak_ctx; - struct twofish_ctx crypt_ctx; -}; - -static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; @@ -309,6 +294,7 @@ static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, flags); } +EXPORT_SYMBOL_GPL(xts_twofish_setkey); static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -419,7 +405,7 @@ static struct crypto_alg tf_algs[5] = { { .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list), - .cra_exit = lrw_exit_tfm, + .cra_exit = lrw_twofish_exit_tfm, .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h new file mode 100644 index 00000000000..9d2c514bd5f --- /dev/null +++ b/arch/x86/include/asm/crypto/twofish.h @@ -0,0 +1,46 @@ +#ifndef ASM_X86_TWOFISH_H +#define ASM_X86_TWOFISH_H + +#include +#include +#include +#include + +struct twofish_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct twofish_ctx twofish_ctx; +}; + +struct twofish_xts_ctx { + struct 
twofish_ctx tweak_ctx; + struct twofish_ctx crypt_ctx; +}; + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +/* helpers from twofish_x86_64-3way module */ +extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); +extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, + u128 *iv); +extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, + u128 *iv); + +extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); + +extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); + +extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); + +#endif /* ASM_X86_TWOFISH_H */ diff --git a/crypto/Kconfig b/crypto/Kconfig index fc559caaa9a..02e30377469 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -958,6 +958,7 @@ config CRYPTO_TWOFISH_AVX_X86_64 select CRYPTO_ALGAPI select CRYPTO_CRYPTD select CRYPTO_ABLK_HELPER_X86 + select CRYPTO_GLUE_HELPER_X86 select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 select CRYPTO_TWOFISH_X86_64_3WAY -- cgit v1.2.3-70-g09d2 From d4af0e9d6eef6ce53c1935ca6ee3c01889e3212d Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:45 +0300 Subject: crypto: move arch/x86/include/asm/serpent-{sse2|avx}.h to arch/x86/include/asm/crypto/ Move serpent crypto headers to the new asm/crypto/ directory. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_avx_glue.c | 2 +- arch/x86/crypto/serpent_sse2_glue.c | 2 +- arch/x86/include/asm/crypto/serpent-avx.h | 32 +++++++++++++++ arch/x86/include/asm/crypto/serpent-sse2.h | 63 ++++++++++++++++++++++++++++++ arch/x86/include/asm/serpent-avx.h | 32 --------------- arch/x86/include/asm/serpent-sse2.h | 63 ------------------------------ 6 files changed, 97 insertions(+), 97 deletions(-) create mode 100644 arch/x86/include/asm/crypto/serpent-avx.h create mode 100644 arch/x86/include/asm/crypto/serpent-sse2.h delete mode 100644 arch/x86/include/asm/serpent-avx.h delete mode 100644 arch/x86/include/asm/serpent-sse2.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index c61b91aa42a..b36bdac237e 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 49a32eedf0c..d679c8675f4 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h new file mode 100644 index 00000000000..432deedd294 --- /dev/null +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -0,0 +1,32 @@ +#ifndef ASM_X86_SERPENT_AVX_H +#define ASM_X86_SERPENT_AVX_H + +#include +#include + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way_avx(ctx, dst, src, 
false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way_avx(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_8way_avx(ctx, dst, src); +} + +#endif diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/include/asm/crypto/serpent-sse2.h new file mode 100644 index 00000000000..e6e77dffbda --- /dev/null +++ b/arch/x86/include/asm/crypto/serpent-sse2.h @@ -0,0 +1,63 @@ +#ifndef ASM_X86_SERPENT_SSE2_H +#define ASM_X86_SERPENT_SSE2_H + +#include +#include + +#ifdef CONFIG_X86_32 + +#define SERPENT_PARALLEL_BLOCKS 4 + +asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_4way(ctx, dst, src); +} + +#else + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_8way(ctx, dst, 
src); +} + +#endif + +#endif diff --git a/arch/x86/include/asm/serpent-avx.h b/arch/x86/include/asm/serpent-avx.h deleted file mode 100644 index 432deedd294..00000000000 --- a/arch/x86/include/asm/serpent-avx.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ASM_X86_SERPENT_AVX_H -#define ASM_X86_SERPENT_AVX_H - -#include -#include - -#define SERPENT_PARALLEL_BLOCKS 8 - -asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way_avx(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way_avx(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - serpent_dec_blk_8way_avx(ctx, dst, src); -} - -#endif diff --git a/arch/x86/include/asm/serpent-sse2.h b/arch/x86/include/asm/serpent-sse2.h deleted file mode 100644 index e6e77dffbda..00000000000 --- a/arch/x86/include/asm/serpent-sse2.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef ASM_X86_SERPENT_SSE2_H -#define ASM_X86_SERPENT_SSE2_H - -#include -#include - -#ifdef CONFIG_X86_32 - -#define SERPENT_PARALLEL_BLOCKS 4 - -asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_4way(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_4way(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - 
serpent_dec_blk_4way(ctx, dst, src); -} - -#else - -#define SERPENT_PARALLEL_BLOCKS 8 - -asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - serpent_dec_blk_8way(ctx, dst, src); -} - -#endif - -#endif -- cgit v1.2.3-70-g09d2 From 70ef2601feb09d40f4086d055700b7923b3c2d6f Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 18 Jun 2012 14:07:50 +0300 Subject: crypto: move arch/x86/include/asm/aes.h to arch/x86/include/asm/crypto/ Move AES header to the new asm/crypto directory. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/aes_glue.c | 2 +- arch/x86/crypto/aesni-intel_glue.c | 2 +- arch/x86/include/asm/aes.h | 11 ----------- arch/x86/include/asm/crypto/aes.h | 11 +++++++++++ 4 files changed, 13 insertions(+), 13 deletions(-) delete mode 100644 arch/x86/include/asm/aes.h create mode 100644 arch/x86/include/asm/crypto/aes.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 8efcf42a9d7..59b37deb8c8 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -5,7 +5,7 @@ #include #include -#include +#include asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 7c9d54d8dc4..d6626152067 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/aes.h deleted file mode 100644 index 80545a1cbe3..00000000000 --- a/arch/x86/include/asm/aes.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef ASM_X86_AES_H -#define ASM_X86_AES_H - -#include -#include - -void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, - const u8 *src); -void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, - const u8 *src); -#endif diff --git a/arch/x86/include/asm/crypto/aes.h b/arch/x86/include/asm/crypto/aes.h new file mode 100644 index 00000000000..80545a1cbe3 --- /dev/null +++ b/arch/x86/include/asm/crypto/aes.h @@ -0,0 +1,11 @@ +#ifndef ASM_X86_AES_H +#define ASM_X86_AES_H + +#include +#include + +void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, + const u8 *src); +void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, + const u8 *src); +#endif -- cgit 
v1.2.3-70-g09d2 From e0ba94f14f747c2661c4d21f8c44e5b0b8cd8e48 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:16 +0800 Subject: x86/tlb_info: get last level TLB entry number of CPU For 4KB pages, x86 CPU has 2 or 1 level TLB, first level is data TLB and instruction TLB, second level is shared TLB for both data and instructions. For huge page TLB, usually there is just one level and separated by 2MB/4MB and 1GB. Although each levels TLB size is important for performance tuning, but for general and rude optimizing, last level TLB entry number is suitable. And in fact, last level TLB always has the biggest entry number. This patch will get the biggest TLB entry number and use it in future TLB optimizing. According to Borislav's suggestion, except tlb_ll[i/d]_* array, other function and data will be released after system boot up. For all kinds of x86 vendor friendly, vendor specific code was moved to its specific files. Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-2-git-send-email-alex.shi@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 11 +++ arch/x86/kernel/cpu/common.c | 21 ++++++ arch/x86/kernel/cpu/cpu.h | 9 +++ arch/x86/kernel/cpu/intel.c | 142 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 183 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 39bc5777211..39b2bd48dfb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -61,6 +61,17 @@ static inline void *current_text_addr(void) # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif +enum tlb_infos { + ENTRIES, + NR_INFO +}; + +extern u16 __read_mostly tlb_lli_4k[NR_INFO]; +extern u16 __read_mostly tlb_lli_2m[NR_INFO]; +extern u16 __read_mostly tlb_lli_4m[NR_INFO]; +extern u16 __read_mostly tlb_lld_4k[NR_INFO]; +extern u16 __read_mostly tlb_lld_2m[NR_INFO]; +extern u16 __read_mostly tlb_lld_4m[NR_INFO]; /* * CPU type and hardware bug flags. Kept separately for each CPU. * Members of this structure are referenced in head.S, so think twice diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b9333b429b..b2016df0081 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -452,6 +452,25 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } +u16 __read_mostly tlb_lli_4k[NR_INFO]; +u16 __read_mostly tlb_lli_2m[NR_INFO]; +u16 __read_mostly tlb_lli_4m[NR_INFO]; +u16 __read_mostly tlb_lld_4k[NR_INFO]; +u16 __read_mostly tlb_lld_2m[NR_INFO]; +u16 __read_mostly tlb_lld_4m[NR_INFO]; + +void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) +{ + if (this_cpu->c_detect_tlb) + this_cpu->c_detect_tlb(c); + + printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n", + tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], + tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], + tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES]); +} + void __cpuinit detect_ht(struct cpuinfo_x86 *c) 
{ #ifdef CONFIG_X86_HT @@ -911,6 +930,8 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + if (boot_cpu_data.cpuid_level >= 2) + cpu_detect_tlb(&boot_cpu_data); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 8bacc7826fb..4041c24ae7d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -20,10 +20,19 @@ struct cpu_dev { void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); + void (*c_detect_tlb)(struct cpuinfo_x86 *); unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); int c_x86_vendor; }; +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; + /* unsigned int ways; */ + char info[128]; +}; + #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __attribute__((__section__(".x86_cpu_dev.init"))) = \ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3e6ff6cbf42..ed0d512cf51 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -491,6 +491,147 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i } #endif +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 + +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 + +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 + +#define TLB_DATA_1G 0x16 + +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 + +#define STLB_4K 0x41 + +static const struct _tlb_table intel_tlb_table[] __cpuinitconst = { + { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, + { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0x04, 
TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, + { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, + { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, + { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, + { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, + { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, + { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, + { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x00, 0, 0 } +}; + +static void __cpuinit intel_tlb_lookup(const unsigned char desc) +{ + unsigned char k; + if (desc == 0) + return; + + /* look up this descriptor in the table */ + for (k = 
0; intel_tlb_table[k].descriptor != desc && \ + intel_tlb_table[k].descriptor != 0; k++) + ; + + if (intel_tlb_table[k].tlb_type == 0) + return; + + switch (intel_tlb_table[k].tlb_type) { + case STLB_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_ALL: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4M: + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_2M_4M: + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K: + case TLB_DATA0_4K: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4M: + case TLB_DATA0_4M: + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_2M_4M: + case TLB_DATA0_2M_4M: + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K_4M: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = 
intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + } +} + +static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) +{ + int i, j, n; + unsigned int regs[4]; + unsigned char *desc = (unsigned char *)regs; + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) + intel_tlb_lookup(desc[j]); + } +} + static const struct cpu_dev __cpuinitconst intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, @@ -546,6 +687,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = { }, .c_size_cache = intel_size_cache, #endif + .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, -- cgit v1.2.3-70-g09d2 From e7b52ffd45a6d834473f43b349e7d86593d763c7 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:17 +0800 Subject: x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range x86 has no flush_tlb_range support in instruction level. Currently the flush_tlb_range just implemented by flushing all page table. That is not the best solution for all scenarios. In fact, if we just use 'invlpg' to flush few lines from TLB, we can get the performance gain from later remain TLB lines accessing. But the 'invlpg' instruction costs much of time. Its execution time can compete with cr3 rewriting, and even a bit more on SNB CPU. So, on a 512 4KB TLB entries CPU, the balance points is at: (512 - X) * 100ns(assumed TLB refill cost) = X(TLB flush entries) * 100ns(assumed invlpg cost) Here, X is 256, that is 1/2 of 512 entries. 
But with the mysterious CPU pre-fetcher and page miss handler Unit, the assumed TLB refill cost is far lower than 100ns in sequential access. And 2 HT siblings in one core make memory access faster if they are accessing the same memory. So, in the patch, I just do the change when the number of target entries is less than 1/16 of the whole active tlb entries. Actually, I have no data to support the percentage '1/16', so any suggestions are welcome. As to hugetlb, I guess due to the smaller page table and fewer active TLB entries, I didn't see any benefit in my benchmark, so no optimizing now. My micro benchmark shows that in ideal scenarios, the performance improves 70 percent in reading. And in the worst scenario, the reading/writing performance is similar to the unpatched 3.4-rc4 kernel. Here is the reading data on my 2P * 4cores *HT NHM EP machine, with THP 'always': multi thread testing, '-t' parameter is thread number: with patch unpatched 3.4-rc4 ./mprotect -t 1 14ns 24ns ./mprotect -t 2 13ns 22ns ./mprotect -t 4 12ns 19ns ./mprotect -t 8 14ns 16ns ./mprotect -t 16 28ns 26ns ./mprotect -t 32 54ns 51ns ./mprotect -t 128 200ns 199ns Single process with sequential flushing and memory accessing: with patch unpatched 3.4-rc4 ./mprotect 7ns 11ns ./mprotect -p 4096 -l 8 -n 10240 21ns 21ns [ hpa: http://lkml.kernel.org/r/1B4B44D9196EFF41AE41FDA404FC0A100BFF94@SHSMSX101.ccr.corp.intel.com has additional performance numbers. ] Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-3-git-send-email-alex.shi@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/paravirt.h | 5 +- arch/x86/include/asm/paravirt_types.h | 3 +- arch/x86/include/asm/tlbflush.h | 23 ++++----- arch/x86/include/asm/uv/uv.h | 5 +- arch/x86/mm/tlb.c | 97 +++++++++++++++++++++++++++++------ arch/x86/platform/uv/tlb_uv.c | 6 +-- arch/x86/xen/mmu.c | 12 ++--- include/trace/events/xen.h | 12 +++-- 8 files changed, 114 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf5270..7e2c2a63573 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -397,9 +397,10 @@ static inline void __flush_tlb_single(unsigned long addr) static inline void flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, - unsigned long va) + unsigned long start, + unsigned long end) { - PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va); + PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8e8b9a4987e..600a5fcac9c 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -250,7 +250,8 @@ struct pv_mmu_ops { void (*flush_tlb_single)(unsigned long addr); void (*flush_tlb_others)(const struct cpumask *cpus, struct mm_struct *mm, - unsigned long va); + unsigned long start, + unsigned long end); /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 36a1a2ab87d..33608d96d68 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -73,14 +73,10 @@ static inline void __flush_tlb_one(unsigned long addr) * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, 
end) flushes a range of kernel pages - * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus + * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus * * ..but the i386 has somewhat limited tlb flushing capabilities, * and page-granular flushes are available only on i486 and up. - * - * x86-64 can only flush individual pages or full VMs. For a range flush - * we always do the full VM. Might be worth trying if for a small - * range a few INVLPGs in a row are a win. */ #ifndef CONFIG_SMP @@ -111,7 +107,8 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, static inline void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, - unsigned long va) + unsigned long start, + unsigned long end) { } @@ -129,17 +126,14 @@ extern void flush_tlb_all(void); extern void flush_tlb_current_task(void); extern void flush_tlb_mm(struct mm_struct *); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); +extern void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); #define flush_tlb() flush_tlb_current_task() -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - flush_tlb_mm(vma->vm_mm); -} - void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va); + struct mm_struct *mm, + unsigned long start, unsigned long end); #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 @@ -159,7 +153,8 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ #ifndef CONFIG_PARAVIRT -#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va) +#define flush_tlb_others(mask, mm, start, end) \ + native_flush_tlb_others(mask, mm, start, end) #endif static inline void flush_tlb_kernel_range(unsigned long start, diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 3bb9491b765..b47c2a82ff1 100644 --- a/arch/x86/include/asm/uv/uv.h +++ 
b/arch/x86/include/asm/uv/uv.h @@ -15,7 +15,8 @@ extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, - unsigned long va, + unsigned long start, + unsigned end, unsigned int cpu); #else /* X86_UV */ @@ -26,7 +27,7 @@ static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, - unsigned long va, unsigned int cpu) + unsigned long start, unsigned long end, unsigned int cpu) { return cpumask; } #endif /* X86_UV */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5e57e113b72..3b91c981a27 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -41,7 +41,8 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) union smp_flush_state { struct { struct mm_struct *flush_mm; - unsigned long flush_va; + unsigned long flush_start; + unsigned long flush_end; raw_spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; @@ -156,10 +157,19 @@ void smp_invalidate_interrupt(struct pt_regs *regs) if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) { if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_va == TLB_FLUSH_ALL) + if (f->flush_end == TLB_FLUSH_ALL + || !cpu_has_invlpg) local_flush_tlb(); - else - __flush_tlb_one(f->flush_va); + else if (!f->flush_end) + __flush_tlb_single(f->flush_start); + else { + unsigned long addr; + addr = f->flush_start; + while (addr < f->flush_end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; + } + } } else leave_mm(cpu); } @@ -172,7 +182,8 @@ out: } static void flush_tlb_others_ipi(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va) + struct mm_struct *mm, unsigned long start, + unsigned long end) { unsigned int sender; union smp_flush_state *f; @@ -185,7 +196,8 @@ static void flush_tlb_others_ipi(const struct cpumask 
*cpumask, raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; - f->flush_va = va; + f->flush_start = start; + f->flush_end = end; if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { /* * We have to send the IPI only to @@ -199,24 +211,26 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, } f->flush_mm = NULL; - f->flush_va = 0; + f->flush_start = 0; + f->flush_end = 0; if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va) + struct mm_struct *mm, unsigned long start, + unsigned long end) { if (is_uv_system()) { unsigned int cpu; cpu = smp_processor_id(); - cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); + cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); if (cpumask) - flush_tlb_others_ipi(cpumask, mm, va); + flush_tlb_others_ipi(cpumask, mm, start, end); return; } - flush_tlb_others_ipi(cpumask, mm, va); + flush_tlb_others_ipi(cpumask, mm, start, end); } static void __cpuinit calculate_tlb_offset(void) @@ -282,7 +296,7 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); + flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } @@ -297,12 +311,63 @@ void flush_tlb_mm(struct mm_struct *mm) leave_mm(smp_processor_id()); } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); + flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); + + preempt_enable(); +} + +#define FLUSHALL_BAR 16 + +void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm; + + if (!cpu_has_invlpg || vma->vm_flags & VM_HUGETLB) { + flush_tlb_mm(vma->vm_mm); + return; + } + + preempt_disable(); + mm = vma->vm_mm; + if 
(current->active_mm == mm) { + if (current->mm) { + unsigned long addr, vmflag = vma->vm_flags; + unsigned act_entries, tlb_entries = 0; + + if (vmflag & VM_EXEC) + tlb_entries = tlb_lli_4k[ENTRIES]; + else + tlb_entries = tlb_lld_4k[ENTRIES]; + + act_entries = tlb_entries > mm->total_vm ? + mm->total_vm : tlb_entries; + if ((end - start)/PAGE_SIZE > act_entries/FLUSHALL_BAR) + local_flush_tlb(); + else { + for (addr = start; addr < end; + addr += PAGE_SIZE) + __flush_tlb_single(addr); + + if (cpumask_any_but(mm_cpumask(mm), + smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, + start, end); + preempt_enable(); + return; + } + } else { + leave_mm(smp_processor_id()); + } + } + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) { struct mm_struct *mm = vma->vm_mm; @@ -310,13 +375,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) if (current->active_mm == mm) { if (current->mm) - __flush_tlb_one(va); + __flush_tlb_one(start); else leave_mm(smp_processor_id()); } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, va); + flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); preempt_enable(); } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 59880afa851..f1bef8e1d63 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1068,8 +1068,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * done. The returned pointer is valid till preemption is re-enabled. 
*/ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va, - unsigned int cpu) + struct mm_struct *mm, unsigned long start, + unsigned end, unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1112,7 +1112,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, record_send_statistics(stat, locals, hubs, remotes, bau_desc); - bau_desc->payload.address = va; + bau_desc->payload.address = start; bau_desc->payload.sending_cpu = cpu; /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785631c..39ed56789f6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1244,7 +1244,8 @@ static void xen_flush_tlb_single(unsigned long addr) } static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long va) + struct mm_struct *mm, unsigned long start, + unsigned long end) { struct { struct mmuext_op op; @@ -1256,7 +1257,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_others(cpus, mm, va); + trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1269,11 +1270,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; + args->op.arg1.linear_addr = start; } MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index 92f1a796829..15ba03bdd7c 100644 --- a/include/trace/events/xen.h +++ 
b/include/trace/events/xen.h @@ -397,18 +397,20 @@ TRACE_EVENT(xen_mmu_flush_tlb_single, TRACE_EVENT(xen_mmu_flush_tlb_others, TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm, - unsigned long addr), - TP_ARGS(cpus, mm, addr), + unsigned long addr, unsigned long end), + TP_ARGS(cpus, mm, addr, end), TP_STRUCT__entry( __field(unsigned, ncpus) __field(struct mm_struct *, mm) __field(unsigned long, addr) + __field(unsigned long, end) ), TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); __entry->mm = mm; - __entry->addr = addr), - TP_printk("ncpus %d mm %p addr %lx", - __entry->ncpus, __entry->mm, __entry->addr) + __entry->addr = addr, + __entry->end = end), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, __entry->addr, __entry->end) ); TRACE_EVENT(xen_mmu_write_cr3, -- cgit v1.2.3-70-g09d2 From d8dfe60d6dcad5989c4558b753b98d657e2813c0 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:18 +0800 Subject: x86/tlb: fall back to flush all when meet a THP large page We don't need to flush large pages by PAGE_SIZE step, that just waste time. and actually, large page don't need 'invlpg' optimizing according to our micro benchmark. So, just flush whole TLB is enough for them. The following result is tested on a 2CPU * 4cores * 2HT NHM EP machine, with THP 'always' setting. Multi-thread testing, '-t' paramter is thread number: without this patch with this patch ./mprotect -t 1 14ns 13ns ./mprotect -t 2 13ns 13ns ./mprotect -t 4 12ns 11ns ./mprotect -t 8 14ns 10ns ./mprotect -t 16 28ns 28ns ./mprotect -t 32 54ns 52ns ./mprotect -t 128 200ns 200ns Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-4-git-send-email-alex.shi@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/tlb.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3b91c981a27..184a02a4d87 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -318,12 +318,42 @@ void flush_tlb_mm(struct mm_struct *mm) #define FLUSHALL_BAR 16 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline unsigned long has_large_page(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned long addr = ALIGN(start, HPAGE_SIZE); + for (; addr < end; addr += HPAGE_SIZE) { + pgd = pgd_offset(mm, addr); + if (likely(!pgd_none(*pgd))) { + pud = pud_offset(pgd, addr); + if (likely(!pud_none(*pud))) { + pmd = pmd_offset(pud, addr); + if (likely(!pmd_none(*pmd))) + if (pmd_large(*pmd)) + return addr; + } + } + } + return 0; +} +#else +static inline unsigned long has_large_page(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + return 0; +} +#endif void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm; if (!cpu_has_invlpg || vma->vm_flags & VM_HUGETLB) { +flush_all: flush_tlb_mm(vma->vm_mm); return; } @@ -346,6 +376,10 @@ void flush_tlb_range(struct vm_area_struct *vma, if ((end - start)/PAGE_SIZE > act_entries/FLUSHALL_BAR) local_flush_tlb(); else { + if (has_large_page(mm, start, end)) { + preempt_enable(); + goto flush_all; + } for (addr = start; addr < end; addr += PAGE_SIZE) __flush_tlb_single(addr); -- cgit v1.2.3-70-g09d2 From c4211f42d3e66875298a5e26a75109878c80f15b Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:19 +0800 Subject: x86/tlb: add tlb_flushall_shift for specific CPU Testing show different CPU type(micro architectures and NUMA mode) has different balance points between the TLB flush all and multiple invlpg. And there also has cases the tlb flush change has no any help. 
This patch gives an interface to let x86 vendor developers set a different shift for different CPU types. For example, on the machines I have at hand, the balance point is 16 entries on Romley-EP, while it is 8 entries on Bloomfield NHM-EP, and 256 on the IVB mobile CPU. But on the model 15 Core2 Xeon, using invlpg does not help at all. For untested machines, do a conservative optimization, same as for the NHM CPU. Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-5-git-send-email-alex.shi@intel.com Signed-off-by: H. 
+ * If tlb_flushall_shift is -1, means the replacement will be disabled. + */ +s8 __read_mostly tlb_flushall_shift = -1; + void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) { if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ - "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n", + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ + "tlb_flushall_shift is 0x%x\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], - tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES]); + tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], + tlb_flushall_shift); } void __cpuinit detect_ht(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ed0d512cf51..0a4ce2980a5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -610,6 +610,39 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc) } } +static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) +{ + if (!cpu_has_invlpg) { + tlb_flushall_shift = -1; + return; + } + switch ((c->x86 << 8) + c->x86_model) { + case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 0x61d: /* six-core 45 nm xeon "Dunnington" */ + tlb_flushall_shift = -1; + break; + case 0x61a: /* 45 nm nehalem, "Bloomfield" */ + case 0x61e: /* 45 nm nehalem, "Lynnfield" */ + case 0x625: /* 32 nm nehalem, "Clarkdale" */ + case 0x62c: /* 32 nm nehalem, "Gulftown" */ + case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ + case 0x62f: /* 32 nm Xeon E7 */ + tlb_flushall_shift = 6; + break; + case 0x62a: /* SandyBridge */ + case 0x62d: /* SandyBridge, "Romely-EP" */ + tlb_flushall_shift = 5; + break; + case 0x63a: /* Ivybridge */ + tlb_flushall_shift = 1; + break; + default: + tlb_flushall_shift = 6; + } +} + 
static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) { int i, j, n; @@ -630,6 +663,7 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) for (j = 1 ; j < 16 ; j++) intel_tlb_lookup(desc[j]); } + intel_tlb_flushall_shift_set(c); } static const struct cpu_dev __cpuinitconst intel_cpu_dev = { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 184a02a4d87..2939f2f9edb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -316,8 +316,6 @@ void flush_tlb_mm(struct mm_struct *mm) preempt_enable(); } -#define FLUSHALL_BAR 16 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline unsigned long has_large_page(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -352,7 +350,7 @@ void flush_tlb_range(struct vm_area_struct *vma, { struct mm_struct *mm; - if (!cpu_has_invlpg || vma->vm_flags & VM_HUGETLB) { + if (vma->vm_flags & VM_HUGETLB || tlb_flushall_shift == -1) { flush_all: flush_tlb_mm(vma->vm_mm); return; @@ -373,7 +371,8 @@ flush_all: act_entries = tlb_entries > mm->total_vm ? 
mm->total_vm : tlb_entries; - if ((end - start)/PAGE_SIZE > act_entries/FLUSHALL_BAR) + if ((end - start) >> PAGE_SHIFT > + act_entries >> tlb_flushall_shift) local_flush_tlb(); else { if (has_large_page(mm, start, end)) { diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index f96a5b58a97..75e888b3cfd 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -113,7 +113,8 @@ static inline int tlb_fast_mode(struct mmu_gather *tlb) void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm); void tlb_flush_mmu(struct mmu_gather *tlb); -void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, + unsigned long end); int __tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* tlb_remove_page -- cgit v1.2.3-70-g09d2 From 3df3212f9722c7e45c723b9ea231a04ba4dbc47c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:20 +0800 Subject: x86/tlb: add tlb_flushall_shift knob into debugfs kernel will replace cr3 rewrite with invlpg when tlb_flush_entries <= active_tlb_entries / 2^tlb_flushall_factor if tlb_flushall_factor is -1, kernel won't do this replacement. User can modify its value according to specific CPU/applications. Thanks for Borislav providing the help message of CONFIG_DEBUG_TLBFLUSH. Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-6-git-send-email-alex.shi@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.debug | 19 +++++++++++++++++++ arch/x86/mm/tlb.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index e46c2147397..b322f124ee3 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -129,6 +129,25 @@ config DOUBLEFAULT option saves about 4k and might cause you much additional grey hair. 
+config DEBUG_TLBFLUSH + bool "Set upper limit of TLB entries to flush one-by-one" + depends on DEBUG_KERNEL && (X86_64 || X86_INVLPG) + ---help--- + + X86-only for now. + + This option allows the user to tune the amount of TLB entries the + kernel flushes one-by-one instead of doing a full TLB flush. In + certain situations, the former is cheaper. This is controlled by the + tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it + to -1, the code flushes the whole TLB unconditionally. Otherwise, + for positive values of it, the kernel will use single TLB entry + invalidating instructions according to the following formula: + + flush_entries <= active_tlb_entries / 2^tlb_flushall_shift + + If in doubt, say "N". + config IOMMU_DEBUG bool "Enable IOMMU debugging" depends on GART_IOMMU && DEBUG_KERNEL diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 2939f2f9edb..5911f61e300 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,6 +12,7 @@ #include #include #include +#include DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; @@ -430,3 +431,53 @@ void flush_tlb_all(void) { on_each_cpu(do_flush_tlb_all, NULL, 1); } + +#ifdef CONFIG_DEBUG_TLBFLUSH +static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "%hd\n", tlb_flushall_shift); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t tlbflush_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + s8 shift; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + buf[len] = '\0'; + if (kstrtos8(buf, 0, &shift)) + return -EINVAL; + + if (shift > 64) + return -EINVAL; + + tlb_flushall_shift = shift; + return count; +} + +static const struct file_operations fops_tlbflush = { + .read = tlbflush_read_file, + 
.write = tlbflush_write_file, + .llseek = default_llseek, +}; + +static int __cpuinit create_tlb_flushall_shift(void) +{ + if (cpu_has_invlpg) { + debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); + } + return 0; +} +late_initcall(create_tlb_flushall_shift); +#endif -- cgit v1.2.3-70-g09d2 From 611ae8e3f5204f7480b3b405993b3352cfa16662 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:22 +0800 Subject: x86/tlb: enable tlb flush range support for x86 Not every tlb_flush execution moment is really need to evacuate all TLB entries, like in munmap, just few 'invlpg' is better for whole process performance, since it leaves most of TLB entries for later accessing. This patch also rewrite flush_tlb_range for 2 purposes: 1, split it out to get flush_blt_mm_range function. 2, clean up to reduce line breaking, thanks for Borislav's input. My micro benchmark 'mummap' http://lkml.org/lkml/2012/5/17/59 show that the random memory access on other CPU has 0~50% speed up on a 2P * 4cores * HT NHM EP while do 'munmap'. Thanks Yongjie's testing on this patch: ------------- I used Linux 3.4-RC6 w/ and w/o his patches as Xen dom0 and guest kernel. After running two benchmarks in Xen HVM guest, I found his patches brought about 1%~3% performance gain in 'kernel build' and 'netperf' testing, though the performance gain was not very stable in 'kernel build' testing. Some detailed testing results are below. Testing Environment: Hardware: Romley-EP platform Xen version: latest upstream Linux kernel: 3.4-RC6 Guest vCPU number: 8 NIC: Intel 82599 (10GB bandwidth) In 'kernel build' testing in guest: Command line | performance gain make -j 4 | 3.81% make -j 8 | 0.37% make -j 16 | -0.52% In 'netperf' testing, we tested TCP_STREAM with default socket size 16384 byte as large packet and 64 byte as small packet. 
I used several clients to add networking pressure, then the 'netperf' server automatically generated several threads to respond to them. I also used large-size and small-size packets in the testing. Packet size | Thread number | performance gain 16384 bytes | 4 | 0.02% 16384 bytes | 8 | 2.21% 16384 bytes | 16 | 2.04% 64 bytes | 4 | 1.07% 64 bytes | 8 | 3.31% 64 bytes | 16 | 0.71% Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-8-git-send-email-alex.shi@intel.com Tested-by: Ren, Yongjie Signed-off-by: H. 
static inline void reset_lazy_tlbstate(void) #define local_flush_tlb() __flush_tlb() +#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) + +#define flush_tlb_range(vma, start, end) \ + flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) + extern void flush_tlb_all(void); extern void flush_tlb_current_task(void); -extern void flush_tlb_mm(struct mm_struct *); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); -extern void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); +extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag); #define flush_tlb() flush_tlb_current_task() diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5911f61e300..481737def84 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -301,23 +301,10 @@ void flush_tlb_current_task(void) preempt_enable(); } -void flush_tlb_mm(struct mm_struct *mm) -{ - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); - - preempt_enable(); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * It can find out the THP large page, or + * HUGETLB page in tlb_flush when THP disabled + */ static inline unsigned long has_large_page(struct mm_struct *mm, unsigned long start, unsigned long end) { @@ -339,68 +326,61 @@ static inline unsigned long has_large_page(struct mm_struct *mm, } return 0; } -#else -static inline unsigned long has_large_page(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - return 0; -} -#endif -void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm; - if (vma->vm_flags & VM_HUGETLB || tlb_flushall_shift == -1) { -flush_all: - flush_tlb_mm(vma->vm_mm); - return; - } 
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag) +{ + unsigned long addr; + unsigned act_entries, tlb_entries = 0; preempt_disable(); - mm = vma->vm_mm; - if (current->active_mm == mm) { - if (current->mm) { - unsigned long addr, vmflag = vma->vm_flags; - unsigned act_entries, tlb_entries = 0; + if (current->active_mm != mm) + goto flush_all; - if (vmflag & VM_EXEC) - tlb_entries = tlb_lli_4k[ENTRIES]; - else - tlb_entries = tlb_lld_4k[ENTRIES]; - - act_entries = tlb_entries > mm->total_vm ? - mm->total_vm : tlb_entries; + if (!current->mm) { + leave_mm(smp_processor_id()); + goto flush_all; + } - if ((end - start) >> PAGE_SHIFT > - act_entries >> tlb_flushall_shift) - local_flush_tlb(); - else { - if (has_large_page(mm, start, end)) { - preempt_enable(); - goto flush_all; - } - for (addr = start; addr < end; - addr += PAGE_SIZE) - __flush_tlb_single(addr); + if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 + || vmflag == VM_HUGETLB) { + local_flush_tlb(); + goto flush_all; + } - if (cpumask_any_but(mm_cpumask(mm), - smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, - start, end); - preempt_enable(); - return; - } - } else { - leave_mm(smp_processor_id()); + /* In modern CPU, last level tlb used for both data/ins */ + if (vmflag & VM_EXEC) + tlb_entries = tlb_lli_4k[ENTRIES]; + else + tlb_entries = tlb_lld_4k[ENTRIES]; + /* Assume all of TLB entries was occupied by this task */ + act_entries = mm->total_vm > tlb_entries ? 
tlb_entries : mm->total_vm; + + /* tlb_flushall_shift is on balance point, details in commit log */ + if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + local_flush_tlb(); + else { + if (has_large_page(mm, start, end)) { + local_flush_tlb(); + goto flush_all; } + /* flush range by one by one 'invlpg' */ + for (addr = start; addr < end; addr += PAGE_SIZE) + __flush_tlb_single(addr); + + if (cpumask_any_but(mm_cpumask(mm), + smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, start, end); + preempt_enable(); + return; } + +flush_all: if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } - void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) { struct mm_struct *mm = vma->vm_mm; -- cgit v1.2.3-70-g09d2 From 52aec3308db85f4e9f5c8b9f5dc4fbd0138c6fa4 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:23 +0800 Subject: x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR There are currently 32 INVALIDATE_TLB_VECTORs in the kernel. That is quite a large number of vectors in the IDT, but it is still not enough, since modern x86 servers have more CPUs than that, which still causes heavy lock contention in TLB flushing. This patch uses the generic SMP call function to replace them. That saves 32 vectors in the IDT and resolves the lock contention in TLB flushing on large systems. In the NHM EX machine 4P * 8cores * HT = 64 CPUs, hackbench pthread has 3% performance increase. Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-9-git-send-email-alex.shi@intel.com Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/entry_arch.h | 9 -- arch/x86/include/asm/irq_vectors.h | 11 -- arch/x86/kernel/entry_64.S | 18 --- arch/x86/kernel/irqinit.c | 73 ----------- arch/x86/mm/tlb.c | 242 +++++++------------------------------ 5 files changed, 47 insertions(+), 306 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 0baa628e330..40afa0005c6 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -15,15 +15,6 @@ BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) - -.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 -.if NUM_INVALIDATE_TLB_VECTORS > \idx -BUILD_INTERRUPT3(invalidate_interrupt\idx, - (INVALIDATE_TLB_VECTOR_START)+\idx, - smp_invalidate_interrupt) -.endif -.endr #endif BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4b4448761e8..1508e518c7e 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -119,17 +119,6 @@ */ #define LOCAL_TIMER_VECTOR 0xef -/* up to 32 vectors used for spreading out TLB flushes: */ -#if NR_CPUS <= 32 -# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS) -#else -# define NUM_INVALIDATE_TLB_VECTORS (32) -#endif - -#define INVALIDATE_TLB_VECTOR_END (0xee) -#define INVALIDATE_TLB_VECTOR_START \ - (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1) - #define NR_VECTORS 256 #define FPU_IRQ 13 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133b51b..bcf28e1ce1a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1048,24 +1048,6 @@ apicinterrupt LOCAL_TIMER_VECTOR \ apicinterrupt 
X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi -#ifdef CONFIG_SMP - ALIGN - INTR_FRAME -.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 -.if NUM_INVALIDATE_TLB_VECTORS > \idx -ENTRY(invalidate_interrupt\idx) - pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) - jmp .Lcommon_invalidate_interrupt0 - CFI_ADJUST_CFA_OFFSET -8 -END(invalidate_interrupt\idx) -.endif -.endr - CFI_ENDPROC -apicinterrupt INVALIDATE_TLB_VECTOR_START, \ - invalidate_interrupt0, smp_invalidate_interrupt -#endif - apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 252981afd6c..6e03b0d6913 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -171,79 +171,6 @@ static void __init smp_intr_init(void) */ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - /* IPIs for invalidation */ -#define ALLOC_INVTLB_VEC(NR) \ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \ - invalidate_interrupt##NR) - - switch (NUM_INVALIDATE_TLB_VECTORS) { - default: - ALLOC_INVTLB_VEC(31); - case 31: - ALLOC_INVTLB_VEC(30); - case 30: - ALLOC_INVTLB_VEC(29); - case 29: - ALLOC_INVTLB_VEC(28); - case 28: - ALLOC_INVTLB_VEC(27); - case 27: - ALLOC_INVTLB_VEC(26); - case 26: - ALLOC_INVTLB_VEC(25); - case 25: - ALLOC_INVTLB_VEC(24); - case 24: - ALLOC_INVTLB_VEC(23); - case 23: - ALLOC_INVTLB_VEC(22); - case 22: - ALLOC_INVTLB_VEC(21); - case 21: - ALLOC_INVTLB_VEC(20); - case 20: - ALLOC_INVTLB_VEC(19); - case 19: - ALLOC_INVTLB_VEC(18); - case 18: - ALLOC_INVTLB_VEC(17); - case 17: - ALLOC_INVTLB_VEC(16); - case 16: - ALLOC_INVTLB_VEC(15); - case 15: - ALLOC_INVTLB_VEC(14); - case 14: - ALLOC_INVTLB_VEC(13); - case 13: - ALLOC_INVTLB_VEC(12); - case 12: - ALLOC_INVTLB_VEC(11); - case 11: - ALLOC_INVTLB_VEC(10); - case 10: - ALLOC_INVTLB_VEC(9); - case 9: - ALLOC_INVTLB_VEC(8); - case 8: - 
ALLOC_INVTLB_VEC(7); - case 7: - ALLOC_INVTLB_VEC(6); - case 6: - ALLOC_INVTLB_VEC(5); - case 5: - ALLOC_INVTLB_VEC(4); - case 4: - ALLOC_INVTLB_VEC(3); - case 3: - ALLOC_INVTLB_VEC(2); - case 2: - ALLOC_INVTLB_VEC(1); - case 1: - ALLOC_INVTLB_VEC(0); - break; - } - /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 481737def84..2b5f506a765 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,34 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) * * More scalable flush, from Andi Kleen * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into - * the right array slot for the flush data. - * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. - * In future when interrupts are split into per CPU domains this could be - * fixed, at the cost of triggering multiple IPIs in some cases. + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ -union smp_flush_state { - struct { - struct mm_struct *flush_mm; - unsigned long flush_start; - unsigned long flush_end; - raw_spinlock_t tlbstate_lock; - DECLARE_BITMAP(flush_cpumask, NR_CPUS); - }; - char pad[INTERNODE_CACHE_BYTES]; -} ____cacheline_internodealigned_in_smp; - -/* State is put into the per CPU data section, but padded - to a full cache line because other CPUs can access it and we don't - want false sharing in the per cpu data segment. 
*/ -static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; - -static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); +struct flush_tlb_info { + struct mm_struct *flush_mm; + unsigned long flush_start; + unsigned long flush_end; +}; /* * We cannot call mmdrop() because we are in interrupt context, @@ -74,28 +54,25 @@ void leave_mm(int cpu) EXPORT_SYMBOL_GPL(leave_mm); /* - * * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] * 1) switch_mm() either 1a) or 1b) * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu active_mm + * 1a1) set cpu_tlbstate to TLBSTATE_OK + * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm + * if cpu0 was in lazy tlb mode. + * 1a2) update cpu active_mm * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask); * Now the other cpus will send tlb flush ipis. * 1a4) change cr3. + * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but flush_tlb_func ignore flush ipis for the wrong + * mm, and in the worst case we perform a superfluous tlb flush. * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK + * cpu active_mm is correct, cpu0 already handles flush ipis. + * 1b1) set cpu_tlbstate to TLBSTATE_OK * 1b2) test_and_set the cpu bit in cpu_vm_mask. 
* Atomically set the bit [other cpus will start sending flush ipis], * and test the bit. @@ -108,186 +85,61 @@ EXPORT_SYMBOL_GPL(leave_mm); * runs in kernel space, the cpu could load tlb entries for user space * pages. * - * The good news is that cpu mmu_state is local to each cpu, no + * The good news is that cpu_tlbstate is local to each cpu, no * write/read ordering problems. */ /* - * TLB flush IPI: - * + * TLB flush funcation: * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -/* - * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop - * but still used for documentation purpose but the usage is slightly - * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt - * entry calls in with the first parameter in %eax. Maybe define - * intrlinkage? */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void smp_invalidate_interrupt(struct pt_regs *regs) +static void flush_tlb_func(void *info) { - unsigned int cpu; - unsigned int sender; - union smp_flush_state *f; - - cpu = smp_processor_id(); - /* - * orig_rax contains the negated interrupt vector. - * Use that to determine where the sender put the data. 
- */ - sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; - f = &flush_state[sender]; - - if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) { - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL - || !cpu_has_invlpg) - local_flush_tlb(); - else if (!f->flush_end) - __flush_tlb_single(f->flush_start); - else { - unsigned long addr; - addr = f->flush_start; - while (addr < f->flush_end) { - __flush_tlb_single(addr); - addr += PAGE_SIZE; - } - } - } else - leave_mm(cpu); - } -out: - ack_APIC_irq(); - smp_mb__before_clear_bit(); - cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); - smp_mb__after_clear_bit(); - inc_irq_stat(irq_tlb_count); -} + struct flush_tlb_info *f = info; -static void flush_tlb_others_ipi(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - unsigned int sender; - union smp_flush_state *f; - - /* Caller has disabled preemption */ - sender = this_cpu_read(tlb_vector_offset); - f = &flush_state[sender]; - - if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) - raw_spin_lock(&f->tlbstate_lock); - - f->flush_mm = mm; - f->flush_start = start; - f->flush_end = end; - if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { - /* - * We have to send the IPI only to - * CPUs affected. 
- */ - apic->send_IPI_mask(to_cpumask(f->flush_cpumask), - INVALIDATE_TLB_VECTOR_START + sender); - - while (!cpumask_empty(to_cpumask(f->flush_cpumask))) - cpu_relax(); - } + if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + return; + + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg) + local_flush_tlb(); + else if (!f->flush_end) + __flush_tlb_single(f->flush_start); + else { + unsigned long addr; + addr = f->flush_start; + while (addr < f->flush_end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; + } + } + } else + leave_mm(smp_processor_id()); - f->flush_mm = NULL; - f->flush_start = 0; - f->flush_end = 0; - if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) - raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end) { + struct flush_tlb_info info; + info.flush_mm = mm; + info.flush_start = start; + info.flush_end = end; + if (is_uv_system()) { unsigned int cpu; cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); if (cpumask) - flush_tlb_others_ipi(cpumask, mm, start, end); + smp_call_function_many(cpumask, flush_tlb_func, + &info, 1); return; } - flush_tlb_others_ipi(cpumask, mm, start, end); -} - -static void __cpuinit calculate_tlb_offset(void) -{ - int cpu, node, nr_node_vecs, idx = 0; - /* - * we are changing tlb_vector_offset for each CPU in runtime, but this - * will not cause inconsistency, as the write is atomic under X86. we - * might see more lock contentions in a short time, but after all CPU's - * tlb_vector_offset are changed, everything should go normal - * - * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might - * waste some vectors. 
- **/ - if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) - nr_node_vecs = 1; - else - nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; - - for_each_online_node(node) { - int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * - nr_node_vecs; - int cpu_offset = 0; - for_each_cpu(cpu, cpumask_of_node(node)) { - per_cpu(tlb_vector_offset, cpu) = node_offset + - cpu_offset; - cpu_offset++; - cpu_offset = cpu_offset % nr_node_vecs; - } - idx++; - } -} - -static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - switch (action & 0xf) { - case CPU_ONLINE: - case CPU_DEAD: - calculate_tlb_offset(); - } - return NOTIFY_OK; -} - -static int __cpuinit init_smp_flush(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(flush_state); i++) - raw_spin_lock_init(&flush_state[i].tlbstate_lock); - - calculate_tlb_offset(); - hotcpu_notifier(tlb_cpuhp_notify, 0); - return 0; + smp_call_function_many(cpumask, flush_tlb_func, &info, 1); } -core_initcall(init_smp_flush); void flush_tlb_current_task(void) { -- cgit v1.2.3-70-g09d2 From effee4b9b3b0aa5770bcd98de5f672b05b27703c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 28 Jun 2012 09:02:24 +0800 Subject: x86/tlb: do flush_tlb_kernel_range by 'invlpg' This patch implements flush_tlb_kernel_range using 'invlpg'. The performance cost and gain were analyzed in a previous patch (x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range). In the testing: http://lkml.org/lkml/2012/6/21/10 the cost is mostly hidden by the long kernel path, but the gain is still quite clear: memory access in a user application can be 30+% faster while the kernel executes this function. Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1340845344-27557-10-git-send-email-alex.shi@intel.com Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/tlbflush.h | 13 +++++++------ arch/x86/mm/tlb.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 621b959e1db..b5a27bd7766 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -123,6 +123,12 @@ static inline void reset_lazy_tlbstate(void) { } +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} + #else /* SMP */ #include @@ -139,6 +145,7 @@ extern void flush_tlb_current_task(void); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); +extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); #define flush_tlb() flush_tlb_current_task() @@ -168,10 +175,4 @@ static inline void reset_lazy_tlbstate(void) native_flush_tlb_others(mask, mm, start, end) #endif -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ - flush_tlb_all(); -} - #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 2b5f506a765..613cd83e8c0 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -264,6 +264,36 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } +static void do_kernel_range_flush(void *info) +{ + struct flush_tlb_info *f = info; + unsigned long addr; + + /* flush range by one by one 'invlpg' */ + for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + __flush_tlb_single(addr); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + unsigned act_entries; + struct flush_tlb_info info; + + /* In modern CPU, last level tlb used for both data/ins */ + act_entries = tlb_lld_4k[ENTRIES]; + + /* Balance as user space task's flush, a bit conservative */ + 
if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || + (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + + on_each_cpu(do_flush_tlb_all, NULL, 1); + else { + info.flush_start = start; + info.flush_end = end; + on_each_cpu(do_kernel_range_flush, &info, 1); + } +} + #ifdef CONFIG_DEBUG_TLBFLUSH static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) -- cgit v1.2.3-70-g09d2 From b102f1d0f1cd0bb5ec82e5aeb1e33502d6ad6710 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Jun 2012 09:41:39 +0900 Subject: tracing/kvm: Use __print_hex() for kvm_emulate_insn tracepoint The kvm_emulate_insn tracepoint used __print_insn() for printing its instructions. However it makes the format of the event hard to parse as it reveals TP internals. Fortunately, kernel provides __print_hex for almost same purpose, we can use it instead of open coding it. The user-space can be changed to parse it later. That means raw kernel tracing will not be affected by this change: # cd /sys/kernel/debug/tracing/ # cat events/kvm/kvm_emulate_insn/format name: kvm_emulate_insn ID: 29 format: ... print fmt: "%x:%llx:%s (%s)%s", REC->csbase, REC->rip, __print_hex(REC->insn, REC->len), \ __print_symbolic(REC->flags, { 0, "real" }, { (1 << 0) | (1 << 1), "vm16" }, \ { (1 << 0), "prot16" }, { (1 << 0) | (1 << 2), "prot32" }, { (1 << 0) | (1 << 3), "prot64" }), \ REC->failed ? 
" failed" : "" # echo 1 > events/kvm/kvm_emulate_insn/enable # cat trace # tracer: nop # # entries-in-buffer/entries-written: 2183/2183 #P:12 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | qemu-kvm-1782 [002] ...1 140.931636: kvm_emulate_insn: 0:c102fa25:89 10 (prot32) qemu-kvm-1781 [004] ...1 140.931637: kvm_emulate_insn: 0:c102fa25:89 10 (prot32) Link: http://lkml.kernel.org/n/tip-wfw6y3b9ugtey8snaow9nmg5@git.kernel.org Link: http://lkml.kernel.org/r/1340757701-10711-2-git-send-email-namhyung@kernel.org Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Namhyung Kim Cc: kvm@vger.kernel.org Acked-by: Avi Kivity Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- arch/x86/kvm/trace.h | 12 +----------- include/trace/ftrace.h | 1 + 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14..62d02e3c3ed 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -710,16 +710,6 @@ TRACE_EVENT(kvm_skinit, __entry->rip, __entry->slb) ); -#define __print_insn(insn, ilen) ({ \ - int i; \ - const char *ret = p->buffer + p->len; \ - \ - for (i = 0; i < ilen; ++i) \ - trace_seq_printf(p, " %02x", insn[i]); \ - trace_seq_printf(p, "%c", 0); \ - ret; \ - }) - #define KVM_EMUL_INSN_F_CR0_PE (1 << 0) #define KVM_EMUL_INSN_F_EFL_VM (1 << 1) #define KVM_EMUL_INSN_F_CS_D (1 << 2) @@ -786,7 +776,7 @@ TRACE_EVENT(kvm_emulate_insn, TP_printk("%x:%llx:%s (%s)%s", __entry->csbase, __entry->rip, - __print_insn(__entry->insn, __entry->len), + __print_hex(__entry->insn, __entry->len), __print_symbolic(__entry->flags, kvm_trace_symbol_emul_flags), __entry->failed ? 
" failed" : "" diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 769724944fc..c6bc2faaf26 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -571,6 +571,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_flags #undef __print_symbolic +#undef __print_hex #undef __get_dynamic_array #undef __get_str -- cgit v1.2.3-70-g09d2 From 954e482bde20b0e208fd4d34ef26e10afd194600 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 24 May 2012 18:19:45 -0700 Subject: x86/copy_user_generic: Optimize copy_user_generic with CPU erms feature According to Intel 64 and IA-32 SDM and Optimization Reference Manual, beginning with Ivybridge, REG string operation using MOVSB and STOSB can provide both flexible and high-performance REG string operations in cases like memory copy. Enhancement availability is indicated by CPUID.7.0.EBX[9] (Enhanced REP MOVSB/ STOSB). If CPU erms feature is detected, patch copy_user_generic with enhanced fast string version of copy_user_generic. A few new macros are defined to reduce duplicate code in ALTERNATIVE and ALTERNATIVE_2. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1337908785-14015-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/alternative.h | 74 ++++++++++++++++++++++++++++++-------- arch/x86/include/asm/uaccess_64.h | 11 +++++- arch/x86/kernel/x8664_ksyms_64.c | 1 + 3 files changed, 70 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 49331bedc15..70780689599 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" + +#define b_replacement(number) "663"#number +#define e_replacement(number) "664"#number + +#define alt_slen "662b-661b" +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" + +#define ALTINSTR_ENTRY(feature, number) \ + " .long 661b - .\n" /* label */ \ + " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ + " .byte " alt_slen "\n" /* source len */ \ + " .byte " alt_rlen(number) "\n" /* replacement len */ + +#define DISCARD_ENTRY(number) /* rlen <= slen */ \ + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ + b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" + /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - \ - "661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .long 661b - .\n" /* label */ \ - " .long 663f - .\n" /* new instruction */ \ - " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ - " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ - 
"663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature, 1) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ".previous" + +#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + DISCARD_ENTRY(2) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ".previous" /* * This must be included *after* the definition of ALTERNATIVE due to @@ -139,6 +170,19 @@ static inline int alternatives_text_reserved(void *start, void *end) asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) +/* + * Like alternative_call, but there are two features and respective functions. + * If CPU has feature2, function2 is used. + * Otherwise, if CPU has feature1, function1 is used. + * Otherwise, old function is used. + */ +#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ + output, input...) 
\ + asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ + "call %P[new2]", feature2) \ + : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input) + /* * use this macro(s) if you need more than one output parameter * in alternative_io diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8e796fbbf9c..d8def8b3dba 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -17,6 +17,8 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long +copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); +__must_check unsigned long copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); @@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; - alternative_call(copy_user_generic_unrolled, + /* + * If CPU has ERMS feature, use copy_user_enhanced_fast_string. + * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. + * Otherwise, use copy_user_generic_unrolled. 
+ */ + alternative_call_2(copy_user_generic_unrolled, copy_user_generic_string, X86_FEATURE_REP_GOOD, + copy_user_enhanced_fast_string, + X86_FEATURE_ERMS, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 9796c2f3d07..6020f6f5927 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic_string); EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); -- cgit v1.2.3-70-g09d2 From c9fc3f778a6a215ace14ee556067c73982b6d40f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 21 Jun 2012 14:07:16 +0200 Subject: x86, microcode: Sanitize per-cpu microcode reloading interface Microcode reloading in a per-core manner is a very bad idea for both major x86 vendors. And the thing is, we have such interface with which we can end up with different microcode versions applied on different cores of an otherwise homogeneous wrt (family,model,stepping) system. So turn off the possibility of doing that per core and allow it only system-wide. This is a minimal fix which we'd like to see in stable too thus the more-or-less arbitrary decision to allow system-wide reloading only on the BSP: $ echo 1 > /sys/devices/system/cpu/cpu0/microcode/reload ... and disable the interface on the other cores: $ echo 1 > /sys/devices/system/cpu/cpu23/microcode/reload -bash: echo: write error: Invalid argument Also, allowing the reload only from one CPU (the BSP in that case) doesn't allow the reload procedure to degenerate into an O(n^2) deal when triggering reloads from all /sys/devices/system/cpu/cpuX/microcode/reload sysfs nodes simultaneously. A more generic fix will follow. 
Cc: Henrique de Moraes Holschuh Cc: Peter Zijlstra Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1340280437-7718-2-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/microcode_core.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc691718..24b852b61be 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -298,19 +298,31 @@ static ssize_t reload_store(struct device *dev, const char *buf, size_t size) { unsigned long val; - int cpu = dev->id; - ssize_t ret = 0; + int cpu; + ssize_t ret = 0, tmp_ret; + + /* allow reload only from the BSP */ + if (boot_cpu_data.cpu_index != dev->id) + return -EINVAL; ret = kstrtoul(buf, 0, &val); if (ret) return ret; - if (val == 1) { - get_online_cpus(); - if (cpu_online(cpu)) - ret = reload_for_cpu(cpu); - put_online_cpus(); + if (val != 1) + return size; + + get_online_cpus(); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; } + put_online_cpus(); if (!ret) ret = size; -- cgit v1.2.3-70-g09d2 From 3d8986bc7f309483ee09c7a02888bab09072c19b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 21 Jun 2012 14:07:17 +0200 Subject: x86, microcode: Make reload interface per system The reload interface should be per-system so that a full system ucode reload happens (on each core) when doing echo 1 > /sys/devices/system/cpu/microcode/reload Move it to the cpu subsys directory instead of it being per-cpu. Cc: Henrique de Moraes Holschuh Cc: Peter Zijlstra Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1340280437-7718-3-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/microcode_core.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 24b852b61be..947e4c64b1d 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -301,10 +301,6 @@ static ssize_t reload_store(struct device *dev, int cpu; ssize_t ret = 0, tmp_ret; - /* allow reload only from the BSP */ - if (boot_cpu_data.cpu_index != dev->id) - return -EINVAL; - ret = kstrtoul(buf, 0, &val); if (ret) return ret; @@ -351,7 +347,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL); static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &dev_attr_reload.attr, &dev_attr_version.attr, &dev_attr_processor_flags.attr, NULL @@ -528,6 +523,16 @@ static const struct x86_cpu_id microcode_id[] = { MODULE_DEVICE_TABLE(x86cpu, microcode_id); #endif +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); @@ -559,9 +564,17 @@ static int __init microcode_init(void) if (error) goto out_pdev; + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + error = microcode_dev_init(); if (error) - goto out_driver; + goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); @@ -571,7 +584,11 @@ static int __init microcode_init(void) return 0; -out_driver: + out_ucode_group: + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: get_online_cpus(); mutex_lock(&microcode_mutex); @@ -580,7 +597,7 @@
out_driver: mutex_unlock(&microcode_mutex); put_online_cpus(); -out_pdev: + out_pdev: platform_device_unregister(microcode_pdev); return error; @@ -596,6 +613,9 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); unregister_syscore_ops(&mc_syscore_ops); + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + get_online_cpus(); mutex_lock(&microcode_mutex); -- cgit v1.2.3-70-g09d2 From f9808b7fd422b965cea52e05ba470e0a473c53d3 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 1 Jul 2012 18:05:06 +0300 Subject: apic: fix kvm build on UP without IOAPIC On UP i386, when APIC is disabled # CONFIG_X86_UP_APIC is not set # CONFIG_PCI_IOAPIC is not set code looking at apicdrivers never has any effect but it still gets compiled in. In particular, this causes build failures with kvm, but it generally bloats the kernel unnecessarily. Fix by defining both __apicdrivers and __apicdrivers_end to be NULL when CONFIG_X86_LOCAL_APIC is unset: I verified that as the result any loop scanning __apicdrivers gets optimized out by the compiler. Warning: a .config with apic disabled doesn't seem to boot for me (even without this patch). Still verifying why, meanwhile this patch is compile-tested only. Signed-off-by: Michael S. Tsirkin Reported-by: Randy Dunlap Acked-by: Randy Dunlap Acked-by: H.
Peter Anvin Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/apic.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eaff4790ed9..aa5b2eec360 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -417,7 +417,12 @@ extern struct apic *apic; __aligned(sizeof(struct apic *)) \ __section(.apicdrivers) = { &sym1, &sym2 } +#ifdef CONFIG_X86_LOCAL_APIC extern struct apic *__apicdrivers[], *__apicdrivers_end[]; +#else +#define __apicdrivers ((struct apic **)NULL) +#define __apicdrivers_end ((struct apic **)NULL) +#endif /* * APIC functionality to boot other CPUs - only used on SMP: -- cgit v1.2.3-70-g09d2 From 2106a548122ef0557dc51eae4f3f1a538cebfa79 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Fri, 15 Jun 2012 11:31:56 +0800 Subject: KVM: VMX: code clean for vmx_init() Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eeeb4a25aed..e10ec0e4d1c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7290,23 +7290,21 @@ static int __init vmx_init(void) if (!vmx_io_bitmap_a) return -ENOMEM; + r = -ENOMEM; + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) { - r = -ENOMEM; + if (!vmx_io_bitmap_b) goto out; - } vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) { - r = -ENOMEM; + if (!vmx_msr_bitmap_legacy) goto out1; - } + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) { - r = -ENOMEM; + if (!vmx_msr_bitmap_longmode) goto out2; - } + /* * Allow direct access to the PC debug port (it is often used for I/O -- cgit v1.2.3-70-g09d2 From ce5c1fe9a9e059b5c58f0a7e2a3e687d0efac815 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 
20 Jun 2012 11:11:38 +0200 Subject: perf/x86: Fix USER/KERNEL tagging of samples Several perf interrupt handlers (PEBS,IBS,BTS) re-write regs->ip but do not update the segment registers. So use an regs->ip based test instead of an regs->cs/regs->flags based test. Reported-and-tested-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-xxrt0a1zronm1sm36obwc2vy@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c4706cf9c01..6ef9d41b87f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1863,7 +1863,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs) else misc |= PERF_RECORD_MISC_GUEST_KERNEL; } else { - if (user_mode(regs)) + if (!kernel_ip(regs->ip)) misc |= PERF_RECORD_MISC_USER; else misc |= PERF_RECORD_MISC_KERNEL; -- cgit v1.2.3-70-g09d2 From 15c7ad51ad58cbd3b46112c1840bc7228bd354bf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 20:46:33 +0200 Subject: perf/x86: Rename Intel specific macros There are macros that are Intel specific and not x86 generic. Rename them into INTEL_*. 
This patch removes X86_PMC_IDX_GENERIC and does: $ sed -i -e 's/X86_PMC_MAX_/INTEL_PMC_MAX_/g' \ arch/x86/include/asm/kvm_host.h \ arch/x86/include/asm/perf_event.h \ arch/x86/kernel/cpu/perf_event.c \ arch/x86/kernel/cpu/perf_event_p4.c \ arch/x86/kvm/pmu.c $ sed -i -e 's/X86_PMC_IDX_FIXED/INTEL_PMC_IDX_FIXED/g' \ arch/x86/include/asm/perf_event.h \ arch/x86/kernel/cpu/perf_event.c \ arch/x86/kernel/cpu/perf_event_intel.c \ arch/x86/kernel/cpu/perf_event_intel_ds.c \ arch/x86/kvm/pmu.c $ sed -i -e 's/X86_PMC_MSK_/INTEL_PMC_MSK_/g' \ arch/x86/include/asm/perf_event.h \ arch/x86/kernel/cpu/perf_event.c Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/include/asm/perf_event.h | 17 +++++++------- arch/x86/kernel/cpu/perf_event.c | 38 +++++++++++++++---------------- arch/x86/kernel/cpu/perf_event_intel.c | 14 ++++++------ arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 ++-- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- arch/x86/kvm/pmu.c | 22 +++++++++--------- 7 files changed, 50 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db7c1f2709a..2da88c0cda1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -313,8 +313,8 @@ struct kvm_pmu { u64 counter_bitmask[2]; u64 global_ctrl_mask; u8 version; - struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; + struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; struct irq_work irq_work; u64 reprogram_pmi; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 588f52ea810..3b31248caf6 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -5,11 +5,10 @@ * Performance 
event hw details: */ -#define X86_PMC_MAX_GENERIC 32 -#define X86_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_GENERIC 32 +#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 @@ -121,16 +120,16 @@ struct x86_pmu_capability { /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) -#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) +#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) +#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) /* * We model BTS tracing as another fixed-mode PMC. @@ -139,7 +138,7 @@ struct x86_pmu_capability { * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 
*/ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) /* * IBS cpuid feature detection diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e677d9923f4..66805000260 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -63,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event) int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -626,8 +626,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ - if (c->idxmsk64 & (~0ULL << X86_PMC_IDX_FIXED)) { - idx = X86_PMC_IDX_FIXED; + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; @@ -635,7 +635,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } @@ -813,13 +813,13 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - X86_PMC_IDX_FIXED) | 1<<30; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); 
+ hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); @@ -921,7 +921,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -1338,21 +1338,21 @@ static int __init init_hw_perf_events(void) for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; } x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); @@ -1368,7 +1368,7 @@ static int __init init_hw_perf_events(void) */ for_each_event_constraint(c, x86_pmu.event_constraints) { if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { continue; } @@ -1611,8 +1611,8 @@ static int x86_pmu_event_idx(struct perf_event *event) if (!x86_pmu.attr_rdpmc) return 0; - if 
(x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { - idx -= X86_PMC_IDX_FIXED; + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; idx |= 1 << 30; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8408e37f5fa..5b0b362c7ae 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -747,7 +747,7 @@ static void intel_pmu_disable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); intel_pmu_pebs_disable_all(); @@ -763,9 +763,9 @@ static void intel_pmu_enable_all(int added) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; if (WARN_ON_ONCE(!event)) return; @@ -871,7 +871,7 @@ static inline void intel_pmu_ack_status(u64 ack) static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -886,7 +886,7 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; @@ -915,7 +915,7 @@ static void intel_pmu_disable_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; /* @@ -949,7 
+949,7 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) return; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 026373edef7..629ae0b7ad9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -248,7 +248,7 @@ void reserve_ds_buffers(void) */ struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); void intel_pmu_enable_bts(u64 config) { @@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void) u64 to; u64 flags; }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; struct perf_output_handle handle; struct perf_event_header header; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 6c82e403798..92c7e39a079 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void) unsigned int low, high; /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); if (!(low & (1 << 7))) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2e88438ffd8..9b7ec1150ab 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) { - if (idx < X86_PMC_IDX_FIXED) + if (idx < 
INTEL_PMC_IDX_FIXED) return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); else - return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); + return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); } void kvm_deliver_pmi(struct kvm_vcpu *vcpu) @@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - X86_PMC_IDX_FIXED; + int fidx = idx - INTEL_PMC_IDX_FIXED; reprogram_fixed_counter(pmc, fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); } @@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) return; pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - X86_PMC_MAX_GENERIC); + INTEL_PMC_MAX_GENERIC); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; bitmap_len = (entry->eax >> 24) & 0xff; @@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), - X86_PMC_MAX_FIXED); + INTEL_PMC_MAX_FIXED); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; } pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; } @@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) { + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = 
i + INTEL_PMC_IDX_FIXED; } init_irq_work(&pmu->irq_work, trigger_pmi); kvm_pmu_cpuid_update(vcpu); @@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) int i; irq_work_sync(&pmu->irq_work); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) stop_counter(&pmu->fixed_counters[i]); pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = -- cgit v1.2.3-70-g09d2 From a1eac7ac903ea9afbd4f133659710a0588c8eca5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 20:46:34 +0200 Subject: perf/x86: Move Intel specific code to intel_pmu_init() There is some Intel specific code in the generic x86 path. Move it to intel_pmu_init(). Since p4 and p6 pmus don't have fixed counters we may skip the check in case such a pmu is detected. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 34 ++-------------------------------- arch/x86/kernel/cpu/perf_event_intel.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66805000260..7b4f1e871f7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1307,7 +1307,6 @@ static struct attribute_group x86_pmu_format_group = { static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; - struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1338,21 +1337,8 @@ static int __init init_hw_perf_events(void) for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > 
INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); @@ -1361,22 +1347,6 @@ static int __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0); - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { - continue; - } - - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; - } - } - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5b0b362c7ae..2e9444c8014 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1765,6 +1765,7 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; + struct event_constraint *c; unsigned int unused; int version; @@ -1953,5 +1954,37 @@ __init int intel_pmu_init(void) } } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf 
events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { + continue; + } + + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + return 0; } -- cgit v1.2.3-70-g09d2 From b1dc3c4820428ac6216537416b2fcd140fdc52e5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 20:46:35 +0200 Subject: perf/x86/amd: Unify AMD's generic and family 15h pmus There is no need for keeping separate pmu structs. We can enable amd_{get,put}_event_constraints() functions also for family 15h event. The advantage is that there is only a single pmu struct for all AMD cpus. This patch introduces functions to set up the pmu to enable core performance counters or counter constraints. Also, cpuid checks are used instead of family checks where possible. Thus, it enables the code independently of cpu families if the feature flag is set.
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 3 +- arch/x86/kernel/cpu/perf_event_amd.c | 103 ++++++++++++++++------------------- arch/x86/oprofile/op_model_amd.c | 4 +- 3 files changed, 49 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 3b31248caf6..ffdf5e0991c 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -47,8 +47,7 @@ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) #define AMD64_NUM_COUNTERS 4 -#define AMD64_NUM_COUNTERS_F15H 6 -#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H +#define AMD64_NUM_COUNTERS_CORE 6 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 11a4eb9131d..4528ae7b6ec 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; - if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) + if (boot_cpu_data.x86_max_cores < 2) return; nb_id = amd_get_nb_id(cpu); @@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = { NULL, }; -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 
AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints, - - .format_attrs = amd_format_attr, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - /* AMD Family 15h */ #define AMD_EVENT_TYPE_MASK 0x000000F0ULL @@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } -static __initconst const struct x86_pmu amd_pmu_f15h = { - .name = "AMD Family 15h", +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, .enable_all = x86_pmu_enable_all, @@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .disable = x86_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, - .eventsel = MSR_F15H_PERF_CTL, - .perfctr = MSR_F15H_PERF_CTR, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS_F15H, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints_f15h, - /* nortbridge counters not yet implemented: */ -#if 0 + .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_dead = amd_pmu_cpu_dead, -#endif .cpu_starting = amd_pmu_cpu_starting, - .format_attrs = amd_format_attr, + .cpu_dead = amd_pmu_cpu_dead, }; +static int setup_event_constraints(void) +{ + if (boot_cpu_data.x86 >= 0x15) + 
x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + return 0; +} + +static int setup_perfctr_core(void) +{ + if (!cpu_has_perfctr_core) { + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, + KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); + return -ENODEV; + } + + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, + KERN_ERR "hw perf events core counters need constraints handler!"); + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * x86_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + printk(KERN_INFO "perf: AMD core performance counters detected\n"); + + return 0; +} + __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - /* - * If core performance counter extensions exists, it must be - * family 15h, otherwise fail. See x86_pmu_addr_offset(). 
- */ - switch (boot_cpu_data.x86) { - case 0x15: - if (!cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu_f15h; - break; - default: - if (cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu; - break; - } + x86_pmu = amd_pmu; + + setup_event_constraints(); + setup_perfctr_core(); /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 303f0863782..b2b94438ff0 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - if (num_counters == AMD64_NUM_COUNTERS_F15H) { + if (num_counters == AMD64_NUM_COUNTERS_CORE) { msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); } else { @@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops) ops->create_files = setup_ibs_files; if (boot_cpu_data.x86 == 0x15) { - num_counters = AMD64_NUM_COUNTERS_F15H; + num_counters = AMD64_NUM_COUNTERS_CORE; } else { num_counters = AMD64_NUM_COUNTERS; } -- cgit v1.2.3-70-g09d2 From f285f92f7e4c9af20149130c8fd5027131b39b0e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Jun 2012 20:46:36 +0200 Subject: perf/x86: Improve debug output in check_hw_exists() It might be of interest which perfctr msr failed. 
Signed-off-by: Robert Richter [ added hunk to avoid GCC warn ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340217996-2254-5-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 7b4f1e871f7..3eb88ebcec5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -178,7 +178,7 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_new = 0; + u64 val, val_new = ~0; int i, reg, ret = 0; /* @@ -211,8 +211,9 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. */ val = 0xabcdUL; - ret = wrmsrl_safe(x86_pmu_event_addr(0), val); - ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + reg = x86_pmu_event_addr(0); + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; @@ -229,6 +230,7 @@ bios_fail: msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); return false; } -- cgit v1.2.3-70-g09d2 From c93dc84cbe32435be3ffa2fbde355eff94955c32 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jun 2012 14:50:50 +0200 Subject: perf/x86: Add a microcode revision check for SNB-PEBS Recent Intel microcode resolved the SNB-PEBS issues, so conditionally enable PEBS on SNB hardware depending on the microcode revision. Thanks to Stephane for figuring out the various microcode revisions. 
Suggested-by: Stephane Eranian Acked-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-v3672ziwh9damwqwh1uz3krm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 ++ arch/x86/kernel/cpu/perf_event.c | 21 +++++++++----- arch/x86/kernel/cpu/perf_event.h | 4 ++- arch/x86/kernel/cpu/perf_event_intel.c | 51 ++++++++++++++++++++++++++++++++-- arch/x86/kernel/microcode_core.c | 10 +++++-- 5 files changed, 74 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ffdf5e0991c..c78f14a0df0 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -232,6 +232,7 @@ struct perf_guest_switch_msr { extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern void perf_check_microcode(void); #else static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { @@ -245,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) } static inline void perf_events_lapic_init(void) { } +static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3eb88ebcec5..29557aa06dd 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -379,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs_active) { + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ @@ -1650,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void) x86_pmu.flush_branch_stack(); } +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + static 
struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, - .attr_groups = x86_pmu_attr_groups, + .attr_groups = x86_pmu_attr_groups, - .event_init = x86_pmu_event_init, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, .del = x86_pmu_del, @@ -1664,11 +1671,11 @@ static struct pmu pmu = { .stop = x86_pmu_stop, .read = x86_pmu_read, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, - .event_idx = x86_pmu_event_idx, + .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 83238f2a12b..3f5c6690435 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -361,6 +361,8 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); void (*flush_branch_stack)(void); /* @@ -373,7 +375,7 @@ struct x86_pmu { * Intel DebugStore bits */ int bts, pebs; - int bts_active, pebs_active; + int bts_active, pebs_active, pebs_broken; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2e9444c8014..5fdedb4bc3f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1712,11 +1712,56 @@ static __init void intel_clovertown_quirk(void) x86_pmu.pebs_constraints = NULL; } +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + 
} + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + static __init void intel_sandybridge_quirk(void) { - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 947e4c64b1d..1649cf899ad 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -87,6 +87,7 @@ #include #include #include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - mutex_lock(&microcode_mutex); if (uci->valid) { enum ucode_state ustate; @@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu) if (ustate == UCODE_ERROR) err = -EINVAL; } - mutex_unlock(&microcode_mutex); return err; } @@ -309,6 +308,7 @@ static ssize_t reload_store(struct device *dev, return size; get_online_cpus(); + mutex_lock(&microcode_mutex); for_each_online_cpu(cpu) { tmp_ret = reload_for_cpu(cpu); if (tmp_ret != 0) @@ -318,6 +318,9 @@ static ssize_t reload_store(struct device *dev, if (!ret) ret = tmp_ret; } + if (!ret) + perf_check_microcode(); + mutex_unlock(&microcode_mutex); put_online_cpus(); if
(!ret) @@ -557,7 +560,8 @@ static int __init microcode_init(void) mutex_lock(&microcode_mutex); error = subsys_interface_register(&mc_cpu_interface); - + if (!error) + perf_check_microcode(); mutex_unlock(&microcode_mutex); put_online_cpus(); -- cgit v1.2.3-70-g09d2 From 3e0091e2b6f8cd59e567f247e345a3a6ad1f6e7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jun 2012 23:38:39 +0200 Subject: perf/x86: Save a few bytes in 'struct x86_pmu' All these are basically boolean flags, use a bitfield to save a few bytes. Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-vsevd5g8lhcn129n3s7trl7r@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3f5c6690435..a15df4be151 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -374,8 +374,11 @@ struct x86_pmu { /* * Intel DebugStore bits */ - int bts, pebs; - int bts_active, pebs_active, pebs_broken; + int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; -- cgit v1.2.3-70-g09d2 From eca26c9950f4d3e9c92ba275e9f4aee834aa1913 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Jun 2012 15:09:12 +0800 Subject: perf/x86: Use 0xff as pseudo code for fixed uncore event Stephane Eranian suggested using 0xff as pseudo code for fixed uncore event and using the umask value to determine which of the fixed events we want to map to. So far there is at most one fixed counter in an uncore PMU. So just change the definition of UNCORE_FIXED_EVENT to 0xff.
Suggested-by: Stephane Eranian Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340780953-21130-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 4 ++-- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 6f43f9584e3..c42a3f7b523 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -179,7 +179,7 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { }; static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), { /* end: all zeroes */ }, @@ -616,7 +616,7 @@ static struct attribute_group nhm_uncore_format_group = { }; static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0xff"), + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 4d52db0d1df..88498c7b342 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -9,7 +9,7 @@ #define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) -#define UNCORE_FIXED_EVENT 0xffff +#define UNCORE_FIXED_EVENT 0xff #define UNCORE_PMC_IDX_MAX_GENERIC 8 #define 
UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC #define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) -- cgit v1.2.3-70-g09d2 From 3b19e4c98c035c9ab218fc64ef26f4f7a30eafb9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Jun 2012 14:56:36 +0800 Subject: perf/x86: Fix event constraint for SandyBridge-EP C-Box The constraint for C-Box event 0x1f should have overlap flag set. Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340866596-22502-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c42a3f7b523..7d755d2e1c9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -239,7 +239,7 @@ static struct event_constraint snbep_uncore_cbox_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1f, 0xe), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), UNCORE_EVENT_CONSTRAINT(0x21, 0x3), UNCORE_EVENT_CONSTRAINT(0x23, 0x3), UNCORE_EVENT_CONSTRAINT(0x31, 0x3), -- cgit v1.2.3-70-g09d2 From 42089697244ba8e64fa43fb5e6d50d47a8e4cb00 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Jul 2012 14:00:14 +0800 Subject: perf/x86: Detect number of instances of uncore CBox The CBox manages the interface between the core and the LLC, so the instances of uncore CBox is equal to number of cores. 
Reported-by: Andrew Cooks Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341381616-12229-4-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 7d755d2e1c9..4fecbd00ee7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1605,8 +1605,9 @@ static void __init uncore_cpu_setup(void *dummy) static int __init uncore_cpu_init(void) { - int ret, cpu; + int ret, cpu, max_cores; + max_cores = boot_cpu_data.x86_max_cores; switch (boot_cpu_data.x86_model) { case 26: /* Nehalem */ case 30: @@ -1615,9 +1616,13 @@ static int __init uncore_cpu_init(void) msr_uncores = nhm_msr_uncores; break; case 42: /* Sandy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; msr_uncores = snb_msr_uncores; break; case 45: /* Sandy Birdge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; msr_uncores = snbep_msr_uncores; break; default: -- cgit v1.2.3-70-g09d2 From 6a67943a18c264d5f3df436da38edb3e59adc905 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Jul 2012 14:00:15 +0800 Subject: perf/x86: Uncore filter support for SandyBridge-EP This patch adds C-Box and PCU filter support for SandyBridge-EP uncore. We can filter C-Box events by thread/core ID and filter PCU events by frequency/voltage. 
Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341381616-12229-5-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 228 +++++++++++++++++++++----- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 24 ++- 2 files changed, 206 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 4fecbd00ee7..19faffc6088 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -14,10 +14,13 @@ static cpumask_t uncore_cpu_mask; /* constraint for the fixed counter */ static struct event_constraint constraint_fixed = EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); @@ -26,8 +29,19 @@ DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15"); 
+DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31"); /* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; @@ -120,6 +134,10 @@ static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } @@ -149,6 +167,71 @@ static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); } +static struct event_constraint * +snbep_uncore_get_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + unsigned long flags; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config1 == reg1->config) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (box->phys_id >= 0) + reg1->alloc = 1; + return NULL; + } + return &constraint_empty; +} + +static void snbep_uncore_put_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + if (box->phys_id < 0 || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +static int snbep_uncore_hw_config(struct intel_uncore_box 
*box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (box->pmu->type == &snbep_uncore_cbox) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & + SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; + } else if (box->pmu->type == &snbep_uncore_pcu) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->config = event->attr.config1 & + SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; + } else { + return 0; + } + reg1->idx = 0; + return 0; +} + static struct attribute *snbep_uncore_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -167,6 +250,20 @@ static struct attribute *snbep_uncore_ubox_formats_attr[] = { NULL, }; +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + static struct attribute *snbep_uncore_pcu_formats_attr[] = { &format_attr_event.attr, &format_attr_occ_sel.attr, @@ -175,6 +272,10 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { &format_attr_thresh5.attr, &format_attr_occ_invert.attr, &format_attr_occ_edge.attr, + &format_attr_filter_brand0.attr, + &format_attr_filter_brand1.attr, + &format_attr_filter_brand2.attr, + &format_attr_filter_brand3.attr, NULL, }; @@ -203,6 +304,11 @@ static struct attribute_group snbep_uncore_ubox_format_group = { .attrs = snbep_uncore_ubox_formats_attr, }; +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + static struct attribute_group snbep_uncore_pcu_format_group = { .name = "format", .attrs = snbep_uncore_pcu_formats_attr, @@ -215,6 +321,9 @@ static struct 
intel_uncore_ops snbep_uncore_msr_ops = { .disable_event = snbep_uncore_msr_disable_event, .enable_event = snbep_uncore_msr_enable_event, .read_counter = snbep_uncore_msr_read_counter, + .get_constraint = snbep_uncore_get_constraint, + .put_constraint = snbep_uncore_put_constraint, + .hw_config = snbep_uncore_hw_config, }; static struct intel_uncore_ops snbep_uncore_pci_ops = { @@ -307,31 +416,33 @@ static struct intel_uncore_type snbep_uncore_ubox = { }; static struct intel_uncore_type snbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .constraints = snbep_uncore_cbox_constraints, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_format_group, + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_cbox_format_group, }; static struct intel_uncore_type snbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_pcu_format_group, + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + 
.num_shared_regs = 1, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, }; static struct intel_uncore_type *snbep_msr_uncores[] = { @@ -747,15 +858,22 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -struct intel_uncore_box *uncore_alloc_box(int cpu) +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, + int cpu) { struct intel_uncore_box *box; + int i, size; - box = kmalloc_node(sizeof(*box), GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + size = sizeof(*box) + type->num_shared_regs * + sizeof(struct intel_uncore_extra_reg); + + box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); if (!box) return NULL; + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + uncore_pmu_init_hrtimer(box); atomic_set(&box->refcnt, 1); box->cpu = -1; @@ -834,11 +952,18 @@ static int uncore_collect_events(struct intel_uncore_box *box, } static struct event_constraint * -uncore_event_constraint(struct intel_uncore_type *type, - struct perf_event *event) +uncore_get_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) { + struct intel_uncore_type *type = box->pmu->type; struct event_constraint *c; + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + if (event->hw.config == ~0ULL) return &constraint_fixed; @@ -852,19 +977,25 @@ uncore_event_constraint(struct intel_uncore_type *type, return &type->unconstrainted; } +static void uncore_put_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) { unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; - int i, ret, wmin, 
wmax; + int i, wmin, wmax, ret = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - c = uncore_event_constraint(box->pmu->type, - box->event_list[i]); + c = uncore_get_event_constraint(box, box->event_list[i]); constraints[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -888,13 +1019,17 @@ static int uncore_assign_events(struct intel_uncore_box *box, break; __set_bit(hwc->idx, used_mask); - assign[i] = hwc->idx; + if (assign) + assign[i] = hwc->idx; } - if (i == n) - return 0; - /* slow path */ - ret = perf_assign_events(constraints, n, wmin, wmax, assign); + if (i != n) + ret = perf_assign_events(constraints, n, wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } return ret ? -EINVAL : 0; } @@ -1021,6 +1156,8 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags) for (i = 0; i < box->n_events; i++) { if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + while (++i < box->n_events) box->event_list[i - 1] = box->event_list[i]; @@ -1048,10 +1185,9 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, { struct perf_event *leader = event->group_leader; struct intel_uncore_box *fake_box; - int assign[UNCORE_PMC_IDX_MAX]; int ret = -EINVAL, n; - fake_box = uncore_alloc_box(smp_processor_id()); + fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); if (!fake_box) return -ENOMEM; @@ -1073,7 +1209,7 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, fake_box->n_events = n; - ret = uncore_assign_events(fake_box, assign, n); + ret = uncore_assign_events(fake_box, NULL, n); out: kfree(fake_box); return ret; @@ -1117,6 +1253,10 @@ int uncore_pmu_event_init(struct perf_event *event) return -EINVAL; event->cpu = box->cpu; + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; 
+ if (event->attr.config == UNCORE_FIXED_EVENT) { /* no fixed counter */ if (!pmu->type->fixed_ctl) @@ -1130,11 +1270,13 @@ int uncore_pmu_event_init(struct perf_event *event) hwc->config = ~0ULL; } else { hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } } - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - if (event->group_leader != event) ret = uncore_validate_group(pmu, event); else @@ -1276,7 +1418,7 @@ static int __devinit uncore_pci_add(struct intel_uncore_type *type, if (phys_id < 0) return -ENODEV; - box = uncore_alloc_box(0); + box = uncore_alloc_box(type, 0); if (!box) return -ENOMEM; @@ -1458,7 +1600,7 @@ static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) if (pmu->func_id < 0) pmu->func_id = j; - box = uncore_alloc_box(cpu); + box = uncore_alloc_box(type, cpu); if (!box) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 88498c7b342..b13e9ea81de 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -97,6 +97,10 @@ SNBEP_PMON_CTL_INVERT | \ SNBEP_U_MSR_PMON_CTL_TRESH_MASK) +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + /* SNB-EP PCU event control */ #define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 #define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 @@ -140,15 +144,17 @@ /* SNB-EP Cbo register */ #define SNBEP_C0_MSR_PMON_CTR0 0xd16 #define SNBEP_C0_MSR_PMON_CTL0 0xd10 -#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 #define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f #define SNBEP_CBO_MSR_OFFSET 0x20 /* SNB-EP PCU register */ #define SNBEP_PCU_MSR_PMON_CTR0 0xc36 #define SNBEP_PCU_MSR_PMON_CTL0 0xc30 
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 #define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff #define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc #define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd @@ -163,7 +169,6 @@ struct intel_uncore_type { int num_boxes; int perf_ctr_bits; int fixed_ctr_bits; - int single_fixed; unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; @@ -171,6 +176,8 @@ struct intel_uncore_type { unsigned fixed_ctl; unsigned box_ctl; unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -188,6 +195,10 @@ struct intel_uncore_ops { void (*disable_event)(struct intel_uncore_box *, struct perf_event *); void (*enable_event)(struct intel_uncore_box *, struct perf_event *); u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); }; struct intel_uncore_pmu { @@ -200,6 +211,12 @@ struct intel_uncore_pmu { struct list_head box_list; }; +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config1; + atomic_t ref; +}; + struct intel_uncore_box { int phys_id; int n_active; /* number of active events */ @@ -215,6 +232,7 @@ struct intel_uncore_box { struct intel_uncore_pmu *pmu; struct hrtimer hrtimer; struct list_head list; + struct intel_uncore_extra_reg shared_regs[0]; }; #define UNCORE_BOX_FLAG_INITIATED 0 -- cgit v1.2.3-70-g09d2 From 15fa325bebbe11184a051ed64965164fca8c1df2 Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Mon, 25 Jun 2012 21:32:32 -0600 Subject: x86/PCI: adjust section annotations for pcibios_setup() Make pcibios_setup() consistently use the "__init" section annotation. 
Signed-off-by: Myron Stowe Signed-off-by: Bjorn Helgaas --- arch/x86/pci/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 0ad990a20d4..720e973fc34 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -494,7 +494,7 @@ int __init pcibios_init(void) return 0; } -char * __devinit pcibios_setup(char *str) +char * __init pcibios_setup(char *str) { if (!strcmp(str, "off")) { pci_probe = 0; -- cgit v1.2.3-70-g09d2 From b39f25a849d7677a7dbf183f2483fd41c201a5ce Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 25 Jun 2012 13:38:27 -0700 Subject: x86/apic: Optimize cpu traversal in __assign_irq_vector() using domain membership Currently __assign_irq_vector() goes through each cpu in the specified mask until it finds a free vector in all the cpu's that are part of the same interrupt domain. We visit all the interrupt domain sibling cpus to reserve the free vector. So, when we fail to find a free vector in an interrupt domain, it is safe to continue our search with a cpu belonging to a new interrupt domain. No need to go through each cpu, if the domain containing that cpu is already visited. Use the irq_cfg's old_domain to track the visited domains and optimize the cpu traversal while finding a free vector in the given cpumask. NOTE: We can also optimize the search by using for_each_cpu() and skip the current cpu, if it is not the first cpu in the mask returned by the vector_allocation_domain(). But re-using the cfg->old_domain to track the visited domains will be slightly faster. 
Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Acked-by: Alexander Gordeev Acked-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1340656709-11423-2-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 +++----- arch/x86/kernel/apic/apic_noop.c | 3 +-- arch/x86/kernel/apic/io_apic.c | 15 ++++++++------- arch/x86/kernel/apic/x2apic_cluster.c | 3 +-- arch/x86/kernel/vsmp_64.c | 3 +-- 5 files changed, 14 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eec240e1209..8bebeb8952f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,7 @@ struct apic { unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); - bool (*vector_allocation_domain)(int cpu, struct cpumask *retmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -614,7 +614,7 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, const struct cpumask *andmask, unsigned int *apicid); -static inline bool +static inline void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. 
Some cpus do not strictly honor the set of cpus @@ -627,14 +627,12 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask) */ cpumask_clear(retmask); cpumask_bits(retmask)[0] = APIC_ALL_CPUS; - return false; } -static inline bool +static inline void default_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_copy(retmask, cpumask_of(cpu)); - return true; } static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 65c07fc630a..08c337bc49f 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,12 +100,11 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static bool noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); cpumask_copy(retmask, cpumask_of(cpu)); - return true; } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a951ef7decb..8a08f09aa50 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1134,12 +1134,13 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - for_each_cpu_and(cpu, mask, cpu_online_mask) { + cpumask_clear(cfg->old_domain); + cpu = cpumask_first_and(mask, cpu_online_mask); + while (cpu < nr_cpu_ids) { int new_cpu; int vector, offset; - bool more_domains; - more_domains = apic->vector_allocation_domain(cpu, tmp_mask); + apic->vector_allocation_domain(cpu, tmp_mask); if (cpumask_subset(tmp_mask, cfg->domain)) { free_cpumask_var(tmp_mask); @@ -1156,10 +1157,10 @@ next: } if (unlikely(current_vector == vector)) { - if (more_domains) - continue; - else - 
break; + cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + continue; } if (test_bit(vector, used_vectors)) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 943d03fc6fc..b5d889b5659 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -212,11 +212,10 @@ static int x2apic_cluster_probe(void) /* * Each x2apic cluster is an allocation domain. */ -static bool cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_clear(retmask); cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); - return true; } static struct apic apic_x2apic_cluster = { diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index fa5adb7c228..3f0285ac00f 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -208,10 +208,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) * In vSMP, all cpus should be capable of handling interrupts, regardless of * the APIC used. */ -static bool fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) { cpumask_setall(retmask); - return false; } static void vsmp_apic_post_init(void) -- cgit v1.2.3-70-g09d2 From 1ac322d0b169c95ce34d55b3ed6d40ce1a5f3a02 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 25 Jun 2012 13:38:28 -0700 Subject: x86/apic/x2apic: Limit the vector reservation to the user specified mask For the x2apic cluster mode, vector for an interrupt is currently reserved on all the cpu's that are part of the x2apic cluster. But the interrupts will be routed only to the cluster (derived from the first cpu in the mask) members specified in the mask. 
So there is no need to reserve the vector in the unused cluster members. Modify __assign_irq_vector() to reserve the vectors based on the user specified irq destination mask. If the new mask is a proper subset of the currently used mask, cleanup the vector allocation on the unused cpu members. Also, allow the apic driver to tune the vector domain based on the affinity mask (which in most cases is the user-specified mask). Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Acked-by: Alexander Gordeev Acked-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1340656709-11423-3-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 9 ++++++--- arch/x86/kernel/apic/apic_noop.c | 3 ++- arch/x86/kernel/apic/io_apic.c | 31 +++++++++++++++---------------- arch/x86/kernel/apic/x2apic_cluster.c | 6 +++--- arch/x86/kernel/vsmp_64.c | 3 ++- 5 files changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 8bebeb8952f..88093c1d44f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,8 @@ struct apic { unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, + const struct cpumask *mask); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -615,7 +616,8 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, unsigned int *apicid); static inline void -flat_vector_allocation_domain(int cpu, struct cpumask *retmask) +flat_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { /* Careful. 
Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -630,7 +632,8 @@ flat_vector_allocation_domain(int cpu, struct cpumask *retmask) } static inline void -default_vector_allocation_domain(int cpu, struct cpumask *retmask) +default_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { cpumask_copy(retmask, cpumask_of(cpu)); } diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 08c337bc49f..e145f28b409 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,7 +100,8 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8a08f09aa50..9684f963bef 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1113,7 +1113,6 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; - unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask; @@ -1123,28 +1122,28 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) return -ENOMEM; - old_vector = cfg->vector; - if (old_vector) { - cpumask_and(tmp_mask, mask, cpu_online_mask); - if (cpumask_subset(tmp_mask, cfg->domain)) { - free_cpumask_var(tmp_mask); - return 0; - } - } - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; cpumask_clear(cfg->old_domain); cpu = cpumask_first_and(mask, cpu_online_mask); 
while (cpu < nr_cpu_ids) { - int new_cpu; - int vector, offset; + int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask); + apic->vector_allocation_domain(cpu, tmp_mask, mask); if (cpumask_subset(tmp_mask, cfg->domain)) { - free_cpumask_var(tmp_mask); - return 0; + err = 0; + if (cpumask_equal(tmp_mask, cfg->domain)) + break; + /* + * New cpumask using the vector is a proper subset of + * the current in use mask. So cleanup the vector + * allocation for the members that are not used anymore. + */ + cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); + cfg->move_in_progress = 1; + cpumask_and(cfg->domain, cfg->domain, tmp_mask); + break; } vector = current_vector; @@ -1172,7 +1171,7 @@ next: /* Found one! */ current_vector = vector; current_offset = offset; - if (old_vector) { + if (cfg->vector) { cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b5d889b5659..bde78d0098a 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -212,10 +212,10 @@ static int x2apic_cluster_probe(void) /* * Each x2apic cluster is an allocation domain. */ -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { - cpumask_clear(retmask); - cpumask_copy(retmask, per_cpu(cpus_in_cluster, cpu)); + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } static struct apic apic_x2apic_cluster = { diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 3f0285ac00f..992f890283e 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -208,7 +208,8 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) * In vSMP, all cpus should be capable of handling interrupts, regardless of * the APIC used. 
*/ -static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { cpumask_setall(retmask); } -- cgit v1.2.3-70-g09d2 From d872818dbbeed1bccf58c7f8c7db432154c802f9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 25 Jun 2012 13:38:29 -0700 Subject: x86/apic/x2apic: Use multiple cluster members for the irq destination only with the explicit affinity During boot or driver load etc, interrupt destination is set up using default target cpu's. Later the user (irqbalance etc) or the driver (irq_set_affinity/ irq_set_affinity_hint) can request the interrupt to be migrated to some specific set of cpu's. In the x2apic cluster routing, for the default scenario use single cpu as the interrupt destination and when there is an explicit interrupt affinity request, route the interrupt to multiple members of an x2apic cluster specified in the cpumask of the migration request. This will minimize the vector pressure when there are a lot of interrupt sources and relatively few x2apic clusters (for example a single socket server). This will allow the performance critical interrupts to be routed to multiple cpu's in the x2apic cluster (irqbalance for example uses the cache siblings etc while specifying the interrupt destination) and allow non-critical interrupts to be serviced by a single logical cpu.
Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Acked-by: Alexander Gordeev Acked-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1340656709-11423-4-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index bde78d0098a..c88baa4ff0e 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -209,13 +209,30 @@ static int x2apic_cluster_probe(void) return 0; } +static const struct cpumask *x2apic_cluster_target_cpus(void) +{ + return cpu_all_mask; +} + /* * Each x2apic cluster is an allocation domain. */ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, const struct cpumask *mask) { - cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); + /* + * To minimize vector pressure, default case of boot, device bringup + * etc will use a single cpu for the interrupt destination. + * + * On explicit migration requests coming from irqbalance etc, + * interrupts will be routed to the x2apic cluster (cluster-id + * derived from the first cpu in the mask) members specified + * in the mask. 
+ */ + if (mask == x2apic_cluster_target_cpus()) + cpumask_copy(retmask, cpumask_of(cpu)); + else + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } static struct apic apic_x2apic_cluster = { @@ -229,7 +246,7 @@ static struct apic apic_x2apic_cluster = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = online_target_cpus, + .target_cpus = x2apic_cluster_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, -- cgit v1.2.3-70-g09d2 From c3b7cdf180090d2686239a75bb0ae408108ed749 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 6 Jul 2012 12:59:46 +0300 Subject: perf/x86: Fix intel_perfmon_event_map formatting Use tabs for "intel_perfmon_event_map" formatting in perf_event_intel.c. Signed-off-by: Pekka Enberg Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1341568786-7045-1-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5fdedb4bc3f..1f4c8add675 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -21,14 +21,14 @@ */ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, +
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = -- cgit v1.2.3-70-g09d2 From e676505ac96813e8b93170b1f5e5ffe0cf6a2348 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 8 Jul 2012 17:16:30 +0300 Subject: KVM: MMU: Force cr3 reload with two dimensional paging on mov cr3 emulation Currently the MMU's ->new_cr3() callback does nothing when guest paging is disabled or when two-dimensional paging (e.g. EPT on Intel) is active. This means that an emulated write to cr3 can be lost; kvm_set_cr3() will write vcpu->arch.cr3, but the GUEST_CR3 field in the VMCS will retain its old value and this is what the guest sees. This bug did not have any effect until now because: - with unrestricted guest, or with svm, we never emulate a mov cr3 instruction - without unrestricted guest, and with paging enabled, we also never emulate a mov cr3 instruction - without unrestricted guest, but with paging disabled, the guest's cr3 is ignored until the guest enables paging; at this point the value from arch.cr3 is loaded correctly by the mov cr0 instruction which turns on paging However, the patchset that enables big real mode causes us to emulate mov cr3 instructions in protected mode sometimes (when guest state is not virtualizable by vmx); this mov cr3 is effectively ignored and will crash the guest. The fix is to make nonpaging_new_cr3() call mmu_free_roots() to force a cr3 reload. This is awkward because now all the new_cr3 callbacks do the same thing, and because mmu_free_roots() is somewhat of an overkill; but fixing that is more complicated and will be done after this minimal fix. Observed in the Windows XP 32-bit installer while bringing up secondary vcpus.
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b53d9e08bf..569cd66ba24 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -188,6 +188,7 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static void mmu_spte_set(u64 *sptep, u64 spte); +static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { @@ -2401,6 +2402,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { + mmu_free_roots(vcpu); } static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, -- cgit v1.2.3-70-g09d2 From d881e6f6cffe3993245963143cab2528f918e071 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 6 Jun 2012 18:36:48 +0300 Subject: KVM: VMX: Return correct CPL during transition to protected mode In protected mode, the CPL is defined as the lower two bits of CS, as set by the last far jump. But during the transition to protected mode, there is no last far jump, so we need to return zero (the inherited real mode CPL). Fix by reading CPL from the cache during the transition. This isn't 100% correct since we don't set the CPL cache on a far jump, but since protected mode transition will always jump to a segment with RPL=0, it will always work. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e10ec0e4d1c..486db2f9561 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3175,11 +3175,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) static int vmx_get_cpl(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations + * fail; use the cache instead. 
+ */ + if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { + return vmx->cpl; + } + if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + vmx->cpl = __vmx_get_cpl(vcpu); } - return to_vmx(vcpu)->cpl; + + return vmx->cpl; } -- cgit v1.2.3-70-g09d2 From 62046e5a867cbff35e0beff42718dda41ff5d74b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 14:07:48 +0300 Subject: KVM: Split cpuid register access from computation Introduce kvm_cpuid() to perform the leaf limit check and calculate register values, and let kvm_emulate_cpuid() just handle reading and writing the registers from/to the vcpu. This allows us to reuse kvm_cpuid() in a context where directly reading and writing registers is not desired. Signed-off-by: Avi Kivity --- arch/x86/kvm/cpuid.c | 40 ++++++++++++++++++++++------------------ arch/x86/kvm/cpuid.h | 1 + 2 files changed, 23 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 61ccbdf3d0a..197afd53e3a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -640,33 +640,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - u32 function, index; + u32 function = *eax, index = *ecx; struct kvm_cpuid_entry2 *best; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = kvm_find_cpuid_entry(vcpu, function, index); if (!best) best = check_cpuid_limit(vcpu, function, index); if (best) { - 
kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } + *eax = best->eax; + *ebx = best->ebx; + *ecx = best->ecx; + *edx = best->edx; + } else + *eax = *ebx = *ecx = *edx = 0; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, eax, ebx, ecx, edx; + + function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); + kvm_register_write(vcpu, VCPU_REGS_RAX, eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, - kvm_register_read(vcpu, VCPU_REGS_RAX), - kvm_register_read(vcpu, VCPU_REGS_RBX), - kvm_register_read(vcpu, VCPU_REGS_RCX), - kvm_register_read(vcpu, VCPU_REGS_RDX)); + trace_kvm_cpuid(function, eax, ebx, ecx, edx); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 26d1fb437eb..f449edc35e2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 0017f93a2776597b798ec1a9594e41dfd96d3c11 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 14:10:16 +0300 Subject: KVM: x86 emulator: change ->get_cpuid() accessor to use the x86 semantics Instead of getting an exact leaf, follow the spec and fall back to the last main leaf instead. This lets us easily emulate the cpuid instruction in the emulator. 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 4 +-- arch/x86/kvm/emulate.c | 53 +++++++++++++++++++------------------- arch/x86/kvm/x86.c | 20 ++------------ 3 files changed, 30 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1ac46c22dd5..cd5c96b2496 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -192,8 +192,8 @@ struct x86_emulate_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); - bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f95d242ee9f..ba1f8ecaab5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1993,8 +1993,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) u32 eax, ebx, ecx, edx; eax = ecx = 0; - return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) - && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; } @@ -2013,32 +2013,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) eax = 0x00000000; ecx = 0x00000000; - if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { - /* - * Intel ("GenuineIntel") - * remark: Intel CPUs only support "syscall" in 64bit - * longmode. Also an 64bit guest with a - * 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD - * response - CPUs of AMD can't behave like Intel. 
- */ - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) - return false; + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. + */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; - /* AMD ("AuthenticAMD") */ - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) - return true; - - /* AMD ("AMDisbetter!") */ - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) - return true; - } + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; /* default: (not Intel, not AMD), apply Intel's stricter rules... 
*/ return false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8eacb2e6456..ff0b487e725 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4302,26 +4302,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } -static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, +static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - struct kvm_cpuid_entry2 *cpuid = NULL; - - if (eax && ecx) - cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), - *eax, *ecx); - - if (cpuid) { - *eax = cpuid->eax; - *ecx = cpuid->ecx; - if (ebx) - *ebx = cpuid->ebx; - if (edx) - *edx = cpuid->edx; - return true; - } - - return false; + kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); } static struct x86_emulate_ops emulate_ops = { -- cgit v1.2.3-70-g09d2 From 6d6eede4a0492c7478d44d7c8fae80c3bcf529d9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 14:11:36 +0300 Subject: KVM: x86 emulator: emulate cpuid Opcode 0F A2. Used by Linux during the mode change trampoline while in a state that is not virtualizable on vmx without unrestricted_guest, so we need to emulate it if emulate_invalid_guest_state=1.
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ba1f8ecaab5..db95a55d593 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3142,6 +3142,20 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_cpuid(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = ctxt->regs[VCPU_REGS_RAX]; + ecx = ctxt->regs[VCPU_REGS_RCX]; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ctxt->regs[VCPU_REGS_RAX] = eax; + ctxt->regs[VCPU_REGS_RBX] = ebx; + ctxt->regs[VCPU_REGS_RCX] = ecx; + ctxt->regs[VCPU_REGS_RDX] = edx; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3634,7 +3648,7 @@ static struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), + II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ -- cgit v1.2.3-70-g09d2 From 79d5b4c3cd809c770d4bf9812635647016c56011 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 17:03:42 +0300 Subject: KVM: x86 emulator: allow loading null SS in long mode Null SS is valid in long mode; allow loading it. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index db95a55d593..fe4340f6213 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1324,8 +1324,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto load; } - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + rpl = selector & 3; + cpl = ctxt->ops->cpl(ctxt); + + /* NULL selector is not valid for TR, CS and SS (except for long mode) */ + if ((seg == VCPU_SREG_CS + || (seg == VCPU_SREG_SS + && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) + || seg == VCPU_SREG_TR) && null_selector) goto exception; @@ -1352,9 +1358,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; } - rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ctxt->ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: -- cgit v1.2.3-70-g09d2 From 510425ff3344df03a1f94bce49e659ae302e0d34 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 17:04:36 +0300 Subject: KVM: x86 emulator: fix LIDT/LGDT in long mode The operand size for these instructions is 8 bytes in long mode, even without a REX prefix. Set it explicitly. Triggered while booting Linux with emulate_invalid_guest_state=1. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fe4340f6213..24c84251648 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2997,6 +2997,8 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt) struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); @@ -3024,6 +3026,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); -- cgit v1.2.3-70-g09d2 From f0495f9b9992f80f82b14306946444b287193390 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 17:06:10 +0300 Subject: KVM: VMX: Relax check on unusable segment Some userspace (e.g. QEMU 1.1) munge the d and g bits of segment descriptors, causing us not to recognize them as unusable segments with emulate_invalid_guest_state=1. Relax the check by testing for segment not present (a non-present segment cannot be usable). 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 486db2f9561..82ab1fb2683 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3198,7 +3198,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; - if (var->unusable) + if (var->unusable || !var->present) ar = 1 << 16; else { ar = var->type & 15; @@ -3210,8 +3210,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) ar |= (var->db & 1) << 14; ar |= (var->g & 1) << 15; } - if (ar == 0) /* a 0 value means unusable */ - ar = AR_UNUSABLE_MASK; return ar; } -- cgit v1.2.3-70-g09d2 From b8405c184b4ef3abcebc5cf2211215944d6e2acc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 17:08:48 +0300 Subject: KVM: VMX: Limit iterations with emulator_invalid_guest_state Otherwise, if the guest ends up looping, we never exit the srcu critical section, which causes synchronize_srcu() to hang. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82ab1fb2683..debac498434 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4977,11 +4977,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) int ret = 1; u32 cpu_exec_ctrl; bool intr_window_requested; + unsigned count = 130; cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu)) { + while (!guest_state_valid(vcpu) && count-- != 0) { if (intr_window_requested && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) return handle_interrupt_window(&vmx->vcpu); -- cgit v1.2.3-70-g09d2 From f47cfa3174ad8bd39e56524b36e79c463bf820b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2012 17:49:24 +0300 Subject: KVM: x86 emulator: emulate LEAVE Opcode c9; used by some variants of Windows during boot, in big real mode. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 24c84251648..33ccd757cb1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -433,11 +433,27 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, return ctxt->ops->intercept(ctxt, &info, stage); } +static void assign_masked(ulong *dest, ulong src, ulong mask) +{ + *dest = (*dest & ~mask) | (src & mask); +} + static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { return (1UL << (ctxt->ad_bytes << 3)) - 1; } +static ulong stack_mask(struct x86_emulate_ctxt *ctxt) +{ + u16 sel; + struct desc_struct ss; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + return ~0UL; + ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); + return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ +} + /* Access/update address held in a register, based on addressing mode. 
*/ static inline unsigned long address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) @@ -1560,6 +1576,13 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } +static int em_leave(struct x86_emulate_ctxt *ctxt) +{ + assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], + stack_mask(ctxt)); + return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); +} + static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; @@ -3582,7 +3605,7 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - N, N, N, I(ImplicitOps | Stack, em_ret_far), + N, I(Stack, em_leave), N, I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ -- cgit v1.2.3-70-g09d2 From cbd27ee783f1e56d56415e8c5f2492ccedd565c4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jun 2012 17:11:00 +0300 Subject: KVM: x86 emulator: initialize memop memop is not initialized; this can cause a two-byte operation following a 4-byte operation to see garbage values. Usually truncation fixes things for us later on, but at least in one case (call abs) it doesn't. Fix by moving memop to the auto-initialized field area. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index cd5c96b2496..c764f43b71c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -280,9 +280,9 @@ struct x86_emulate_ctxt { u8 modrm_seg; bool rip_relative; unsigned long _eip; + struct operand memop; /* Fields above regs are cleared together. 
*/ unsigned long regs[NR_VCPU_REGS]; - struct operand memop; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; -- cgit v1.2.3-70-g09d2 From a6e3407bb1570ac5d8d7fc471bca07d531d1dde7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jun 2012 17:15:39 +0300 Subject: KVM: Fix SS default ESP/EBP based addressing We correctly default to SS when BP is used as a base in 16-bit address mode, but we don't do that for 32-bit mode. Fix by adjusting the default to SS when either ESP or EBP is used as the base register. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 33ccd757cb1..7552c0ac6e7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -974,6 +974,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->orig_val = op->val; } +static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) +{ + if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) + ctxt->modrm_seg = VCPU_SREG_SS; +} + static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -1077,15 +1083,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); - else + else { modrm_ea += ctxt->regs[base_reg]; + adjust_modrm_seg(ctxt, base_reg); + } if (index_reg != 4) modrm_ea += ctxt->regs[index_reg] << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; - } else - modrm_ea += ctxt->regs[ctxt->modrm_rm]; + } else { + base_reg = ctxt->modrm_rm; + modrm_ea += ctxt->regs[base_reg]; + adjust_modrm_seg(ctxt, base_reg); + } switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) -- cgit v1.2.3-70-g09d2 From 96051572c819194c37a8367624b285be10297eca Mon Sep 17 00:00:00 2001 From: Avi 
Kivity Date: Sun, 10 Jun 2012 17:21:18 +0300 Subject: KVM: x86 emulator: emulate SGDT/SIDT Opcodes 0F 01 /0 and 0F 01 /1 Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7552c0ac6e7..5053e9efb14 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3026,6 +3026,35 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, + void (*get)(struct x86_emulate_ctxt *ctxt, + struct desc_ptr *ptr)) +{ + struct desc_ptr desc_ptr; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + get(ctxt, &desc_ptr); + if (ctxt->op_bytes == 2) { + ctxt->op_bytes = 4; + desc_ptr.address &= 0x00ffffff; + } + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return segmented_write(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); +} + +static int em_sgdt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); +} + +static int em_sidt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); +} + static int em_lgdt(struct x86_emulate_ctxt *ctxt) { struct desc_ptr desc_ptr; @@ -3485,8 +3514,8 @@ static struct opcode group6[] = { }; static struct group_dual group7 = { { - DI(Mov | DstMem | Priv, sgdt), - DI(Mov | DstMem | Priv, sidt), + II(Mov | DstMem | Priv, em_sgdt, sgdt), + II(Mov | DstMem | Priv, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), II(SrcMem | Priv, em_lidt, lidt), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, -- cgit v1.2.3-70-g09d2 From bdea48e305389b3c8c0f786a317f9da984c16604 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jun 2012 18:07:57 +0300 Subject: KVM: VMX: Fix interrupt exit condition during emulation Checking EFLAGS.IF is incorrect as we might be in interrupt shadow. 
If that is the case, the main loop will notice that and not inject the interrupt, causing an endless loop. Fix by using vmx_interrupt_allowed() to check if we can inject an interrupt instead. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index debac498434..6cdb4045769 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4983,8 +4983,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (!guest_state_valid(vcpu) && count-- != 0) { - if (intr_window_requested - && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); err = emulate_instruction(vcpu, 0); -- cgit v1.2.3-70-g09d2 From 7c068e45587a83d4235dda55d35a7d305c58e0e5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jun 2012 18:09:27 +0300 Subject: KVM: VMX: Continue emulating after batch exhausted If we return early from an invalid guest state emulation loop, make sure we return to it later if the guest state is still invalid. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6cdb4045769..2e51e7c6d2a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5002,7 +5002,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = 0; + vmx->emulation_required = !guest_state_valid(vcpu); out: return ret; } -- cgit v1.2.3-70-g09d2 From 2dd7caa092f0b1200a885a418e5d33b222183a71 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 11 Jun 2012 13:09:07 +0300 Subject: KVM: x86 emulator: emulate LAHF Opcode 9F. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5053e9efb14..90b549ed899 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3227,6 +3227,13 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_lahf(struct x86_emulate_ctxt *ctxt) +{ + ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; + ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3622,7 +3629,7 @@ static struct opcode opcode_table[256] = { D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, N, + II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), -- cgit v1.2.3-70-g09d2 From 361cad2b50a2c92b91b6f568db860fabad3bf149 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 11 Jun 2012 19:40:15 +0300 Subject: KVM: x86 emulator: fix byte-sized MOVZX/MOVSX Commit 2adb5ad9fe1 removed ByteOp from MOVZX/MOVSX, replacing them by SrcMem8, but neglected to fix the dependency in the emulation code on ByteOp. This caused the instruction not to have any effect in some circumstances. Fix by replacing the check for ByteOp with the equivalent src.op_bytes == 1. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 90b549ed899..30f4912c6a6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4517,12 +4517,12 @@ twobyte_insn: break; case 0xb6 ... 
0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val + ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; case 0xbe ... 0xbf: /* movsx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : + ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ -- cgit v1.2.3-70-g09d2 From 51ddff50cbd77568fe40e17a966b3a2ef1231b36 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 12 Jun 2012 20:19:40 +0300 Subject: KVM: x86 emulator: split push logic from push opcode emulation This allows us to reuse the code without populating ctxt->src and overriding ctxt->op_bytes. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 30f4912c6a6..acc647d6370 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1505,17 +1505,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_push(struct x86_emulate_ctxt *ctxt) +static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; + return segmented_write(ctxt, addr, data, bytes); +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{ /* Disable writeback. 
*/ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); + return push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, -- cgit v1.2.3-70-g09d2 From 612e89f01569f562dfa76cd5b76310a42b34a214 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 12 Jun 2012 20:03:23 +0300 Subject: KVM: x86 emulator: implement ENTER Opcode C8. Only ENTER with lexical nesting depth 0 is implemented, since others are very rare. We'll fail emulation if nonzero lexical depth is used so data is not corrupted. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index acc647d6370..b4b326ebc83 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -454,6 +454,11 @@ static ulong stack_mask(struct x86_emulate_ctxt *ctxt) return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ } +static int stack_size(struct x86_emulate_ctxt *ctxt) +{ + return (__fls(stack_mask(ctxt)) + 1) >> 3; +} + /* Access/update address held in a register, based on addressing mode. 
*/ static inline unsigned long address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) @@ -1592,6 +1597,26 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } +static int em_enter(struct x86_emulate_ctxt *ctxt) +{ + int rc; + unsigned frame_size = ctxt->src.val; + unsigned nesting_level = ctxt->src2.val & 31; + + if (nesting_level) + return X86EMUL_UNHANDLEABLE; + + rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); + if (rc != X86EMUL_CONTINUE) + return rc; + assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], + stack_mask(ctxt)); + assign_masked(&ctxt->regs[VCPU_REGS_RSP], + ctxt->regs[VCPU_REGS_RSP] - frame_size, + stack_mask(ctxt)); + return X86EMUL_CONTINUE; +} + static int em_leave(struct x86_emulate_ctxt *ctxt) { assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], @@ -3657,7 +3682,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - N, I(Stack, em_leave), N, I(ImplicitOps | Stack, em_ret_far), + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), + N, I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ -- cgit v1.2.3-70-g09d2 From de87dcddc70ec6a90adfcc81f0ad7d84a892ffce Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 12 Jun 2012 20:21:38 +0300 Subject: KVM: VMX: Stop invalid guest state emulation on pending event Process the event, possibly injecting an interrupt, before continuing. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2e51e7c6d2a..a62f92ab1be 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4986,6 +4986,9 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); + if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + return 1; + err = emulate_instruction(vcpu, 0); if (err == EMULATE_DO_MMIO) { -- cgit v1.2.3-70-g09d2 From de5f70e0c65fcd0472a412a7a9690afcd3ee4526 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 12 Jun 2012 20:22:28 +0300 Subject: KVM: VMX: Improve error reporting during invalid guest state emulation If instruction emulation fails, report it properly to userspace. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a62f92ab1be..c61eb34a39e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4996,8 +4996,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) + if (err != EMULATE_DONE) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; + } if (signal_pending(current)) goto out; -- cgit v1.2.3-70-g09d2 From 9299836e6379d5703826a540fb3c704223fac520 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 13 Jun 2012 12:25:06 +0300 Subject: KVM: x86 emulator: emulate BSWAP Opcodes 0F C8 - 0F CF. Used by the SeaBIOS cdrom code (though not in big real mode). 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b4b326ebc83..cfa5cc30c1d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3264,6 +3264,21 @@ static int em_lahf(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_bswap(struct x86_emulate_ctxt *ctxt) +{ + switch (ctxt->op_bytes) { +#ifdef CONFIG_X86_64 + case 8: + asm("bswap %0" : "+r"(ctxt->dst.val)); + break; +#endif + default: + asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); + break; + } + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3780,11 +3795,12 @@ static struct opcode twobyte_table[256] = { I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ + /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + X8(I(DstReg, em_bswap)), /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ -- cgit v1.2.3-70-g09d2 From a14e579f224ba929fe2f1d9bbbff688ae67e2ec4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 13 Jun 2012 12:28:33 +0300 Subject: KVM: x86 emulator: emulate LLDT Opcode 0F 00 /2. Used by isolinux during the protected mode transition. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cfa5cc30c1d..7b575adaf1f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3015,6 +3015,15 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } +static int em_lldt(struct x86_emulate_ctxt *ctxt) +{ + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); +} + static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; @@ -3560,7 +3569,7 @@ static struct opcode group5[] = { static struct opcode group6[] = { DI(Prot, sldt), DI(Prot, str), - DI(Prot | Priv, lldt), + II(Prot | Priv | SrcMem16, em_lldt, lldt), DI(Prot | Priv, ltr), N, N, N, N, }; -- cgit v1.2.3-70-g09d2 From e919464b53ea29aed46ff10f7d6416268678bdb9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 13 Jun 2012 16:29:39 +0300 Subject: KVM: x86 emulator: make read_segment_descriptor() return the address Some operations want to modify the descriptor later on, so save the address for future use. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7b575adaf1f..99e3df2bf88 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1296,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, /* allowed just for 8 bytes segments */ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc) + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1307,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - addr = dt.address + index * 8; + *desc_addr_p = addr = dt.address + index * 8; return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1339,6 +1340,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + ulong desc_addr; int ret; memset(&seg_desc, 0, sizeof seg_desc); @@ -1374,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (null_selector) /* for NULL selector skip all following checks */ goto load; - ret = read_segment_descriptor(ctxt, selector, &seg_desc); + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; @@ -2614,13 +2616,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; + ulong desc_addr; /* FIXME: old_tss_base == ~0 ? 
*/ - ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); + ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; - ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); + ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; -- cgit v1.2.3-70-g09d2 From 869be99c7579c857885643ba2aed87ced339c6a2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 13 Jun 2012 16:30:53 +0300 Subject: KVM: x86 emulator: make loading TR set the busy bit Guest software doesn't actually depend on it, but vmx will refuse us entry if we don't. Set the bit in both the cached segment and memory, just to be nice. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 99e3df2bf88..92a1adde0b4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1335,7 +1335,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { - struct desc_struct seg_desc; + struct desc_struct seg_desc, old_desc; u8 dpl, rpl, cpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; @@ -1422,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) -- cgit v1.2.3-70-g09d2 From 80890006167ec3e570bfd7cee7a05db17d339726 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 13 Jun 2012 16:33:29 +0300 Subject: KVM: x86 
emulator: implement LTR Opcode 0F 00 /3. Encountered during Windows XP secondary processor bringup. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 92a1adde0b4..97d9a9914ba 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3033,6 +3033,15 @@ static int em_lldt(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); } +static int em_ltr(struct x86_emulate_ctxt *ctxt) +{ + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); +} + static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; @@ -3579,7 +3588,7 @@ static struct opcode group6[] = { DI(Prot, sldt), DI(Prot, str), II(Prot | Priv | SrcMem16, em_lldt, lldt), - DI(Prot | Priv, ltr), + II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, }; -- cgit v1.2.3-70-g09d2 From a27685c33acccce91268ddef88d7896e3205fda5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 12 Jun 2012 20:30:18 +0300 Subject: KVM: VMX: Emulate invalid guest state by default Our emulation should be complete enough that we can emulate guests while they are in big real mode, or in a mode transition that is not virtualizable without unrestricted guest support. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c61eb34a39e..a2b9dd9af62 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -74,7 +74,7 @@ module_param_named(unrestricted_guest, static bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); -static bool __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly vmm_exclusive = 1; -- cgit v1.2.3-70-g09d2 From 2a8ac745e3171889d364235b8203342e28526d2c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 6 Jul 2012 16:08:25 +0200 Subject: x86: CONFIG_CC_STACKPROTECTOR=y is no longer experimental This feature has been around for over 5 years now, and has no CONFIG_EXPERIMENTAL dependency anymore, so remove the '(EXPERIMENTAL)' tag from the help text as well. Signed-off-by: Jean Delvare Acked-by: Arjan van de Ven Link: http://lkml.kernel.org/r/1341583705.4655.18.camel@amber.site Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c70684f859e..b03fe17042e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1525,7 +1525,7 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. config CC_STACKPROTECTOR - bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" + bool "Enable -fstack-protector buffer overflow detection" ---help--- This option turns on the -fstack-protector GCC feature. 
This feature puts, at the beginning of functions, a canary value on -- cgit v1.2.3-70-g09d2 From 1ba9a294141b106b7247649a5c3372d8284eca80 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 6 Jul 2012 15:07:48 +0100 Subject: x86/mm/mtrr: Fix alignment determination in range_to_mtrr() With the variable operated on being of "unsigned long" type, neither ffs() nor fls() are suitable to use on them, as those truncate their arguments to 32 bits. Using __ffs() and __fls() respectively at once eliminates the need to subtract 1 from their results. Additionally, with the alignment value subsequently used as a shift count, it must be enforced to be less than BITS_PER_LONG (and on 64-bit there's no need for it to be any smaller). Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4FF70D54020000780008E179@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index bdda2e6c673..35ffda5d072 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, /* Compute the maximum size with which we can make a range: */ if (range_startk) - max_align = ffs(range_startk) - 1; + max_align = __ffs(range_startk); else - max_align = 32; + max_align = BITS_PER_LONG - 1; - align = fls(range_sizek) - 1; + align = __fls(range_sizek); if (align > max_align) align = max_align; -- cgit v1.2.3-70-g09d2 From a7101d152665817bf7cafc47e7f5dcb1f54664fe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 6 Jul 2012 15:20:35 +0100 Subject: x86/mm/mtrr: Slightly simplify print_mtrr_state() high_width can be easily calculated in a single expression when making use of __ffs64(). 
Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4FF71053020000780008E1B5@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 75772ae6c65..e9fe907cd24 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -361,11 +361,7 @@ static void __init print_mtrr_state(void) } pr_debug("MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); - if (size_or_mask & 0xffffffffUL) - high_width = ffs(size_or_mask & 0xffffffffUL) - 1; - else - high_width = ffs(size_or_mask>>32) + 32 - 1; - high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) -- cgit v1.2.3-70-g09d2 From bf084d8f6eb4ded3f90a6ab79bb682db00ebfbd4 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 28 Jun 2012 17:26:02 +0200 Subject: crypto: aesni-intel - fix wrong kfree pointer kfree(new_key_mem) in rfc4106_set_key() should be called on malloced pointer, not on aligned one, otherwise it can cause invalid pointer on free. (Seen at least once when running tcrypt tests with debug kernel.) 
Signed-off-by: Milan Broz Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index d6626152067..34fdcff4d2c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -529,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); struct aesni_rfc4106_gcm_ctx *child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); - u8 *new_key_mem = NULL; + u8 *new_key_align, *new_key_mem = NULL; if (key_len < 4) { crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); @@ -553,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, if (!new_key_mem) return -ENOMEM; - new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); - memcpy(new_key_mem, key, key_len); - key = new_key_mem; + new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); + memcpy(new_key_align, key, key_len); + key = new_key_align; } if (!irq_fpu_usable()) -- cgit v1.2.3-70-g09d2 From a43478863b16cb0986fd2ec9d1f1b9ebaaec5922 Mon Sep 17 00:00:00 2001 From: Johannes Goetzfried Date: Thu, 5 Jul 2012 20:43:58 +0200 Subject: crypto: twofish-avx - remove useless instruction The register %rdx is written, but never read till the end of the encryption routine. Therefore let's delete the useless instruction. 
Signed-off-by: Johannes Goetzfried Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index fc31b89ba4c..35f45574390 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -243,7 +243,6 @@ __twofish_enc_blk_8way: popq %rbx; leaq (4*4*4)(%rsi), %rax; - leaq (4*4*4)(%rax), %rdx; testb %cl, %cl; jnz __enc_xor8; -- cgit v1.2.3-70-g09d2 From 2f84569f978cd7d54970d893b4c4e68ef24dc1ec Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:56:53 +0800 Subject: KVM: MMU: return bool in __rmap_write_protect The return value of __rmap_write_protect is either 1 or 0, use true/false instead of these Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 569cd66ba24..5dd22427251 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1050,11 +1050,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) +static bool +__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { u64 *sptep; struct rmap_iterator iter; - int write_protected = 0; + bool write_protected = false; for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); @@ -1075,7 +1076,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level sptep = rmap_get_first(*rmapp, &iter); } - write_protected = 1; + write_protected = true; } return write_protected; @@ -1106,12 +1107,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool 
rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; - int write_protected = 0; + bool write_protected = false; slot = gfn_to_memslot(kvm, gfn); @@ -1700,7 +1701,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { - int protected = 0; + bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu->kvm, sp->gfn); -- cgit v1.2.3-70-g09d2 From d13bc5b5a1f9eafd59331baa1d1d32e1867f57b5 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:57:15 +0800 Subject: KVM: MMU: abstract spte write-protect Introduce a common function to abstract spte write-protect to cleanup the code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 58 +++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5dd22427251..d04d6305a72 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1050,36 +1050,48 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +/* Return true if the spte is dropped. 
*/ +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) +{ + u64 spte = *sptep; + + if (!is_writable_pte(spte)) + return false; + + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + + *flush |= true; + if (is_large_pte(spte)) { + WARN_ON(page_header(__pa(sptep))->role.level == + PT_PAGE_TABLE_LEVEL); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + return true; + } + + spte = spte & ~PT_WRITABLE_MASK; + mmu_spte_update(sptep, spte); + return false; +} + static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { u64 *sptep; struct rmap_iterator iter; - bool write_protected = false; + bool flush = false; for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - - if (!is_writable_pte(*sptep)) { - sptep = rmap_get_next(&iter); - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); - sptep = rmap_get_next(&iter); - } else { - BUG_ON(!is_large_pte(*sptep)); - drop_spte(kvm, sptep); - --kvm->stat.lpages; + if (spte_write_protect(kvm, sptep, &flush)) { sptep = rmap_get_first(*rmapp, &iter); + continue; } - write_protected = true; + sptep = rmap_get_next(&iter); } - return write_protected; + return flush; } /** @@ -3886,6 +3898,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + bool flush = false; list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; @@ -3900,16 +3913,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) !is_last_spte(pt[i], sp->role.level)) continue; - if (is_large_pte(pt[i])) { - drop_spte(kvm, &pt[i]); - --kvm->stat.lpages; - continue; - } - - /* avoid RMW */ - if (is_writable_pte(pt[i])) - mmu_spte_update(&pt[i], - pt[i] & ~PT_WRITABLE_MASK); + spte_write_protect(kvm, &pt[i], &flush); } } kvm_flush_remote_tlbs(kvm); 
-- cgit v1.2.3-70-g09d2 From 8e22f955fb65c5930cc4c5a863cce4e27d0e4a3c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:57:39 +0800 Subject: KVM: MMU: cleanup spte_write_protect Use __drop_large_spte to cleanup this function and comment spte_write_protect Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d04d6305a72..ed9e9680608 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1050,7 +1050,33 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -/* Return true if the spte is dropped. */ + +static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + WARN_ON(page_header(__pa(sptep))->role.level == + PT_PAGE_TABLE_LEVEL); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + return true; + } + + return false; +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (__drop_large_spte(vcpu->kvm, sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); +} + +/* + * Write-protect on the specified @sptep due to dirty page logging or + * protecting shadow page table. @flush indicates whether tlb need be + * flushed. + * + * Return true if the spte is dropped. 
+ */ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) { u64 spte = *sptep; @@ -1061,13 +1087,9 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); *flush |= true; - if (is_large_pte(spte)) { - WARN_ON(page_header(__pa(sptep))->role.level == - PT_PAGE_TABLE_LEVEL); - drop_spte(kvm, sptep); - --kvm->stat.lpages; + + if (__drop_large_spte(kvm, sptep)) return true; - } spte = spte & ~PT_WRITABLE_MASK; mmu_spte_update(sptep, spte); @@ -1878,15 +1900,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) mmu_spte_set(sptep, spte); } -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep); - --vcpu->kvm->stat.lpages; - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { -- cgit v1.2.3-70-g09d2 From 4f5982a56a70a4a8b7966ef458d3fcdd27aa16cf Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:58:04 +0800 Subject: KVM: VMX: export PFEC.P bit on ept Export the present bit of page fault error code, the later patch will use it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a2b9dd9af62..5c52a6d2990 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4838,6 +4838,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; + u32 error_code; int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4862,7 +4863,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); + + 
/* It is a write fault? */ + error_code = exit_qualification & (1U << 1); + /* ept page table is present? */ + error_code |= (exit_qualification >> 3) & 0x1; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } static u64 ept_rsvd_mask(u64 spte, int level) -- cgit v1.2.3-70-g09d2 From 6e7d035407dc402a313e466c4f7ccb21aaed0da2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:58:33 +0800 Subject: KVM: MMU: fold tlb flush judgement into mmu_spte_update mmu_spte_update() is the common function, we can easily audit the path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ed9e9680608..a2fc65ba76a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -479,15 +479,24 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changged. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. 
*/ -static void mmu_spte_update(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 mask, old_spte = *sptep; + bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); - if (!is_shadow_present_pte(old_spte)) - return mmu_spte_set(sptep, new_spte); + if (!is_shadow_present_pte(old_spte)) { + mmu_spte_set(sptep, new_spte); + return ret; + } new_spte |= old_spte & shadow_dirty_mask; @@ -500,13 +509,18 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) + ret = true; + if (!shadow_accessed_mask) - return; + return ret; if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + return ret; } /* @@ -2268,7 +2282,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte, entry = *sptep; + u64 spte; int ret = 0; if (set_mmio_spte(sptep, gfn, pfn, pte_access)) @@ -2346,14 +2360,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - mmu_spte_update(sptep, spte); - /* - * If we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB. 
- */ - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) + if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); done: return ret; -- cgit v1.2.3-70-g09d2 From 49fde3406f3266c5af9430467672c20b63a31e83 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:58:58 +0800 Subject: KVM: MMU: introduce SPTE_MMU_WRITEABLE bit This bit indicates whether the spte can be writable on MMU, that means the corresponding gpte is writable and the corresponding gfn is not protected by shadow page protection In the later path, SPTE_MMU_WRITEABLE will indicate whether the spte can be locklessly updated Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 57 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a2fc65ba76a..b160652f7ee 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644); #define CREATE_TRACE_POINTS #include "mmutrace.h" -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1084,34 +1085,51 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) kvm_flush_remote_tlbs(vcpu->kvm); } +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} + /* - * Write-protect on the specified @sptep due to dirty page logging or - * protecting shadow page table. @flush indicates whether tlb need be - * flushed. + * Write-protect on the specified @sptep, @pt_protect indicates whether + * spte writ-protection is caused by protecting shadow page table. + * @flush indicates whether tlb need be flushed. 
+ * + * Note: write protection is difference between drity logging and spte + * protection: + * - for dirty logging, the spte can be set to writable at anytime if + * its dirty bitmap is properly set. + * - for spte protection, the spte can be writable only after unsync-ing + * shadow page. * * Return true if the spte is dropped. */ -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) +static bool +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) { u64 spte = *sptep; - if (!is_writable_pte(spte)) + if (!is_writable_pte(spte) && + !(pt_protect && spte_is_locklessly_modifiable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - *flush |= true; - - if (__drop_large_spte(kvm, sptep)) + if (__drop_large_spte(kvm, sptep)) { + *flush |= true; return true; + } + if (pt_protect) + spte &= ~SPTE_MMU_WRITEABLE; spte = spte & ~PT_WRITABLE_MASK; - mmu_spte_update(sptep, spte); + + *flush |= mmu_spte_update(sptep, spte); return false; } -static bool -__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) +static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, + int level, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1119,7 +1137,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - if (spte_write_protect(kvm, sptep, &flush)) { + if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { sptep = rmap_get_first(*rmapp, &iter); continue; } @@ -1148,7 +1166,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); /* clear the first set bit */ mask &= mask - 1; @@ -1167,7 +1185,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) for (i = 
PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); + write_protected |= __rmap_write_protect(kvm, rmapp, i, true); } return write_protected; @@ -2296,8 +2314,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_x_mask; else spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) @@ -2322,7 +2342,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto done; } - spte |= PT_WRITABLE_MASK; + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; if (!vcpu->arch.mmu.direct_map && !(pte_access & ACC_WRITE_MASK)) { @@ -2351,8 +2371,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writable_pte(spte)) - spte &= ~PT_WRITABLE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } } @@ -3933,7 +3952,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) !is_last_spte(pt[i], sp->role.level)) continue; - spte_write_protect(kvm, &pt[i], &flush); + spte_write_protect(kvm, &pt[i], &flush, false); } } kvm_flush_remote_tlbs(kvm); -- cgit v1.2.3-70-g09d2 From c7ba5b48cc8ddc015a9e0463813ca1e60bc42c59 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:59:18 +0800 Subject: KVM: MMU: fast path of handling guest page fault If the present bit of page fault error code is set, it indicates the shadow page is populated on all levels, it means what we do is only modify the access bit which can be done out of mmu-lock Currently, in order to simplify the code, we only fix the page fault caused by write-protect on the fast path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 127 insertions(+), 17 deletions(-) (limited to 
'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b160652f7ee..8637bffbdb4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -446,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte) } #endif +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} + static bool spte_has_volatile_bits(u64 spte) { + /* + * Always atomicly update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (spte_is_locklessly_modifiable(spte)) + return true; + if (!shadow_accessed_mask) return false; @@ -489,7 +503,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - u64 mask, old_spte = *sptep; + u64 old_spte = *sptep; bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); @@ -499,17 +513,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) return ret; } - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; - - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) + if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); + /* + * For the spte updated out of mmu-lock is safe, since + * we always atomicly update it, see the comments in + * spte_has_volatile_bits(). 
+ */ if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) ret = true; @@ -1085,11 +1098,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) kvm_flush_remote_tlbs(vcpu->kvm); } -static bool spte_is_locklessly_modifiable(u64 spte) -{ - return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); -} - /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte writ-protection is caused by protecting shadow page table. @@ -2677,18 +2685,114 @@ exit: return ret; } +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +{ + /* + * #PF can be fast only if the shadow page table is present and it + * is caused by write-protect, that means we just need change the + * W bit of the spte which can be done out of mmu-lock. + */ + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + return true; +} + +static bool +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + gfn_t gfn; + + WARN_ON(!sp->role.direct); + + /* + * The gfn of direct spte is stable since it is calculated + * by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + mark_page_dirty(vcpu->kvm, gfn); + + return true; +} + +/* + * Return value: + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. 
+ */ +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, + u32 error_code) +{ + struct kvm_shadow_walk_iterator iterator; + bool ret = false; + u64 spte = 0ull; + + if (!page_fault_can_be_fast(vcpu, error_code)) + return false; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || iterator.level < level) + break; + + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_rmap_spte(spte)) { + ret = true; + goto exit; + } + + if (!is_last_spte(spte, level)) + goto exit; + + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + ret = true; + goto exit; + } + + /* + * Currently, to simplify the code, only the spte write-protected + * by dirty-log can be fast fixed. + */ + if (!spte_is_locklessly_modifiable(spte)) + goto exit; + + /* + * Currently, fast page fault only works for direct mapping since + * the gfn is not stable for indirect shadow page. + * See Documentation/virtual/kvm/locking.txt to get more detail. 
+ */ + ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); +exit: + walk_shadow_page_lockless_end(vcpu); + + return ret; +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + gfn_t gfn, bool prefault) { int r; int level; int force_pt_level; pfn_t pfn; unsigned long mmu_seq; - bool map_writable; + bool map_writable, write = error_code & PFERR_WRITE_MASK; force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); if (likely(!force_pt_level)) { @@ -2705,6 +2809,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, v, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3093,7 +3200,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); + error_code, gfn, prefault); } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) @@ -3173,6 +3280,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, gpa, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); -- cgit v1.2.3-70-g09d2 From a72faf2504dfc12ff9bfb486a42f2761296666ff Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:59:41 +0800 Subject: KVM: MMU: trace fast page fault To see what happens on this path and help us to optimize it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ arch/x86/kvm/mmutrace.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'arch/x86') diff --git 
a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8637bffbdb4..28c8fbcc676 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2776,6 +2776,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); exit: + trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322..c364abc8d03 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -243,6 +243,44 @@ TRACE_EVENT( TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, __entry->access) ); + +#define __spte_satisfied(__spte) \ + (__entry->retry && is_writable_pte(__entry->__spte)) + +TRACE_EVENT( + fast_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + u64 *sptep, u64 old_spte, bool retry), + TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gva_t, gva) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) + __field(u64, new_spte) + __field(bool, retry) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gva = gva; + __entry->error_code = error_code; + __entry->sptep = sptep; + __entry->old_spte = old_spte; + __entry->new_spte = *sptep; + __entry->retry = retry; + ), + + TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + " new %llx spurious %d fixed %d", __entry->vcpu_id, + __entry->gva, __print_flags(__entry->error_code, "|", + kvm_mmu_trace_pferr_flags), __entry->sptep, + __entry->old_spte, __entry->new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3-70-g09d2 From 6fbc277053836a4d80c72a0843bcbc7595b31e87 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 16:00:00 +0800 Subject: KVM: MMU: fix 
kvm_mmu_pagetable_walk tracepoint The P bit of page fault error code is missed in this tracepoint, fix it by passing the full error code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmutrace.h | 7 +++---- arch/x86/kvm/paging_tmpl.h | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index c364abc8d03..cd6e98333ba 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -54,8 +54,8 @@ */ TRACE_EVENT( kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), - TP_ARGS(addr, write_fault, user_fault, fetch_fault), + TP_PROTO(u64 addr, u32 pferr), + TP_ARGS(addr, pferr), TP_STRUCT__entry( __field(__u64, addr) @@ -64,8 +64,7 @@ TRACE_EVENT( TP_fast_assign( __entry->addr = addr; - __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) - | (!!fetch_fault << 4); + __entry->pferr = pferr; ), TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 34f970937ef..bb7cf01cae7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; - trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, - fetch_fault); + trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: eperm = false; walker->level = mmu->root_level; -- cgit v1.2.3-70-g09d2 From fc73373b33f5f965f2f82bfbc40ef8e6072e986d Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Fri, 6 Jul 2012 13:47:39 -0400 Subject: KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check While debugging I noticed that unlike all the other hypervisor code in the kernel, kvm does not have an entry for x86_hyper which is used in detect_hypervisor_platform() which results in a nice printk in the syslog. 
This is only really a stub function but it does make kvm more consistent with the other hypervisors. Signed-off-by: Prarit Bhargava Cc: Avi Kivity Cc: Gleb Natapov Cc: Alex Williamson Cc: Konrad Rzeszutek Wilk Cc: Marcelo Tostatti Cc: kvm@vger.kernel.org Signed-off-by: Avi Kivity --- arch/x86/include/asm/hypervisor.h | 1 + arch/x86/kernel/cpu/hypervisor.c | 1 + arch/x86/kernel/kvm.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153c675..b518c750993 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper; extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_kvm; static inline bool hypervisor_x2apic_available(void) { diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 755f64fb074..6d6dd7afb22 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -37,6 +37,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, + &x86_hyper_kvm, }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 75ab94c75c7..299cf147092 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -41,6 +41,7 @@ #include #include #include +#include static int kvmapf = 1; @@ -483,6 +484,19 @@ void __init kvm_guest_init(void) #endif } +static bool __init kvm_detect(void) +{ + if (!kvm_para_available()) + return false; + return true; +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); + static __init int activate_jump_labels(void) { if 
(has_steal_clock) { -- cgit v1.2.3-70-g09d2 From 6751ed65dc6642af64f7b8a440a75563c8aab7ae Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 11 Jul 2012 10:20:47 -0700 Subject: x86/mce: Fix siginfo_t->si_addr value for non-recoverable memory faults In commit dad1743e5993f1 ("x86/mce: Only restart instruction after machine check recovery if it is safe") we fixed mce_notify_process() to force a signal to the current process if it was not restartable (RIPV bit not set in MCG_STATUS). But doing it here means that the process doesn't get told the virtual address of the fault via siginfo_t->si_addr. This would prevent application level recovery from the fault. Make a new MF_MUST_KILL flag bit for memory_failure() et al. to use so that we will provide the right information with the signal. Signed-off-by: Tony Luck Acked-by: Borislav Petkov Cc: stable@kernel.org # 3.4+ --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++-- include/linux/mm.h | 1 + mm/memory-failure.c | 14 ++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index da27c5d2168..c46ed494f00 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1186,6 +1186,7 @@ void mce_notify_process(void) { unsigned long pfn; struct mce_info *mi = mce_find_info(); + int flags = MF_ACTION_REQUIRED; if (!mi) mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); @@ -1200,8 +1201,9 @@ void mce_notify_process(void) * doomed. We still need to mark the page as poisoned and alert any * other users of the page. 
*/ - if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || - mi->restartable == 0) { + if (!mi->restartable) + flags |= MF_MUST_KILL; + if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { pr_err("Memory error not recovered"); force_sig(SIGBUS, current); } diff --git a/include/linux/mm.h b/include/linux/mm.h index b36d08ce5c5..f9f279cf5b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1591,6 +1591,7 @@ void vmemmap_populate_print_last(void); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, + MF_MUST_KILL = 1 << 2, }; extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ab1e7145e29..de4ce705845 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * Also when FAIL is set do a force kill because something went * wrong earlier. */ -static void kill_procs(struct list_head *to_kill, int doit, int trapno, +static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, int fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; list_for_each_entry_safe (tk, next, to_kill, nd) { - if (doit) { + if (forcekill) { /* * In case something went wrong with munmapping * make sure the process doesn't catch the @@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int kill = 1; + int kill = 1, forcekill; struct page *hpage = compound_head(p); struct page *ppage; @@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * be called inside page lock (it's recommended but not enforced). 
*/ mapping = page_mapping(hpage); - if (!PageDirty(hpage) && mapping && + if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && mapping_cap_writeback_dirty(mapping)) { if (page_mkclean(hpage)) { SetPageDirty(hpage); @@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if * killing is needed or not. Only kill when the page - * was dirty, otherwise the tokill list is merely + * was dirty or the process is not restartable, + * otherwise the tokill list is merely * freed. When there was a problem unmapping earlier * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs(&tokill, !!PageDirty(ppage), trapno, + forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); return ret; -- cgit v1.2.3-70-g09d2 From ad756a1603c5fac207758faaac7f01c34c9d0b7b Mon Sep 17 00:00:00 2001 From: "Mao, Junjie" Date: Mon, 2 Jul 2012 01:18:48 +0000 Subject: KVM: VMX: Implement PCID/INVPCID for guests with EPT This patch handles PCID/INVPCID for guests. Process-context identifiers (PCIDs) are a facility by which a logical processor may cache information for multiple linear-address spaces so that the processor may retain cached information when software switches to a different linear address space. Refer to section 4.10.1 in IA32 Intel Software Developer's Manual Volume 3A for details. For guests with EPT, the PCID feature is enabled and INVPCID behaves as running natively. For guests without EPT, the PCID feature is disabled and INVPCID triggers #UD. 
Signed-off-by: Junjie Mao Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 +++- arch/x86/include/asm/processor-flags.h | 2 ++ arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/cpuid.c | 5 +++-- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 34 +++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 23 ++++++++++++++++++++--- 8 files changed, 77 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 24b76474d9d..a3e9409e90b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,12 +48,13 @@ #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) @@ -673,6 +674,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + bool (*invpcid_supported)(void); void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index f8ab3eaad12..aea1d1d848c 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -44,6 +44,7 @@ */ #define X86_CR3_PWT 0x00000008 /* Page Write Through */ #define X86_CR3_PCD 0x00000010 /* Page Cache 
Disable */ +#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ /* * Intel CPU features in CR4 @@ -61,6 +62,7 @@ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ +#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index de007c27273..74fcb963595 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -60,6 +60,7 @@ #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -281,6 +282,7 @@ enum vmcs_field { #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_INVPCID 58 /* * Interruption-information format diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 197afd53e3a..0595f1397b7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? 
F(INVPCID) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); @@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f449edc35e2..a10e4601685 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -52,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_OSVW)); } +static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_PCID)); +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7a418783259..baead950d6c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_invpcid_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, + .invpcid_supported = svm_invpcid_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c 
b/arch/x86/kvm/vmx.c index 5c52a6d2990..c39b60707e0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -861,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void) SECONDARY_EXEC_RDTSCP; } +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -1751,6 +1757,11 @@ static bool vmx_rdtscp_supported(void) return cpu_has_vmx_rdtscp(); } +static bool vmx_invpcid_supported(void) +{ + return cpu_has_vmx_invpcid() && enable_ept; +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -2470,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_INVPCID; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3800,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; + /* Enable INVPCID for non-ept guests may cause performance regression. 
*/ + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -6550,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } } + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + /* Exposing INVPCID only when PCID is exposed */ + best = kvm_find_cpuid_entry(vcpu, 0x7, 0); + if (vmx_invpcid_supported() && + best && (best->ecx & bit(X86_FEATURE_INVPCID)) && + guest_cpuid_has_pcid(vcpu)) { + exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } else { + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + if (best) + best->ecx &= ~bit(X86_FEATURE_INVPCID); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -7284,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, + .invpcid_supported = vmx_invpcid_supported, .set_supported_cpuid = vmx_set_supported_cpuid, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff0b487e725..59b59508ff0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } + if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return 1; + kvm_x86_ops->set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { @@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_read_cr3(vcpu))) return 1; + if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { + if (!guest_cpuid_has_pcid(vcpu)) + return 1; + + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + return 1; + } + if (kvm_x86_ops->set_cr4(vcpu, cr4)) return 1; - if ((cr4 ^ old_cr4) & pdptr_bits) + if (((cr4 ^ old_cr4) & pdptr_bits) || + (!(cr4 & 
X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) @@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; + if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { + if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) + return 1; + } else + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) -- cgit v1.2.3-70-g09d2 From 18468843fac331dfbb700901c8012d17373adfec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Jul 2012 22:36:28 +0200 Subject: olpc-xo15-sci: Use struct dev_pm_ops for power management Make the OLPC XO15 SCI driver define its resume callback through a struct dev_pm_ops object rather than by using a legacy PM hook in struct acpi_device_ops. Reported-by: Randy Dunlap Signed-off-by: Rafael J. Wysocki Acked-by: Randy Dunlap --- arch/x86/platform/olpc/olpc-xo15-sci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 23e5b9d7977..599be499fdf 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -203,7 +203,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type) return 0; } -static int xo15_sci_resume(struct acpi_device *device) +static int xo15_sci_resume(struct device *dev) { /* Enable all EC events */ olpc_ec_mask_write(EC_SCI_SRC_ALL); @@ -215,6 +215,8 @@ static int xo15_sci_resume(struct acpi_device *device) return 0; } +static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume); + static const struct acpi_device_id xo15_sci_device_ids[] = { {"XO15EC", 0}, {"", 0}, @@ -227,8 +229,8 @@ static struct acpi_driver xo15_sci_drv = { .ops = { .add = xo15_sci_add, .remove = xo15_sci_remove, - .resume = xo15_sci_resume, }, + .drv.pm = &xo15_sci_pm, }; static int 
__init xo15_sci_init(void) -- cgit v1.2.3-70-g09d2 From 1551df646dd42122e17401013dba7a509d0f1b0d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 15 Jul 2012 15:56:46 +0300 Subject: apic: add apic_set_eoi_write for PV use KVM PV EOI optimization overrides eoi_write apic op with its own version. Add an API for this to avoid meddling with core x86 apic driver data structures directly. For KVM use, we don't need any guarantees about when the switch to the new op will take place, so it could in theory use this API after SMP init, but it currently doesn't, and restricting callers to early init makes it clear that it's safe as it won't race with actual APIC driver use. Signed-off-by: Michael S. Tsirkin Acked-by: Ingo Molnar Signed-off-by: Avi Kivity --- arch/x86/include/asm/apic.h | 3 +++ arch/x86/kernel/apic/apic.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index aa5b2eec360..ff8dff645e8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -469,6 +469,8 @@ static inline u32 safe_apic_wait_icr_idle(void) return apic->safe_wait_icr_idle(); } +extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -478,6 +480,7 @@ static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } +static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e094a..c7520b6184e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,6 +2123,23 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } +/* + * Override the 
generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. + */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->eoi_write = eoi_write; + } +} + /* * Power management */ -- cgit v1.2.3-70-g09d2 From 90536664063641bf87d8ac9e109f3f109f804d3e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 15 Jul 2012 15:56:52 +0300 Subject: KVM guest: switch to apic_set_eoi_write, apic_write Use apic_set_eoi_write, apic_write to avoid meddling in core apic driver data structures directly. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 299cf147092..c1d61ee4b4f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -299,7 +299,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) return; - apic->write(APIC_EOI, APIC_EOI_ACK); + apic_write(APIC_EOI, APIC_EOI_ACK); } void __cpuinit kvm_guest_cpu_init(void) @@ -466,15 +466,8 @@ void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - /* Should happen once for each apic */ - WARN_ON((*drv)->eoi_write == kvm_guest_apic_eoi_write); - (*drv)->eoi_write = kvm_guest_apic_eoi_write; - } - } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu =
kvm_smp_prepare_boot_cpu; -- cgit v1.2.3-70-g09d2 From ebf7d2e9939f6571ef0b7381e7f95eb10f0c686b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 15 Jul 2012 15:56:58 +0300 Subject: Revert "apic: fix kvm build on UP without IOAPIC" This reverts commit f9808b7fd422b965cea52e05ba470e0a473c53d3. After commit 'kvm: switch to apic_set_eoi_write, apic_write' the stubs are no longer needed as kvm does not look at apicdrivers anymore. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/include/asm/apic.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ff8dff645e8..839b8f58a27 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -417,12 +417,7 @@ extern struct apic *apic; __aligned(sizeof(struct apic *)) \ __section(.apicdrivers) = { &sym1, &sym2 } -#ifdef CONFIG_X86_LOCAL_APIC extern struct apic *__apicdrivers[], *__apicdrivers_end[]; -#else -#define __apicdrivers ((struct apic **)NULL) -#define __apicdrivers_end ((struct apic **)NULL) -#endif /* * APIC functionality to boot other CPUs - only used on SMP: -- cgit v1.2.3-70-g09d2 From d63d3e6217c49b81d74141b7920bbe5950532432 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Jul 2012 11:48:50 +0300 Subject: x86, hyper: fix build with !CONFIG_KVM_GUEST Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/cpu/hypervisor.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 6d6dd7afb22..a8f8fa9769d 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -37,7 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST &x86_hyper_kvm, +#endif }; const struct hypervisor_x86 *x86_hyper; -- cgit v1.2.3-70-g09d2 From 
cef12ee52b054282461a6d5fe7742755fa6e3bd3 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 7 Jun 2012 19:56:51 +0800 Subject: xen/mce: Add mcelog support for Xen platform When an MCA error occurs, it is handled by the Xen hypervisor first, and then the error information is sent to the initial domain for logging. This patch gets the error information from the Xen hypervisor and converts the Xen format error into Linux format mcelog. This logic is basically self-contained, not touching other kernel components. By using tools like mcelog, users can read specific error information, as they do under native Linux. To test, follow the directions outlined in Documentation/acpi/apei/einj.txt Acked-and-tested-by: Borislav Petkov Signed-off-by: Ke, Liping Signed-off-by: Jiang, Yunhong Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/hypercall.h | 8 + arch/x86/kernel/cpu/mcheck/mce.c | 4 +- arch/x86/xen/enlighten.c | 5 +- drivers/xen/Kconfig | 8 + drivers/xen/Makefile | 1 + drivers/xen/mcelog.c | 392 +++++++++++++++++++++++++++++++++++ include/linux/miscdevice.h | 1 + include/xen/interface/xen-mca.h | 385 ++++++++++++++++++++++++++++++++++ 8 files changed, 798 insertions(+), 6 deletions(-) create mode 100644 drivers/xen/mcelog.c create mode 100644 include/xen/interface/xen-mca.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 5728852fb90..59c226d120c 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -48,6 +48,7 @@ #include #include #include +#include /* * The hypercall asms have to meet several constraints: @@ -301,6 +302,13 @@ HYPERVISOR_set_timer_op(u64 timeout) return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); } +static inline int +HYPERVISOR_mca(struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca,
mc_op); +} + static inline int HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index da27c5d2168..aa7548799af 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,8 +57,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); int mce_disabled __read_mostly; -#define MISC_MCELOG_MINOR 227 - #define SPINUNIT 100 /* 100ns */ atomic_t mce_entry; @@ -2342,7 +2340,7 @@ static __init int mcheck_init_device(void) return err; } -device_initcall(mcheck_init_device); +device_initcall_sync(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4b821..9a6346865c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -341,9 +342,7 @@ static void __init xen_init_cpuid_mask(void) unsigned int xsave_mask; cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ - (1 << X86_FEATURE_MTRR) | /* disable MTRR */ + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 8d2501e604d..d4dffcd5287 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -196,4 +196,12 @@ config XEN_ACPI_PROCESSOR called xen_acpi_processor If you do not know what to choose, select M here. If the CPUFREQ drivers are built in, select Y here. 
+config XEN_MCE_LOG + bool "Xen platform mcelog" + depends on XEN_DOM0 && X86_64 && X86_MCE + default n + help + Allow kernel fetching MCE error from Xen platform and + converting it into Linux mcelog format for mcelog tools + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index fc348863113..a7870292bc7 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_XEN_PVHVM) += platform-pci.o obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o +obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c new file mode 100644 index 00000000000..72e87d2f192 --- /dev/null +++ b/drivers/xen/mcelog.c @@ -0,0 +1,392 @@ +/****************************************************************************** + * mcelog.c + * Driver for receiving and transferring machine check error infomation + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong + * Author: Jiang, Yunhong + * Author: Ke, Liping + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define XEN_MCELOG "xen_mcelog: " + +static struct mc_info g_mi; +static struct mcinfo_logical_cpu *g_physinfo; +static uint32_t ncpus; + +static DEFINE_SPINLOCK(mcelog_lock); + +static struct xen_mce_log xen_mcelog = { + .signature = XEN_MCE_LOG_SIGNATURE, + .len = XEN_MCE_LOG_LEN, + .recordlen = sizeof(struct xen_mce), +}; + +static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock); +static int xen_mce_chrdev_open_count; /* #times opened */ +static int xen_mce_chrdev_open_exclu; /* already open exclusive? 
*/ + +static int xen_mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + if (xen_mce_chrdev_open_exclu || + (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&xen_mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + xen_mce_chrdev_open_exclu = 1; + xen_mce_chrdev_open_count++; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int xen_mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + xen_mce_chrdev_open_count--; + xen_mce_chrdev_open_exclu = 0; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return 0; +} + +static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned num; + int i, err; + + spin_lock(&mcelog_lock); + + num = xen_mcelog.next; + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce)) + goto out; + + err = 0; + for (i = 0; i < num; i++) { + struct xen_mce *m = &xen_mcelog.entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce)); + xen_mcelog.next = 0; + + if (err) + err = -EFAULT; + +out: + spin_unlock(&mcelog_lock); + + return err ? 
err : buf - ubuf; +} + +static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct xen_mce), p); + case MCE_GET_LOG_LEN: + return put_user(XEN_MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = xen_mcelog.flags; + } while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations xen_mce_chrdev_ops = { + .open = xen_mce_chrdev_open, + .release = xen_mce_chrdev_release, + .read = xen_mce_chrdev_read, + .unlocked_ioctl = xen_mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice xen_mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &xen_mce_chrdev_ops, +}; + +/* + * Caller should hold the mcelog_lock + */ +static void xen_mce_log(struct xen_mce *mce) +{ + unsigned entry; + + entry = xen_mcelog.next; + + /* + * When the buffer fills up discard new entries. 
+ * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= XEN_MCE_LOG_LEN) { + set_bit(XEN_MCE_OVERFLOW, + (unsigned long *)&xen_mcelog.flags); + return; + } + + memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce)); + + xen_mcelog.next++; +} + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct xen_mce m; + uint32_t i; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); + if (unlikely(!mic)) { + pr_warning(XEN_MCELOG "Failed to find global error info\n"); + return -ENODEV; + } + + memset(&m, 0, sizeof(struct xen_mce)); + + mc_global = (struct mcinfo_global *)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) + break; + if (unlikely(i == ncpus)) { + pr_warning(XEN_MCELOG "Failed to match cpu with apicid %d\n", + m.apicid); + return -ENODEV; + } + + m.socketid = g_physinfo[i].mc_chipid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); + if (unlikely(!mic)) { + pr_warning(XEN_MCELOG "Fail to find bank error info\n"); + return -ENODEV; + } + + do { + if ((!mic) || (mic->size == 0) || + (mic->type != MC_TYPE_GLOBAL && + mic->type != MC_TYPE_BANK && + mic->type != MC_TYPE_EXTENDED && + mic->type != MC_TYPE_RECOVERY)) + break; + + if (mic->type == MC_TYPE_BANK) { + mc_bank = (struct mcinfo_bank *)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.bank = mc_bank->mc_bank; + m.finished = 1; + /*log this record*/ + xen_mce_log(&m); + } + mic = x86_mcinfo_next(mic); + } while (1); + + return 0; +} + +static int mc_queue_handle(uint32_t flags) +{ + struct xen_mc mc_op; + int ret = 0; + + 
mc_op.cmd = XEN_MC_fetch; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi); + do { + mc_op.u.mc_fetch.flags = flags; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG "Failed to fetch %s error log\n", + (flags == XEN_MC_URGENT) ? + "urgnet" : "nonurgent"); + break; + } + + if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + break; + else { + ret = convert_log(&g_mi); + if (ret) + pr_warning(XEN_MCELOG + "Failed to convert this error log, " + "continue acking it anyway\n"); + + mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG + "Failed to ack previous error log\n"); + break; + } + } + } while (1); + + return ret; +} + +/* virq handler for machine check error info*/ +static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +{ + int err; + unsigned long tmp; + + spin_lock_irqsave(&mcelog_lock, tmp); + + /* urgent mc_info */ + err = mc_queue_handle(XEN_MC_URGENT); + if (err) + pr_err(XEN_MCELOG + "Failed to handle urgent mc_info queue, " + "continue handling nonurgent mc_info queue anyway.\n"); + + /* nonurgent mc_info */ + err = mc_queue_handle(XEN_MC_NONURGENT); + if (err) + pr_err(XEN_MCELOG + "Failed to handle nonurgent mc_info queue.\n"); + + spin_unlock_irqrestore(&mcelog_lock, tmp); + + return IRQ_HANDLED; +} + +static int bind_virq_for_mce(void) +{ + int ret; + struct xen_mc mc_op; + + memset(&mc_op, 0, sizeof(struct xen_mc)); + + /* Fetch physical CPU Numbers */ + mc_op.cmd = XEN_MC_physcpuinfo; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG "Failed to get CPU numbers\n"); + return ret; + } + + /* Fetch each CPU Physical Info for later reference*/ + ncpus = mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kcalloc(ncpus, sizeof(struct 
mcinfo_logical_cpu), + GFP_KERNEL); + if (!g_physinfo) + return -ENOMEM; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG "Failed to get CPU info\n"); + kfree(g_physinfo); + return ret; + } + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + xen_mce_interrupt, 0, "mce", NULL); + if (ret < 0) { + pr_err(XEN_MCELOG "Failed to bind virq\n"); + kfree(g_physinfo); + return ret; + } + + return 0; +} + +static int __init xen_late_init_mcelog(void) +{ + /* Only DOM0 is responsible for MCE logging */ + if (xen_initial_domain()) { + /* register character device /dev/mcelog for xen mcelog */ + if (misc_register(&xen_mce_chrdev_device)) + return -ENODEV; + return bind_virq_for_mce(); + } + + return -ENODEV; +} +device_initcall(xen_late_init_mcelog); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 0549d211550..e0deeb2cc93 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -35,6 +35,7 @@ #define MPT_MINOR 220 #define MPT2SAS_MINOR 221 #define UINPUT_MINOR 223 +#define MISC_MCELOG_MINOR 227 #define HPET_MINOR 228 #define FUSE_MINOR 229 #define KVM_MINOR 232 diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h new file mode 100644 index 00000000000..73a4ea714d9 --- /dev/null +++ b/include/xen/interface/xen-mca.h @@ -0,0 +1,385 @@ +/****************************************************************************** + * arch-x86/mca.h + * Guest OS machine check interface to x86 Xen. + * + * Contributed by Advanced Micro Devices, Inc. 
+ * Author: Christoph Egger + * + * Updated by Intel Corporation + * Author: Liu, Jinsong + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__ +#define __XEN_PUBLIC_ARCH_X86_MCA_H__ + +/* Hypercall */ +#define __HYPERVISOR_mca __HYPERVISOR_arch_0 + +#define XEN_MCA_INTERFACE_VERSION 0x01ecc003 + +/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */ +#define XEN_MC_NONURGENT 0x1 +/* IN: Dom0 calls hypercall to retrieve urgent error log entry */ +#define XEN_MC_URGENT 0x2 +/* IN: Dom0 acknowledges previously-fetched error log entry */ +#define XEN_MC_ACK 0x4 + +/* OUT: All is ok */ +#define XEN_MC_OK 0x0 +/* OUT: Domain could not fetch data. */ +#define XEN_MC_FETCHFAILED 0x1 +/* OUT: There was no machine check data to fetch. 
*/ +#define XEN_MC_NODATA 0x2 + +#ifndef __ASSEMBLY__ +/* vIRQ injected to Dom0 */ +#define VIRQ_MCA VIRQ_ARCH_0 + +/* + * mc_info entry types + * mca machine check info are recorded in mc_info entries. + * when fetch mca info, it can use MC_TYPE_... to distinguish + * different mca info. + */ +#define MC_TYPE_GLOBAL 0 +#define MC_TYPE_BANK 1 +#define MC_TYPE_EXTENDED 2 +#define MC_TYPE_RECOVERY 3 + +struct mcinfo_common { + uint16_t type; /* structure type */ + uint16_t size; /* size of this struct in bytes */ +}; + +#define MC_FLAG_CORRECTABLE (1 << 0) +#define MC_FLAG_UNCORRECTABLE (1 << 1) +#define MC_FLAG_RECOVERABLE (1 << 2) +#define MC_FLAG_POLLED (1 << 3) +#define MC_FLAG_RESET (1 << 4) +#define MC_FLAG_CMCI (1 << 5) +#define MC_FLAG_MCE (1 << 6) + +/* contains x86 global mc information */ +struct mcinfo_global { + struct mcinfo_common common; + + uint16_t mc_domid; /* running domain at the time in error */ + uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ + uint32_t mc_socketid; /* physical socket of the physical core */ + uint16_t mc_coreid; /* physical impacted core */ + uint16_t mc_core_threadid; /* core thread of physical core */ + uint32_t mc_apicid; + uint32_t mc_flags; + uint64_t mc_gstatus; /* global status */ +}; + +/* contains x86 bank mc information */ +struct mcinfo_bank { + struct mcinfo_common common; + + uint16_t mc_bank; /* bank nr */ + uint16_t mc_domid; /* domain referenced by mc_addr if valid */ + uint64_t mc_status; /* bank status */ + uint64_t mc_addr; /* bank address */ + uint64_t mc_misc; + uint64_t mc_ctrl2; + uint64_t mc_tsc; +}; + +struct mcinfo_msr { + uint64_t reg; /* MSR */ + uint64_t value; /* MSR value */ +}; + +/* contains mc information from other or additional mc MSRs */ +struct mcinfo_extended { + struct mcinfo_common common; + uint32_t mc_msrs; /* Number of msr with valid values. 
*/ + /* + * Currently Intel extended MSR (32/64) include all gp registers + * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be + * useful at present. So expand this array to 16/32 to leave room. + */ + struct mcinfo_msr mc_msr[sizeof(void *) * 4]; +}; + +/* Recovery Action flags. Giving recovery result information to DOM0 */ + +/* Xen takes successful recovery action, the error is recovered */ +#define REC_ACTION_RECOVERED (0x1 << 0) +/* No action is performed by XEN */ +#define REC_ACTION_NONE (0x1 << 1) +/* It's possible DOM0 might take action ownership in some case */ +#define REC_ACTION_NEED_RESET (0x1 << 2) + +/* + * Different Recovery Action types, if the action is performed successfully, + * REC_ACTION_RECOVERED flag will be returned. + */ + +/* Page Offline Action */ +#define MC_ACTION_PAGE_OFFLINE (0x1 << 0) +/* CPU offline Action */ +#define MC_ACTION_CPU_OFFLINE (0x1 << 1) +/* L3 cache disable Action */ +#define MC_ACTION_CACHE_SHRINK (0x1 << 2) + +/* + * Below interface used between XEN/DOM0 for passing XEN's recovery action + * information to DOM0. 
+ */ +struct page_offline_action { + /* Params for passing the offlined page number to DOM0 */ + uint64_t mfn; + uint64_t status; +}; + +struct cpu_offline_action { + /* Params for passing the identity of the offlined CPU to DOM0 */ + uint32_t mc_socketid; + uint16_t mc_coreid; + uint16_t mc_core_threadid; +}; + +#define MAX_UNION_SIZE 16 +struct mcinfo_recovery { + struct mcinfo_common common; + uint16_t mc_bank; /* bank nr */ + uint8_t action_flags; + uint8_t action_types; + union { + struct page_offline_action page_retire; + struct cpu_offline_action cpu_offline; + uint8_t pad[MAX_UNION_SIZE]; + } action_info; +}; + + +#define MCINFO_MAXSIZE 768 +struct mc_info { + /* Number of mcinfo_* entries in mi_data */ + uint32_t mi_nentries; + uint32_t flags; + uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8]; +}; +DEFINE_GUEST_HANDLE_STRUCT(mc_info); + +#define __MC_MSR_ARRAYSIZE 8 +#define __MC_MSR_MCGCAP 0 +#define __MC_NMSRS 1 +#define MC_NCAPS 7 +struct mcinfo_logical_cpu { + uint32_t mc_cpunr; + uint32_t mc_chipid; + uint16_t mc_coreid; + uint16_t mc_threadid; + uint32_t mc_apicid; + uint32_t mc_clusterid; + uint32_t mc_ncores; + uint32_t mc_ncores_active; + uint32_t mc_nthreads; + uint32_t mc_cpuid_level; + uint32_t mc_family; + uint32_t mc_vendor; + uint32_t mc_model; + uint32_t mc_step; + char mc_vendorid[16]; + char mc_brandid[64]; + uint32_t mc_cpu_caps[MC_NCAPS]; + uint32_t mc_cache_size; + uint32_t mc_cache_alignment; + uint32_t mc_nmsrvals; + struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE]; +}; +DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu); + +/* + * Prototype: + * uint32_t x86_mcinfo_nentries(struct mc_info *mi); + */ +#define x86_mcinfo_nentries(_mi) \ + ((_mi)->mi_nentries) +/* + * Prototype: + * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi); + */ +#define x86_mcinfo_first(_mi) \ + ((struct mcinfo_common *)(_mi)->mi_data) +/* + * Prototype: + * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic); + */ +#define 
x86_mcinfo_next(_mic) \ + ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)) + +/* + * Prototype: + * void x86_mcinfo_lookup(struct mcinfo_common **ret, struct mc_info *mi, uint16_t type); + */ +static inline void x86_mcinfo_lookup(struct mcinfo_common **ret, + struct mc_info *mi, uint16_t type) +{ + uint32_t i; + struct mcinfo_common *mic; + bool found = 0; + + if (!ret || !mi) + return; + + mic = x86_mcinfo_first(mi); + for (i = 0; i < x86_mcinfo_nentries(mi); i++) { + if (mic->type == type) { + found = 1; + break; + } + mic = x86_mcinfo_next(mic); + } + + *ret = found ? mic : NULL; +} + +/* + * Fetch machine check data from hypervisor. + */ +#define XEN_MC_fetch 1 +struct xen_mc_fetch { + /* + * IN: XEN_MC_NONURGENT, XEN_MC_URGENT, + * XEN_MC_ACK if ack'ing an earlier fetch + * OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA + */ + uint32_t flags; + uint32_t _pad0; + /* OUT: id for ack, IN: id we are ack'ing */ + uint64_t fetch_id; + + /* OUT variables. */ + GUEST_HANDLE(mc_info) data; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch); + + +/* + * This tells the hypervisor to notify a DomU about the machine check error + */ +#define XEN_MC_notifydomain 2 +struct xen_mc_notifydomain { + /* IN variables */ + uint16_t mc_domid; /* The unprivileged domain to notify */ + uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify */ + + /* IN/OUT variables */ + uint32_t flags; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain); + +#define XEN_MC_physcpuinfo 3 +struct xen_mc_physcpuinfo { + /* IN/OUT */ + uint32_t ncpus; + uint32_t _pad0; + /* OUT */ + GUEST_HANDLE(mcinfo_logical_cpu) info; +}; + +#define XEN_MC_msrinject 4 +#define MC_MSRINJ_MAXMSRS 8 +struct xen_mc_msrinject { + /* IN */ + uint32_t mcinj_cpunr; /* target processor id */ + uint32_t mcinj_flags; /* see MC_MSRINJ_F_* below */ + uint32_t mcinj_count; /* 0 .. 
count-1 in array are valid */ + uint32_t _pad0; + struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS]; +}; + +/* Flags for mcinj_flags above; bits 16-31 are reserved */ +#define MC_MSRINJ_F_INTERPOSE 0x1 + +#define XEN_MC_mceinject 5 +struct xen_mc_mceinject { + unsigned int mceinj_cpunr; /* target processor id */ +}; + +struct xen_mc { + uint32_t cmd; + uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */ + union { + struct xen_mc_fetch mc_fetch; + struct xen_mc_notifydomain mc_notifydomain; + struct xen_mc_physcpuinfo mc_physcpuinfo; + struct xen_mc_msrinject mc_msrinject; + struct xen_mc_mceinject mc_mceinject; + } u; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_mc); + +/* Fields are zero when not available */ +struct xen_mce { + __u64 status; + __u64 misc; + __u64 addr; + __u64 mcgstatus; + __u64 ip; + __u64 tsc; /* cpu time stamp counter */ + __u64 time; /* wall time_t when error was detected */ + __u8 cpuvendor; /* cpu vendor as encoded in system.h */ + __u8 inject_flags; /* software inject flags */ + __u16 pad; + __u32 cpuid; /* CPUID 1 EAX */ + __u8 cs; /* code segment */ + __u8 bank; /* machine check bank */ + __u8 cpu; /* cpu number; obsolete; use extcpu now */ + __u8 finished; /* entry is valid */ + __u32 extcpu; /* linux cpu number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial apic ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ +}; + +/* + * This structure contains all data related to the MCE log. Also + * carries a signature to make it easier to find from external + * debugging tools. Each entry is only valid when its finished flag + * is set. 
+ */ + +#define XEN_MCE_LOG_LEN 32 + +struct xen_mce_log { + char signature[12]; /* "MACHINECHECK" */ + unsigned len; /* = XEN_MCE_LOG_LEN */ + unsigned next; + unsigned flags; + unsigned recordlen; /* length of struct xen_mce */ + struct xen_mce entry[XEN_MCE_LOG_LEN]; +}; + +#define XEN_MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ + +#define XEN_MCE_LOG_SIGNATURE "MACHINECHECK" + +#define MCE_GET_RECORD_LEN _IOR('M', 1, int) +#define MCE_GET_LOG_LEN _IOR('M', 2, int) +#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int) + +#endif /* __ASSEMBLY__ */ +#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */ -- cgit v1.2.3-70-g09d2 From a8fccdb0617386695a13ec742a61b5c935b63795 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 7 Jun 2012 19:58:50 +0800 Subject: x86, MCE, AMD: Adjust initcall sequence for xen there are 3 funcs which need to be _initcalled in a logic sequence: 1. xen_late_init_mcelog 2. mcheck_init_device 3. threshold_init_device xen_late_init_mcelog must register xen_mce_chrdev_device before native mce_chrdev_device registration if running under xen platform; mcheck_init_device should be inited before threshold_init_device to initialize mce_device, otherwise a NULL ptr dereference will cause panic. so we use following _initcalls 1. device_initcall(xen_late_init_mcelog); 2. device_initcall_sync(mcheck_init_device); 3. late_initcall(threshold_init_device); when running under xen, the initcall order is 1,2,3; on baremetal, we skip 1 and we do only 2 and 3. 
Acked-and-tested-by: Borislav Petkov Suggested-by: Konrad Rzeszutek Wilk Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f4873a64f46..be527449042 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -777,4 +777,24 @@ static __init int threshold_init_device(void) return 0; } -device_initcall(threshold_init_device); +/* + * there are 3 funcs which need to be _initcalled in a logic sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before + * native mce_chrdev_device registration if running under xen platform; + * + * mcheck_init_device should be inited before threshold_init_device to + * initialize mce_device, otherwise a NULL ptr dereference will cause panic. + * + * so we use following _initcalls + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * when running under xen, the initcall order is 1,2,3; + * on baremetal, we skip 1 and we do only 2 and 3. 
+ */ +late_initcall(threshold_init_device); -- cgit v1.2.3-70-g09d2 From 05e36006adc3046f86f2be8652a22d5f77ebd6ea Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 7 Jun 2012 20:10:37 +0800 Subject: xen/mce: Register native mce handler as vMCE bounce back point When Xen hypervisor inject vMCE to guest, use native mce handler to handle it Signed-off-by: Ke, Liping Signed-off-by: Jiang, Yunhong Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9a6346865c4..0c1ab43f667 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -626,8 +626,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, /* * Look for known traps using IST, and substitute them * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault and - * machine_check, so we should never see them. Warn if + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if * there's an unexpected IST-using fault handler. */ if (addr == (unsigned long)debug) @@ -642,7 +642,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, return 0; #ifdef CONFIG_X86_MCE } else if (addr == (unsigned long)machine_check) { - return 0; + /* + * when xen hypervisor inject vMCE to guest, + * use native mce handler to handle it + */ + ; #endif } else { /* Some other trap using IST? 
*/ -- cgit v1.2.3-70-g09d2 From d095d43e78dd811d5c02c25e207c3364019b5a77 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:05 +0100 Subject: xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable In xen_set_pte() if batching is unavailable (because the caller is in an interrupt context such as handling a page fault) it would fall back to using native_set_pte() and trapping and emulating the PTE write. On 32-bit guests this requires two traps for each PTE write (one for each dword of the PTE). Instead, do one mmu_update hypercall directly. During construction of the initial page tables, continue to use native_set_pte() because most of the PTEs being set are in writable and unpinned pages (see phys_pmd_init() in arch/x86/mm/init_64.c) and using a hypercall for this is very expensive. This significantly improves page fault performance in 32-bit PV guests. lmbench3 test Before After Improvement ---------------------------------------------- lat_pagefault 3.18 us 2.32 us 27% lat_proc fork 356 us 313.3 us 11% Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785631c..3f1783a79a3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) { - if (!xen_batched_set_pte(ptep, pteval)) - native_set_pte(ptep, pteval); + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. 
+ */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } } static void xen_set_pte(pte_t *ptep, pte_t pteval) @@ -1416,13 +1428,21 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) } #endif /* CONFIG_X86_64 */ -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { pte = mask_rw_pte(ptep, pte); - xen_set_pte(ptep, pte); + native_set_pte(ptep, pte); } static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -- cgit v1.2.3-70-g09d2 From 66a27dde9ae96e35278983f2e59bea04eb714cd0 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:06 +0100 Subject: xen/mm: zero PTEs for non-present MFNs in the initial page table When constructing the initial page tables, if the MFN for a usable PFN is missing in the p2m then that frame is initially ballooned out. In this case, zero the PTE (as in decrease_reservation() in drivers/xen/balloon.c). This is obviously safe instead of having an valid PTE with an MFN of INVALID_P2M_ENTRY (~0). 
Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f1783a79a3..27336dfcda8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1432,6 +1432,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) * Init-time set_pte while constructing initial pagetables, which * doesn't allow RO page table pages to be remapped RW. * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * * Many of these PTE updates are done on unpinned and writable pages * and doing a hypercall for these is unnecessary and expensive. At * this point it is not possible to tell if a page is pinned or not, @@ -1440,7 +1444,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { - pte = mask_rw_pte(ptep, pte); + if (pte_mfn(pte) != INVALID_P2M_ENTRY) + pte = mask_rw_pte(ptep, pte); + else + pte = __pte_ma(0); native_set_pte(ptep, pte); } -- cgit v1.2.3-70-g09d2 From 59290362da587ce2e6d2f4a7f85e362fa2d7fd39 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:07 +0100 Subject: xen/x86: add desc_equal() to compare GDT descriptors Signed-off-by: David Vrabel [v1: Moving it to the Xen file] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0c1ab43f667..225ffdc3c4c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -539,6 +539,12 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG(); } +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; +} + 
static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { -- cgit v1.2.3-70-g09d2 From 1c32cdc633c96a14cca35de9db19338354c25111 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:08 +0100 Subject: xen/x86: avoid updating TLS descriptors if they haven't changed When switching tasks in a Xen PV guest, avoid updating the TLS descriptors if they haven't changed. This improves the speed of context switches by almost 10% as much of the time the descriptors are the same or only one is different. The descriptors written into the GDT by Xen are modified from the values passed in the update_descriptor hypercall so we keep shadow copies of the three TLS descriptors to compare against. lmbench3 test Before After Improvement -------------------------------------------- lat_ctx -s 32 24 7.19 6.52 9% lat_pipe 12.56 11.66 7% Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 225ffdc3c4c..9c808693afa 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -125,6 +125,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. 
+ */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -548,9 +561,19 @@ static inline bool desc_equal(const struct desc_struct *d1, static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - struct multicall_space mc = __xen_mc_entry(0); + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_table(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); } -- cgit v1.2.3-70-g09d2 From 4648da7cb4079e263eaf4dcd3b10fdb2409d4ad6 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 17 Jul 2012 11:57:41 +0200 Subject: xen: remove cast from HYPERVISOR_shared_info assignment Both have type struct shared_info so no cast is needed. Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9c808693afa..affa94b29d0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback); * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. 
*/ -struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all -- cgit v1.2.3-70-g09d2 From 4ff2d06255461390ad685843d0d7364aaa6642d2 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 17 Jul 2012 11:59:15 +0200 Subject: xen: simplify init_hvm_pv_info init_hvm_pv_info is called only in PVonHVM context, move it into ifdef. init_hvm_pv_info does not fail, make it a void function. remove arguments from init_hvm_pv_info because they are not used by the caller. Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 58 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index affa94b29d0..f1814fc2cb7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1471,32 +1471,6 @@ asmlinkage void __init xen_start_kernel(void) #endif } -static int init_hvm_pv_info(int *major, int *minor) -{ - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; - - base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); - - *major = eax >> 16; - *minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); - - cpuid(base + 2, &pages, &msr, &ecx, &edx); - - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - xen_setup_features(); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; - - return 0; -} - void __ref xen_hvm_init_shared_info(void) { int cpu; @@ -1529,6 +1503,31 @@ void __ref xen_hvm_init_shared_info(void) } #ifdef CONFIG_XEN_PVHVM +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + major = eax >> 16; + minor = 
eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info.name = "Xen HVM"; + + xen_domain_type = XEN_HVM_DOMAIN; +} + static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1551,12 +1550,7 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { static void __init xen_hvm_guest_init(void) { - int r; - int major, minor; - - r = init_hvm_pv_info(&major, &minor); - if (r < 0) - return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); -- cgit v1.2.3-70-g09d2 From 00e37bdb0113a98408de42db85be002f21dbffd3 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 17 Jul 2012 17:43:35 +0200 Subject: xen PVonHVM: move shared_info to MMIO before kexec Currently kexec in a PVonHVM guest fails with a triple fault because the new kernel overwrites the shared info page. The exact failure depends on the size of the kernel image. This patch moves the pfn from RAM into MMIO space before the kexec boot. The pfn containing the shared_info is located somewhere in RAM. This will cause trouble if the current kernel is doing a kexec boot into a new kernel. The new kernel (and its startup code) can not know where the pfn is, so it can not reserve the page. The hypervisor will continue to update the pfn, and as a result memory corruption occours in the new kernel. One way to work around this issue is to allocate a page in the xen-platform pci device's BAR memory range. But pci init is done very late and the shared_info page is already in use very early to read the pvclock. So moving the pfn from RAM to MMIO is racy because some code paths on other vcpus could access the pfn during the small window when the old pfn is moved to the new pfn. There is even a small window were the old pfn is not backed by a mfn, and during that time all reads return -1. 
Because it is not known upfront where the MMIO region is located it can not be used right from the start in xen_hvm_init_shared_info. To minimise trouble the move of the pfn is done shortly before kexec. This does not eliminate the race because all vcpus are still online when the syscore_ops will be called. But hopefully there is no work pending at this point in time. Also the syscore_op is run last which reduces the risk further. Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 118 ++++++++++++++++++++++++++++++++++++++++----- arch/x86/xen/suspend.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- drivers/xen/platform-pci.c | 15 ++++++ include/xen/events.h | 2 + 5 files changed, 126 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f1814fc2cb7..a6f8acbdfc9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -1471,38 +1472,130 @@ asmlinkage void __init xen_start_kernel(void) #endif } -void __ref xen_hvm_init_shared_info(void) +#ifdef CONFIG_XEN_PVHVM +/* + * The pfn containing the shared_info is located somewhere in RAM. This + * will cause trouble if the current kernel is doing a kexec boot into a + * new kernel. The new kernel (and its startup code) can not know where + * the pfn is, so it can not reserve the page. The hypervisor will + * continue to update the pfn, and as a result memory corruption occurs + * in the new kernel. + * + * One way to work around this issue is to allocate a page in the + * xen-platform pci device's BAR memory range. But pci init is done very + * late and the shared_info page is already in use very early to read + * the pvclock. So moving the pfn from RAM to MMIO is racy because some + * code paths on other vcpus could access the pfn during the small + * window when the old pfn is moved to the new pfn. 
There is even a + * small window were the old pfn is not backed by a mfn, and during that + * time all reads return -1. + * + * Because it is not known upfront where the MMIO region is located it + * can not be used right from the start in xen_hvm_init_shared_info. + * + * To minimise trouble the move of the pfn is done shortly before kexec. + * This does not eliminate the race because all vcpus are still online + * when the syscore_ops will be called. But hopefully there is no work + * pending at this point in time. Also the syscore_op is run last which + * reduces the risk further. + */ + +static struct shared_info *xen_hvm_shared_info; + +static void xen_hvm_connect_shared_info(unsigned long pfn) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page = 0; - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = pfn; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; +} +static void xen_hvm_set_shared_info(struct shared_info *sip) +{ + int cpu; + + HYPERVISOR_shared_info = sip; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and + * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_set_shared_info is run at resume time too and * in that case multiple vcpus might be online. 
*/ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -#ifdef CONFIG_XEN_PVHVM +/* Reconnect the shared_info pfn to a mfn */ +void xen_hvm_resume_shared_info(void) +{ + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); +} + +#ifdef CONFIG_KEXEC +static struct shared_info *xen_hvm_shared_info_kexec; +static unsigned long xen_hvm_shared_info_pfn_kexec; + +/* Remember a pfn in MMIO space for kexec reboot */ +void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) +{ + xen_hvm_shared_info_kexec = sip; + xen_hvm_shared_info_pfn_kexec = pfn; +} + +static void xen_hvm_syscore_shutdown(void) +{ + struct xen_memory_reservation reservation = { + .domid = DOMID_SELF, + .nr_extents = 1, + }; + unsigned long prev_pfn; + int rc; + + if (!xen_hvm_shared_info_kexec) + return; + + prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; + set_xen_guest_handle(reservation.extent_start, &prev_pfn); + + /* Move pfn to MMIO, disconnects previous pfn from mfn */ + xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); + + /* Update pointers, following hypercall is also a memory barrier */ + xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); + + /* Allocate new mfn for previous pfn */ + do { + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc == 0) + msleep(123); + } while (rc == 0); + + /* Make sure the previous pfn is really connected to a (new) mfn */ + BUG_ON(rc != 1); +} + +static struct syscore_ops xen_hvm_syscore_ops = { + .shutdown = xen_hvm_syscore_shutdown, +}; +#endif + +/* Use a pfn in RAM, may move to MMIO before kexec. 
*/ +static void __init xen_hvm_init_shared_info(void) +{ + /* Remember pointer for resume */ + xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); + xen_hvm_set_shared_info(xen_hvm_shared_info); +} + static void __init init_hvm_pv_info(void) { int major, minor; @@ -1553,6 +1646,9 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); xen_hvm_init_shared_info(); +#ifdef CONFIG_KEXEC + register_syscore_ops(&xen_hvm_syscore_ops); +#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8c226..ae8a00c39de 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + xen_hvm_resume_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 202d4c15015..1e4329e04e0 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_init_shared_info(void); +void xen_hvm_resume_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 97ca359ae2b..d4c50d63acb 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -101,6 +101,19 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } +static void __devinit prepare_shared_info(void) +{ +#ifdef CONFIG_KEXEC + unsigned long addr; + struct shared_info *hvm_shared_info; + + addr = alloc_xen_mmio(PAGE_SIZE); + hvm_shared_info = ioremap(addr, PAGE_SIZE); + memset(hvm_shared_info, 0, PAGE_SIZE); + 
xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT); +#endif +} + static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -138,6 +151,8 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, platform_mmio = mmio_addr; platform_mmiolen = mmio_len; + prepare_shared_info(); + if (!xen_have_vector_callback) { ret = xen_allocate_irq(pdev); if (ret) { diff --git a/include/xen/events.h b/include/xen/events.h index 04399b28e82..9c641deb65d 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -58,6 +58,8 @@ void notify_remote_via_irq(int irq); void xen_irq_resume(void); +void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn); + /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); void xen_set_irq_pending(int irq); -- cgit v1.2.3-70-g09d2 From c3d93f880197953f86ab90d9da4744e926b38e33 Mon Sep 17 00:00:00 2001 From: "zhenzhong.duan" Date: Wed, 18 Jul 2012 13:06:39 +0800 Subject: xen: populate correct number of pages when across mem boundary (v2) When populate pages across a mem boundary at bootup, the page count populated isn't correct. This is due to mem populated to non-mem region and ignored. Pfn range is also wrongly aligned when mem boundary isn't page aligned. 
For a dom0 booted with dom_mem=3368952K(0xcd9ff000-4k) dmesg diff is: [ 0.000000] Freeing 9e-100 pfn range: 98 pages freed [ 0.000000] 1-1 mapping on 9e->100 [ 0.000000] 1-1 mapping on cd9ff->100000 [ 0.000000] Released 98 pages of unused memory [ 0.000000] Set 206435 page(s) to 1-1 mapping -[ 0.000000] Populating cd9fe-cda00 pfn range: 1 pages added +[ 0.000000] Populating cd9fe-cd9ff pfn range: 1 pages added +[ 0.000000] Populating 100000-100061 pfn range: 97 pages added [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 000000000009e000 (usable) [ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved) [ 0.000000] Xen: 0000000000100000 - 00000000cd9ff000 (usable) [ 0.000000] Xen: 00000000cd9ffc00 - 00000000cda53c00 (ACPI NVS) ... [ 0.000000] Xen: 0000000100000000 - 0000000100061000 (usable) [ 0.000000] Xen: 0000000100061000 - 000000012c000000 (unusable) ... [ 0.000000] MEMBLOCK configuration: ... -[ 0.000000] reserved[0x4] [0x000000cd9ff000-0x000000cd9ffbff], 0xc00 bytes -[ 0.000000] reserved[0x5] [0x00000100000000-0x00000100060fff], 0x61000 bytes Related xen memory layout: (XEN) Xen-e820 RAM map: (XEN) 0000000000000000 - 000000000009ec00 (usable) (XEN) 00000000000f0000 - 0000000000100000 (reserved) (XEN) 0000000000100000 - 00000000cd9ffc00 (usable) Signed-off-by: Zhenzhong Duan [v2: If xen_do_chunk fail(populate), abort this chunk and any others] Suggested by David, thanks. 
Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a4790bf22c5..ead85576d54 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk( unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { - unsigned long credits = credits_left; unsigned long s_pfn; unsigned long e_pfn; unsigned long pfns; long capacity; - if (credits <= 0) + if (credits_left <= 0) break; if (entry->type != E820_RAM) continue; - e_pfn = PFN_UP(entry->addr + entry->size); + e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after the xen_start_info->nr_pages */ if (e_pfn <= max_pfn) continue; - s_pfn = PFN_DOWN(entry->addr); + s_pfn = PFN_UP(entry->addr); /* If the E820 falls within the nr_pages, we want to start * at the nr_pages PFN. * If that would mean going past the E820 entry, skip it @@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk( capacity = e_pfn - max_pfn; dest_pfn = max_pfn; } else { - /* last_pfn MUST be within E820_RAM regions */ - if (*last_pfn && e_pfn >= *last_pfn) - s_pfn = *last_pfn; capacity = e_pfn - s_pfn; dest_pfn = s_pfn; } - /* If we had filled this E820_RAM entry, go to the next one. 
*/ - if (capacity <= 0) - continue; - if (credits > capacity) - credits = capacity; + if (credits_left < capacity) + capacity = credits_left; - pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); + pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); done += pfns; - credits_left -= pfns; *last_pfn = (dest_pfn + pfns); + if (pfns < capacity) + break; + credits_left -= pfns; } return done; } -- cgit v1.2.3-70-g09d2 From 2e76c2838a2c1c6c5c220410bcd3c0d6d82e4e31 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 19 Jul 2012 22:29:11 +0200 Subject: module.c: spelling s/postition/position/g Signed-off-by: Geert Uytterhoeven Signed-off-by: Jiri Kosina --- arch/m68k/kernel/module.c | 4 ++-- arch/x86/kernel/module.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/m68k/kernel/module.c b/arch/m68k/kernel/module.c index 34849c4c6e3..eb46fd6038c 100644 --- a/arch/m68k/kernel/module.c +++ b/arch/m68k/kernel/module.c @@ -47,7 +47,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_68K_PC32: - /* Add the value, subtract its postition */ + /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; default: @@ -87,7 +87,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, *location = rel[i].r_addend + sym->st_value; break; case R_68K_PC32: - /* Add the value, subtract its postition */ + /* Add the value, subtract its position */ *location = rel[i].r_addend + sym->st_value - (uint32_t)location; break; default: diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f21fd94ac89..772e7ad5dc2 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -73,7 +73,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_386_PC32: - /* Add the value, subtract its postition */ + /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; default: -- cgit 
v1.2.3-70-g09d2 From 1a577b72475d161b6677c05abe57301362023bb2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Jul 2012 13:45:20 +0300 Subject: KVM: fix race with level interrupts When more than 1 source id is in use for the same GSI, we have the following race related to handling irq_states: CPU 0 clears bit 0. CPU 0 reads irq_state as 0. CPU 1 sets level to 1. CPU 1 calls kvm_ioapic_set_irq(1). CPU 0 calls kvm_ioapic_set_irq(0). Now ioapic thinks the level is 0 but irq_state is not 0. Fix by performing all irq_states bitmap handling under pic/ioapic lock. This also removes the need for atomics with irq_states handling. Reported-by: Gleb Natapov Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 15 ++++++++++++++- arch/x86/kvm/i8259.c | 17 ++++++++++++++--- virt/kvm/ioapic.c | 19 ++++++++++++++++--- virt/kvm/ioapic.h | 4 +++- virt/kvm/irq_comm.c | 31 ++++--------------------------- 5 files changed, 51 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a3e9409e90b..2c75b400e40 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -816,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); -int kvm_pic_set_irq(void *opaque, int irq, int level); +static inline int __kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + __set_bit(irq_source_id, irq_state); + else + __clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + +int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); +void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff 
--git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 81cf4fa4a2b..1df8fb9e1d5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -int kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - struct kvm_pic *s = opaque; int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + int irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); @@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) return ret; } +void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) +{ + int i; + + pic_lock(s); + for (i = 0; i < PIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &s->irq_states[i]); + pic_unlock(s); +} + /* * acknowledge interrupt 'irq' */ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 26fd54dc459..ef61d529a6c 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -191,7 +191,8 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); } -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level) { u32 old_irr; u32 mask = 1 << irq; @@ -201,9 +202,11 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) spin_lock(&ioapic->lock); old_irr = ioapic->irr; if (irq >= 0 && irq < IOAPIC_NUM_PINS) { + int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], + irq_source_id, level); entry = ioapic->redirtbl[irq]; - level ^= entry.fields.polarity; - if (!level) + irq_level ^= entry.fields.polarity; + if (!irq_level) 
ioapic->irr &= ~mask; else { int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); @@ -221,6 +224,16 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) return ret; } +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) +{ + int i; + + spin_lock(&ioapic->lock); + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &ioapic->irq_states[i]); + spin_unlock(&ioapic->lock); +} + static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, int trigger_mode) { diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 32872a09b63..a30abfe6ed1 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -74,7 +74,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level); +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a6a0365475e..cc59c68da03 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -33,26 +33,12 @@ #include "ioapic.h" -static inline int kvm_irq_line_state(unsigned long *irq_state, - int irq_source_id, int level) -{ - /* Logical OR for level trig interrupt */ - if (level) - set_bit(irq_source_id, irq_state); - else - clear_bit(irq_source_id, irq_state); - - return !!(*irq_state); -} - static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level) { #ifdef CONFIG_X86 struct kvm_pic *pic = pic_irqchip(kvm); - level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin], - 
irq_source_id, level); - return kvm_pic_set_irq(pic, e->irqchip.pin, level); + return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); #else return -1; #endif @@ -62,10 +48,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin], - irq_source_id, level); - - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level); + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level); } inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) @@ -249,8 +232,6 @@ unlock: void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) { - int i; - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); mutex_lock(&kvm->irq_lock); @@ -263,14 +244,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) if (!irqchip_in_kernel(kvm)) goto unlock; - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) { - clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]); - if (i >= 16) - continue; + kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); #ifdef CONFIG_X86 - clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]); + kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); #endif - } unlock: mutex_unlock(&kvm->irq_lock); } -- cgit v1.2.3-70-g09d2 From 30d5c4546a7dae29a1aa76abdb69a78bb00136be Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Jul 2012 13:35:06 -0700 Subject: x86, cpufeature: Add the RDSEED and ADX features Add the RDSEED and ADX features documented in section 9.1 of the Intel Architecture Instruction Set Extensions Programming Reference, document 319433, version 013b, available from http://software.intel.com/en-us/avx/ The PREFETCHW bit is already supported in Linux under the name 3DNOWPREFETCH. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/n/tip-lgr6482ufk1bvxzvc2hr8qbp@git.kernel.org --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f91e80f4f18..6b7ee5ff682 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -207,6 +207,8 @@ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3-70-g09d2 From 7efa1c87963d23cc57ba40c07316d3e28cc75a3a Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 20 Jul 2012 09:18:23 +0800 Subject: x86/tlb: Fix build warning and crash when building for !SMP The incompatible parameter of flush_tlb_mm_range causes a build warning. Fix it by correcting the parameter. Ingo Molnar found that this could also cause a user space crash. Reported-by: Tetsuo Handa Reported-by: Ingo Molnar Signed-off-by: Alex Shi Link: http://lkml.kernel.org/r/1342747103-19765-1-git-send-email-alex.shi@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index b5a27bd7766..74a44333545 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -105,10 +105,10 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, __flush_tlb(); } -static inline void flush_tlb_mm_range(struct vm_area_struct *vma, +static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { - if (vma->vm_mm == current->active_mm) + if (mm == current->active_mm) __flush_tlb(); } -- cgit v1.2.3-70-g09d2 From 9ca8f72a9297f2052d806bd1111e176533aa69bd Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 19 Jul 2012 10:23:48 +0100 Subject: x86, efi: Handover Protocol As things currently stand, traditional EFI boot loaders and the EFI boot stub are carrying essentially the same initialisation code required to setup an EFI machine for booting a kernel. There's really no need to have this code in two places and the hope is that, with this new protocol, initialisation and booting of the kernel can be left solely to the kernel's EFI boot stub. The responsibilities of the boot loader then become, o Loading the kernel image from boot media File system code still needs to be carried by boot loaders for the scenario where the kernel and initrd files reside on a file system that the EFI firmware doesn't natively understand, such as ext4, etc. o Providing a user interface Boot loaders still need to display any menus/interfaces, for example to allow the user to select from a list of kernels. Bump the boot protocol number because we added the 'handover_offset' field to indicate the location of the handover protocol entry point. Cc: H. 
Peter Anvin Cc: Peter Jones Cc: Ingo Molnar Signed-off-by: Matt Fleming Acked-and-Tested-by: Matthew Garrett Link: http://lkml.kernel.org/r/1342689828-16815-1-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- Documentation/x86/boot.txt | 41 ++++++++ arch/x86/boot/compressed/eboot.c | 198 ++++++++++++++++++++++--------------- arch/x86/boot/compressed/head_32.S | 10 ++ arch/x86/boot/compressed/head_64.S | 10 ++ arch/x86/boot/header.S | 4 +- arch/x86/include/asm/bootparam.h | 1 + 6 files changed, 185 insertions(+), 79 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 7c3a8801b7c..c6539a4278b 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -54,6 +54,9 @@ Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment beyond the kernel_alignment added, new init_size and pref_address fields. Added extended boot loader IDs. +Protocol 2.11: (Kernel 3.6) Added a field for offset of EFI handover + protocol entry point. + **** MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -189,6 +192,7 @@ Offset Proto Name Meaning of struct setup_data 0258/8 2.10+ pref_address Preferred loading address 0260/4 2.10+ init_size Linear memory required during initialization +0264/4 2.11+ handover_offset Offset of handover entry point (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -690,6 +694,16 @@ Offset/size: 0x260/4 else runtime_start = pref_address +Field name: handover_offset +Type: read +Offset/size: 0x264/4 + + This field is the offset from the beginning of the kernel image to + the EFI handover protocol entry point. Boot loaders using the EFI + handover protocol to boot the kernel should jump to this offset. + + See EFI HANDOVER PROTOCOL below for more details. 
+ **** THE IMAGE CHECKSUM @@ -1010,3 +1024,30 @@ segment; __BOOS_CS must have execute/read permission, and __BOOT_DS must have read/write permission; CS must be __BOOT_CS and DS, ES, SS must be __BOOT_DS; interrupt must be disabled; %esi must hold the base address of the struct boot_params; %ebp, %edi and %ebx must be zero. + +**** EFI HANDOVER PROTOCOL + +This protocol allows boot loaders to defer initialisation to the EFI +boot stub. The boot loader is required to load the kernel/initrd(s) +from the boot media and jump to the EFI handover protocol entry point +which is hdr->handover_offset bytes from the beginning of +startup_{32,64}. + +The function prototype for the handover entry point looks like this, + + efi_main(void *handle, efi_system_table_t *table, struct boot_params *bp) + +'handle' is the EFI image handle passed to the boot loader by the EFI +firmware, 'table' is the EFI system table - these are the first two +arguments of the "handoff state" as described in section 2.3 of the +UEFI specification. 'bp' is the boot loader-allocated boot params. + +The boot loader *must* fill out the following fields in bp, + + o hdr.code32_start + o hdr.cmd_line_ptr + o hdr.cmdline_size + o hdr.ramdisk_image (if applicable) + o hdr.ramdisk_size (if applicable) + +All other fields should be zero. diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 4e85f5f8583..b3e0227df2c 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -729,32 +729,68 @@ fail: * need to create one ourselves (usually the bootloader would create * one for us). 
*/ -static efi_status_t make_boot_params(struct boot_params *boot_params, - efi_loaded_image_t *image, - void *handle) +struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) { - struct efi_info *efi = &boot_params->efi_info; - struct apm_bios_info *bi = &boot_params->apm_bios_info; - struct sys_desc_table *sdt = &boot_params->sys_desc_table; - struct e820entry *e820_map = &boot_params->e820_map[0]; - struct e820entry *prev = NULL; - struct setup_header *hdr = &boot_params->hdr; - unsigned long size, key, desc_size, _size; - efi_memory_desc_t *mem_map; - void *options = image->load_options; - u32 load_options_size = image->load_options_size / 2; /* ASCII */ + struct boot_params *boot_params; + struct sys_desc_table *sdt; + struct apm_bios_info *bi; + struct setup_header *hdr; + struct efi_info *efi; + efi_loaded_image_t *image; + void *options; + u32 load_options_size; + efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; int options_size = 0; efi_status_t status; - __u32 desc_version; unsigned long cmdline; - u8 nr_entries; u16 *s2; u8 *s1; int i; + sys_table = _table; + + /* Check if we were booted by the EFI firmware */ + if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + return NULL; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + handle, &proto, (void *)&image); + if (status != EFI_SUCCESS) { + efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); + return NULL; + } + + status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc lowmem for boot params\n"); + return NULL; + } + + memset(boot_params, 0x0, 0x4000); + + hdr = &boot_params->hdr; + efi = &boot_params->efi_info; + bi = &boot_params->apm_bios_info; + sdt = &boot_params->sys_desc_table; + + /* Copy the second sector to boot_params */ + memcpy(&hdr->jump, image->image_base + 512, 512); + + /* + * Fill out some of the header fields ourselves because the + * EFI firmware loader 
doesn't load the first sector. + */ + hdr->root_flags = 1; + hdr->vid_mode = 0xffff; + hdr->boot_flag = 0xAA55; + + hdr->code32_start = (__u64)(unsigned long)image->image_base; + hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ + options = image->load_options; + load_options_size = image->load_options_size / 2; /* ASCII */ cmdline = 0; s2 = (u16 *)options; @@ -791,18 +827,36 @@ static efi_status_t make_boot_params(struct boot_params *boot_params, hdr->ramdisk_image = 0; hdr->ramdisk_size = 0; - status = handle_ramdisks(image, hdr); - if (status != EFI_SUCCESS) - goto free_cmdline; - - setup_graphics(boot_params); - /* Clear APM BIOS info */ memset(bi, 0, sizeof(*bi)); memset(sdt, 0, sizeof(*sdt)); - memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); + status = handle_ramdisks(image, hdr); + if (status != EFI_SUCCESS) + goto fail2; + + return boot_params; +fail2: + if (options_size) + low_free(options_size, hdr->cmd_line_ptr); +fail: + low_free(0x4000, (unsigned long)boot_params); + return NULL; +} + +static efi_status_t exit_boot(struct boot_params *boot_params, + void *handle) +{ + struct efi_info *efi = &boot_params->efi_info; + struct e820entry *e820_map = &boot_params->e820_map[0]; + struct e820entry *prev = NULL; + unsigned long size, key, desc_size, _size; + efi_memory_desc_t *mem_map; + efi_status_t status; + __u32 desc_version; + u8 nr_entries; + int i; size = sizeof(*mem_map) * 32; @@ -811,7 +865,7 @@ again: _size = size; status = low_alloc(size, 1, (unsigned long *)&mem_map); if (status != EFI_SUCCESS) - goto free_cmdline; + return status; status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, mem_map, &key, &desc_size, &desc_version); @@ -823,6 +877,7 @@ again: if (status != EFI_SUCCESS) goto free_mem_map; + memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); efi->efi_systab = (unsigned long)sys_table; efi->efi_memdesc_size = desc_size; efi->efi_memdesc_version = desc_version; @@ 
-906,61 +961,13 @@ again: free_mem_map: low_free(_size, (unsigned long)mem_map); -free_cmdline: - if (options_size) - low_free(options_size, hdr->cmd_line_ptr); -fail: return status; } -/* - * On success we return a pointer to a boot_params structure, and NULL - * on failure. - */ -struct boot_params *efi_main(void *handle, efi_system_table_t *_table) +static efi_status_t relocate_kernel(struct setup_header *hdr) { - struct boot_params *boot_params; unsigned long start, nr_pages; - struct desc_ptr *gdt, *idt; - efi_loaded_image_t *image; - struct setup_header *hdr; efi_status_t status; - efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; - struct desc_struct *desc; - - sys_table = _table; - - /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - goto fail; - - status = efi_call_phys3(sys_table->boottime->handle_protocol, - handle, &proto, (void *)&image); - if (status != EFI_SUCCESS) { - efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); - goto fail; - } - - status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc lowmem for boot params\n"); - goto fail; - } - - memset(boot_params, 0x0, 0x4000); - - hdr = &boot_params->hdr; - - /* Copy the second sector to boot_params */ - memcpy(&hdr->jump, image->image_base + 512, 512); - - /* - * Fill out some of the header fields ourselves because the - * EFI firmware loader doesn't load the first sector. 
- */ - hdr->root_flags = 1; - hdr->vid_mode = 0xffff; - hdr->boot_flag = 0xAA55; /* * The EFI firmware loader could have placed the kernel image @@ -978,16 +985,40 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) if (status != EFI_SUCCESS) { status = low_alloc(hdr->init_size, hdr->kernel_alignment, &start); - if (status != EFI_SUCCESS) { + if (status != EFI_SUCCESS) efi_printk("Failed to alloc mem for kernel\n"); - goto fail; - } } + if (status == EFI_SUCCESS) + memcpy((void *)start, (void *)(unsigned long)hdr->code32_start, + hdr->init_size); + + hdr->pref_address = hdr->code32_start; hdr->code32_start = (__u32)start; - hdr->pref_address = (__u64)(unsigned long)image->image_base; - memcpy((void *)start, image->image_base, image->image_size); + return status; +} + +/* + * On success we return a pointer to a boot_params structure, and NULL + * on failure. + */ +struct boot_params *efi_main(void *handle, efi_system_table_t *_table, + struct boot_params *boot_params) +{ + struct desc_ptr *gdt, *idt; + efi_loaded_image_t *image; + struct setup_header *hdr = &boot_params->hdr; + efi_status_t status; + struct desc_struct *desc; + + sys_table = _table; + + /* Check if we were booted by the EFI firmware */ + if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + goto fail; + + setup_graphics(boot_params); status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), @@ -1015,7 +1046,18 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) idt->size = 0; idt->address = 0; - status = make_boot_params(boot_params, image, handle); + /* + * If the kernel isn't already loaded at the preferred load + * address, relocate it. 
+ */ + if (hdr->pref_address != hdr->code32_start) { + status = relocate_kernel(hdr); + + if (status != EFI_SUCCESS) + goto fail; + } + + status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) goto fail; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index c85e3ac99bb..aa4aaf1b238 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -42,6 +42,16 @@ ENTRY(startup_32) */ add $0x4, %esp + call make_boot_params + cmpl $0, %eax + je 1f + movl 0x4(%esp), %esi + movl (%esp), %ecx + pushl %eax + pushl %esi + pushl %ecx + + .org 0x30,0x90 call efi_main cmpl $0, %eax movl %eax, %esi diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 87e03a13d8e..2c4b171eec3 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -209,6 +209,16 @@ ENTRY(startup_64) .org 0x210 mov %rcx, %rdi mov %rdx, %rsi + pushq %rdi + pushq %rsi + call make_boot_params + cmpq $0,%rax + je 1f + mov %rax, %rdx + popq %rsi + popq %rdi + + .org 0x230,0x90 call efi_main movq %rax,%rsi cmpq $0,%rax diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8bbea6aa40d..097f4760fae 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -263,7 +263,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020a # header version number (>= 0x0105) + .word 0x020b # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -381,6 +381,8 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr #define INIT_SIZE VO_INIT_SIZE #endif init_size: .long INIT_SIZE # kernel initialization size +handover_offset: .long 0x30 # offset to the handover + # protocol entry point # End of setup header ##################################################### diff --git a/arch/x86/include/asm/bootparam.h 
b/arch/x86/include/asm/bootparam.h index eb45aa6b1f2..2ad874cb661 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -66,6 +66,7 @@ struct setup_header { __u64 setup_data; __u64 pref_address; __u32 init_size; + __u32 handover_offset; } __attribute__((packed)); struct sys_desc_table { -- cgit v1.2.3-70-g09d2 From 9f4e4392cbf72d731a489a3217fe810820b8ba96 Mon Sep 17 00:00:00 2001 From: Joe Millenbach Date: Thu, 19 Jul 2012 18:04:36 -0700 Subject: x86, boot: Removed quiet flag and switched quiet output to debug flag There are only 3 uses of the quiet flag and they all protect output that is only useful for debugging the stub, therefore we switched to using the debug flag for all extra output. Signed-off-by: Joe Millenbach Link: http://lkml.kernel.org/r/1342746282-28497-2-git-send-email-jmillenbach@gmail.com Signed-off-by: Gokul Caushik Reviewed-by: Josh Triplett Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7116dcba0c9..8f2355d5858 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -108,7 +108,6 @@ static void error(char *m); * This is set up by the setup-routine at boot-time */ struct boot_params *real_mode; /* Pointer to real-mode data */ -static int quiet; static int debug; void *memset(void *s, int c, size_t n); @@ -294,7 +293,7 @@ static void parse_elf(void *output) return; } - if (!quiet) + if (debug) putstr("Parsing ELF... 
"); phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); @@ -332,8 +331,6 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; - if (cmdline_find_option_bool("quiet")) - quiet = 1; if (cmdline_find_option_bool("debug")) debug = 1; @@ -369,11 +366,11 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, error("Wrong destination address"); #endif - if (!quiet) + if (debug) putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); - if (!quiet) + if (debug) putstr("done.\nBooting the kernel.\n"); return; } -- cgit v1.2.3-70-g09d2 From e605a425975b073aafebbb2c09d3ae266be2fd3e Mon Sep 17 00:00:00 2001 From: Joe Millenbach Date: Thu, 19 Jul 2012 18:04:37 -0700 Subject: x86, boot: Wrap debug printing in a new debug_putstr function Change all instances of if (debug) putstr(...) to a new debug_putstr(...). This allows a future change to conditionally stub out debug_putstr to save space. Signed-off-by: Joe Millenbach Link: http://lkml.kernel.org/r/1342746282-28497-3-git-send-email-jmillenbach@gmail.com Signed-off-by: Gokul Caushik Reviewed-by: Josh Triplett Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8f2355d5858..49c6d5632ef 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -223,6 +223,12 @@ void __putstr(int error, const char *s) outb(0xff & (pos >> 1), vidport+1); } +static void debug_putstr(const char *s) +{ + if (debug) + putstr(s); +} + void *memset(void *s, int c, size_t n) { int i; @@ -293,8 +299,7 @@ static void parse_elf(void *output) return; } - if (debug) - putstr("Parsing ELF... "); + debug_putstr("Parsing ELF... 
"); phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); if (!phdrs) @@ -346,8 +351,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, cols = real_mode->screen_info.orig_video_cols; console_init(); - if (debug) - putstr("early console in decompress_kernel\n"); + debug_putstr("early console in decompress_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -366,11 +370,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, error("Wrong destination address"); #endif - if (debug) - putstr("\nDecompressing Linux... "); + debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); - if (debug) - putstr("done.\nBooting the kernel.\n"); + debug_putstr("done.\nBooting the kernel.\n"); return; } -- cgit v1.2.3-70-g09d2 From cb454fe10400566214ec690318a0167ff7f5b8ca Mon Sep 17 00:00:00 2001 From: Joe Millenbach Date: Thu, 19 Jul 2012 18:04:38 -0700 Subject: x86, boot: Changed error putstr path to match new debug_putstr format For consistency we changed the error output path to match the new debug path. Signed-off-by: Joe Millenbach Link: http://lkml.kernel.org/r/1342746282-28497-4-git-send-email-jmillenbach@gmail.com Signed-off-by: Gokul Caushik Reviewed-by: Josh Triplett Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/misc.c | 6 +++--- arch/x86/boot/compressed/misc.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 49c6d5632ef..de1d54d8bdd 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -270,9 +270,9 @@ void *memcpy(void *dest, const void *src, size_t n) static void error(char *x) { - __putstr(1, "\n\n"); - __putstr(1, x); - __putstr(1, "\n\n -- System halted"); + error_putstr("\n\n"); + error_putstr(x); + error_putstr("\n\n -- System halted"); while (1) asm("hlt"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3f19c81a620..4c1bfb69e0d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -26,6 +26,7 @@ extern struct boot_params *real_mode; /* Pointer to real-mode data */ void __putstr(int error, const char *s); #define putstr(__x) __putstr(0, __x) +#define error_putstr(__x) __putstr(1, __x) #define puts(__x) __putstr(0, __x) /* cmdline.c */ -- cgit v1.2.3-70-g09d2 From 7aac3015b533add3e85222f9fd2ab66216b38746 Mon Sep 17 00:00:00 2001 From: Joe Millenbach Date: Thu, 19 Jul 2012 18:04:39 -0700 Subject: x86, boot: Switch output functions from command-line flags to conditional compilation Changed putstr flagging from parameter to conditional compilation for puts, debug_putstr, and error_putstr. This allows for space savings since most configurations won't use this feature. Signed-off-by: Joe Millenbach Link: http://lkml.kernel.org/r/1342746282-28497-5-git-send-email-jmillenbach@gmail.com Signed-off-by: Gokul Caushik Reviewed-by: Josh Triplett Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/misc.c | 12 +----------- arch/x86/boot/compressed/misc.h | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index de1d54d8bdd..8c29f82b15e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -169,15 +169,11 @@ static void serial_putchar(int ch) outb(ch, early_serial_base + TXR); } -void __putstr(int error, const char *s) +void __putstr(const char *s) { int x, y, pos; char c; -#ifndef CONFIG_X86_VERBOSE_BOOTUP - if (!error) - return; -#endif if (early_serial_base) { const char *str = s; while (*str) { @@ -223,12 +219,6 @@ void __putstr(int error, const char *s) outb(0xff & (pos >> 1), vidport+1); } -static void debug_putstr(const char *s) -{ - if (debug) - putstr(s); -} - void *memset(void *s, int c, size_t n) { int i; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 4c1bfb69e0d..618e5c830f1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -24,10 +24,19 @@ /* misc.c */ extern struct boot_params *real_mode; /* Pointer to real-mode data */ -void __putstr(int error, const char *s); -#define putstr(__x) __putstr(0, __x) -#define error_putstr(__x) __putstr(1, __x) -#define puts(__x) __putstr(0, __x) +void __putstr(const char *s); +#define error_putstr(__x) __putstr(__x) + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + +#define debug_putstr(__x) __putstr(__x) + +#else + +static inline void debug_putstr(const char *s) +{ } + +#endif /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); -- cgit v1.2.3-70-g09d2 From 641a1cebfe2f05fa1a48503d816fc70cf707d033 Mon Sep 17 00:00:00 2001 From: Joe Millenbach Date: Thu, 19 Jul 2012 18:04:40 -0700 Subject: x86, boot: Removed unused debug flag and set code As we're no longer using the flag we don't need to extract the value from the command 
line and store it. This is a step towards removing command line parameter code. Signed-off-by: Joe Millenbach Link: http://lkml.kernel.org/r/1342746282-28497-6-git-send-email-jmillenbach@gmail.com Signed-off-by: Gokul Caushik Reviewed-by: Josh Triplett Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8c29f82b15e..88f7ff6da40 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -108,7 +108,6 @@ static void error(char *m); * This is set up by the setup-routine at boot-time */ struct boot_params *real_mode; /* Pointer to real-mode data */ -static int debug; void *memset(void *s, int c, size_t n); void *memcpy(void *dest, const void *src, size_t n); @@ -326,9 +325,6 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; - if (cmdline_find_option_bool("debug")) - debug = 1; - if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; -- cgit v1.2.3-70-g09d2 From cec49df9d331feaa2fea3d24c07147c7659940d1 Mon Sep 17 00:00:00 2001 From: Joe Millenbach Date: Thu, 19 Jul 2012 18:04:41 -0700 Subject: x86, boot: Exclude early_serial_console.c if can't use it. Removes early_serial_console.c code if we don't have the config option that enables it (EARLY_PRINTK). When disabling this code, make early_serial_base a constant 0 to allow the compiler to optimize away the code that checks for early_serial_base. Signed-off-by: Joe Millenbach Link: http://lkml.kernel.org/r/1342746282-28497-7-git-send-email-jmillenbach@gmail.com Signed-off-by: Gokul Caushik Reviewed-by: Josh Triplett Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/early_serial_console.c | 4 ++++ arch/x86/boot/compressed/misc.h | 10 ++++++++++ 2 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c index 261e81fb958..d3d003cb548 100644 --- a/arch/x86/boot/compressed/early_serial_console.c +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -1,5 +1,9 @@ #include "misc.h" +#ifdef CONFIG_EARLY_PRINTK + int early_serial_base; #include "../early_serial_console.c" + +#endif diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 618e5c830f1..3ffee6e0c54 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -43,7 +43,17 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); /* early_serial_console.c */ +#ifdef CONFIG_EARLY_PRINTK + extern int early_serial_base; void console_init(void); +#else + +static const int early_serial_base; +static inline void console_init(void) +{ } + +#endif + #endif -- cgit v1.2.3-70-g09d2 From bd448d4d0a1bd88dc6fdc41217b2c25383fa8529 Mon Sep 17 00:00:00 2001 From: Gokul Caushik Date: Thu, 19 Jul 2012 18:04:42 -0700 Subject: x86, boot: Exclude cmdline.c if you can't use it CONFIG_EARLY_PRINTK is the only feature that might use command line parsing in the decompression stage. If it is disabled then we can exclude the related code to save space. This can result in an estimated space savings of 2240 bytes from the compressed kernel image. Signed-off-by: Joe Millenbach Link: http://lkml.kernel.org/r/1342746282-28497-8-git-send-email-jmillenbach@gmail.com Signed-off-by: Gokul Caushik Reviewed-by: Josh Triplett Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/cmdline.c | 4 ++++ arch/x86/boot/compressed/misc.h | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index cb62f786990..10f6b1178c6 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,5 +1,7 @@ #include "misc.h" +#ifdef CONFIG_EARLY_PRINTK + static unsigned long fs; static inline void set_fs(unsigned long seg) { @@ -19,3 +21,5 @@ int cmdline_find_option_bool(const char *option) { return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); } + +#endif diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3ffee6e0c54..0e6dc0ee0ee 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -38,18 +38,19 @@ static inline void debug_putstr(const char *s) #endif +#ifdef CONFIG_EARLY_PRINTK + /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); /* early_serial_console.c */ -#ifdef CONFIG_EARLY_PRINTK - extern int early_serial_base; void console_init(void); #else +/* early_serial_console.c */ static const int early_serial_base; static inline void console_init(void) { } -- cgit v1.2.3-70-g09d2 From 36d93d88a5396baa135f8bcde7b8501dfe3b8e53 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Jun 2012 16:25:19 +0200 Subject: Revert "x86/early_printk: Replace obsolete simple_strtoul() usage with kstrtoint()" This reverts commit fbd24153c48b8425b09c161a020483cd77da870e. This commit is subtly buggy: kstrto*int() can return an error but it's not checked in every path. simple_strtoul() on the other hand could not fail, so this patch subtly introduces new failure modes.
Signed-off-by: Shuah Khan Link: http://lkml.kernel.org/r/1338424803.3569.5.camel@lorien2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 5e4771266f1..9b9f18b4991 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -119,7 +119,7 @@ static __init void early_serial_init(char *s) unsigned char c; unsigned divisor; unsigned baud = DEFAULT_BAUD; - ssize_t ret; + char *e; if (*s == ',') ++s; @@ -127,14 +127,14 @@ static __init void early_serial_init(char *s) if (*s) { unsigned port; if (!strncmp(s, "0x", 2)) { - ret = kstrtoint(s, 16, &early_serial_base); + early_serial_base = simple_strtoul(s, &e, 16); } else { static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; - ret = kstrtouint(s, 10, &port); - if (ret || port > 1) + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) port = 0; early_serial_base = bases[port]; } @@ -149,8 +149,8 @@ static __init void early_serial_init(char *s) outb(0x3, early_serial_base + MCR); /* DTR + RTS */ if (*s) { - ret = kstrtouint(s, 0, &baud); - if (ret || baud == 0) + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) baud = DEFAULT_BAUD; } -- cgit v1.2.3-70-g09d2 From ee08d1284ea9235b29bd2d9b7493b4b4cf3da09c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jun 2012 15:24:45 +0200 Subject: sched/x86: Remove broken power estimation The x86 sched power implementation has been broken forever and gets in the way of other stuff, remove it. [ For archaeological interest, fixing this code would require dealing with the cross-cpu calling of these functions and more importantly, we need to filter idle time out of the a/m-perf stuff because the ratio will go down to 0 when idle, giving a 0 capacity which is not what we'd want. 
] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Link: http://lkml.kernel.org/r/1339594110.8980.38.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/sched.c | 55 -------------------------------------------- 2 files changed, 1 insertion(+), 56 deletions(-) delete mode 100644 arch/x86/kernel/cpu/sched.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2fdfd..c5981267a60 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += vmware.o hypervisor.o mshyperv.o obj-y += rdrand.o obj-y += match.o diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c deleted file mode 100644 index a640ae5ad20..00000000000 --- a/arch/x86/kernel/cpu/sched.c +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#ifdef CONFIG_SMP - -static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); - -static unsigned long scale_aperfmperf(void) -{ - struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); - unsigned long ratio, flags; - - local_irq_save(flags); - get_aperfmperf(&val); - local_irq_restore(flags); - - ratio = calc_aperfmperf_ratio(old, &val); - *old = val; - - return ratio; -} - -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ - /* - * do aperf/mperf on the cpu level because it includes things - * like turbo mode, which are relevant to full cores. 
- */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return scale_aperfmperf(); - - /* - * maybe have something cpufreq here - */ - - return default_scale_freq_power(sd, cpu); -} - -unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) -{ - /* - * aperf/mperf already includes the smt gain - */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return SCHED_LOAD_SCALE; - - return default_scale_smt_power(sd, cpu); -} - -#endif -- cgit v1.2.3-70-g09d2 From d6250a3f12edb3a86db9598ffeca3de8b4a219e9 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Jul 2012 16:28:19 +0100 Subject: x86, nops: Missing break resulting in incorrect selection on Intel The Intel case falls through into the generic case which then changes the values. For cases like the P6 it doesn't do the right thing so this seems to be a screwup. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/n/tip-lww2uirad4skzjlmrm0vru8o@git.kernel.org Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f075..73ef56c5a8b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -219,7 +219,7 @@ void __init arch_init_ideal_nops(void) ideal_nops = intel_nops; #endif } - + break; default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; -- cgit v1.2.3-70-g09d2 From aea218f3cbbcaac249b6b2c98930a00d6d931f1e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 26 Jul 2012 10:00:45 +0300 Subject: KVM: PIC: call ack notifiers for irqs that are dropped from irr After commit 242ec97c358256 PIT interrupts are no longer delivered after PIC reset. It happens because PIT injects interrupt only if previous one was acked, but since on PIC reset it is dropped from irr it will never be delivered and hence acknowledged. Fix that by calling ack notifier on PIC reset.
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1df8fb9e1d5..e498b18f010 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -316,6 +316,11 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { + u8 edge_irr = s->irr & ~s->elcr; + int i; + bool found; + struct kvm_vcpu *vcpu; + s->init4 = val & 1; s->last_irr = 0; s->irr &= s->elcr; @@ -333,6 +338,18 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) if (val & 0x08) pr_pic_unimpl( "level sensitive irq not supported"); + + kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) + if (kvm_apic_accept_pic_intr(vcpu)) { + found = true; + break; + } + + + if (found) + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (edge_irr & (1 << irq)) + pic_clear_isr(s, irq); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; -- cgit v1.2.3-70-g09d2 From 4f3f713fc78d966d81ad87d2f3587369f9b34ae6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 23 Jul 2012 14:23:30 +0800 Subject: perf/x86: Fix typo in format definition of uncore PCU filter The format definition of uncore PCU filter should be filter_band* instead of filter_brand*. 
Reported-by: Stephane Eranian Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1343024611-4692-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 19faffc6088..a5de59f1a3f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -33,10 +33,10 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); /* Sandy Bridge-EP uncore support */ static struct intel_uncore_type snbep_uncore_cbox; @@ -272,10 +272,10 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { &format_attr_thresh5.attr, &format_attr_occ_invert.attr, &format_attr_occ_edge.attr, - &format_attr_filter_brand0.attr, - &format_attr_filter_brand1.attr, - &format_attr_filter_brand2.attr, - &format_attr_filter_brand3.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + 
&format_attr_filter_band3.attr, NULL, }; -- cgit v1.2.3-70-g09d2 From 254298c726b93bb8ed92774b4a209b479851fa6d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 5 Jul 2012 14:32:17 +0800 Subject: perf/x86: Add Intel Nehalem-EX uncore support The uncore subsystem in Nehalem-EX consists of 7 components (U-Box, C-Box, B-Box, S-Box, R-Box, M-Box and W-Box). This patch is large because the way to program these boxes is diverse. Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FF534F1.3030307@intel.com [ Improved the code. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 1484 +++++++++++++++++++++---- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 203 +++- 2 files changed, 1455 insertions(+), 232 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index a5de59f1a3f..d9981701bdc 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -38,6 +38,77 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 count; + + rdmsrl(event->hw.event_base, count); + + return count; +} + +/* + * generic get constraint function for shared match/mask registers. 
+ */ +static struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + unsigned long flags; + bool ok = false; + + /* + * reg->alloc can be set due to existing state, so for fake box we + * need to ignore this, otherwise we might fail to allocate proper + * fake state for this extra reg constraint. + */ + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || + (er->config1 == reg1->config && er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (!uncore_box_is_fake(box)) + reg1->alloc = 1; + return NULL; + } + + return &constraint_empty; +} + +static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + /* + * Only put constraint if extra reg was actually allocated. Also + * takes care of event which do not use an extra shared reg. + * + * Also, if this is a fake box we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent box + * state either since it will be thrown out. 
+ */ + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + /* Sandy Bridge-EP uncore support */ static struct intel_uncore_type snbep_uncore_cbox; static struct intel_uncore_type snbep_uncore_pcu; @@ -64,18 +135,15 @@ static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) pci_write_config_dword(pdev, box_ctl, config); } -static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, - struct perf_event *event) +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; - pci_write_config_dword(pdev, hwc->config_base, hwc->config | - SNBEP_PMON_CTL_EN); + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } -static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, - struct perf_event *event) +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; @@ -83,8 +151,7 @@ static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, pci_write_config_dword(pdev, hwc->config_base, hwc->config); } -static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, - struct perf_event *event) +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; @@ -92,14 +159,15 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + return count; } static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, - 
SNBEP_PMON_BOX_CTL_INT); + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); } static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) @@ -112,7 +180,6 @@ static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) rdmsrl(msr, config); config |= SNBEP_PMON_BOX_CTL_FRZ; wrmsrl(msr, config); - return; } } @@ -126,12 +193,10 @@ static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) rdmsrl(msr, config); config &= ~SNBEP_PMON_BOX_CTL_FRZ; wrmsrl(msr, config); - return; } } -static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, - struct perf_event *event) +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; @@ -150,68 +215,15 @@ static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, wrmsrl(hwc->config_base, hwc->config); } -static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 count; - - rdmsrl(hwc->event_base, count); - return count; -} - static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) { unsigned msr = uncore_msr_box_ctl(box); + if (msr) wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); } -static struct event_constraint * -snbep_uncore_get_constraint(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - unsigned long flags; - bool ok = false; - - if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc)) - return NULL; - - er = &box->shared_regs[reg1->idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || er->config1 == reg1->config) { - atomic_inc(&er->ref); - er->config1 = reg1->config; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (ok) { - if (box->phys_id 
>= 0) - reg1->alloc = 1; - return NULL; - } - return &constraint_empty; -} - -static void snbep_uncore_put_constraint(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - - if (box->phys_id < 0 || !reg1->alloc) - return; - - er = &box->shared_regs[reg1->idx]; - atomic_dec(&er->ref); - reg1->alloc = 0; -} - -static int snbep_uncore_hw_config(struct intel_uncore_box *box, - struct perf_event *event) +static int snbep_uncore_hw_config(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; @@ -221,14 +233,16 @@ static int snbep_uncore_hw_config(struct intel_uncore_box *box, SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; reg1->config = event->attr.config1 & SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; - } else if (box->pmu->type == &snbep_uncore_pcu) { - reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; - reg1->config = event->attr.config1 & - SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; } else { - return 0; + if (box->pmu->type == &snbep_uncore_pcu) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->config = event->attr.config1 & SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; + } else { + return 0; + } } reg1->idx = 0; + return 0; } @@ -320,9 +334,9 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = { .enable_box = snbep_uncore_msr_enable_box, .disable_event = snbep_uncore_msr_disable_event, .enable_event = snbep_uncore_msr_enable_event, - .read_counter = snbep_uncore_msr_read_counter, - .get_constraint = snbep_uncore_get_constraint, - .put_constraint = snbep_uncore_put_constraint, + .read_counter = uncore_msr_read_counter, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, .hw_config = snbep_uncore_hw_config, }; @@ -589,188 +603,1208 @@ static void snbep_pci2phy_map_init(void) /* get the Node ID mapping */ pci_read_config_dword(ubox_dev, 0x54, &config); /* - * every 
three bits in the Node ID mapping register maps - * to a particular node. + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + }; + return; +} +/* end of Sandy Bridge-EP uncore support */ + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + 
.fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + 
INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + +/* Nehalem-EX uncore support */ +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void 
nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group 
+}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 8, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_C_MSR_OFFSET, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = "wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 
0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. + */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = 
NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) { + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + } else { + reg1->config = ~0ULL; + reg2->config = ~0ULL; + } + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + + reg1->idx = 0; + + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + wrmsrl(reg1->reg, 0); + if (reg1->config != ~0ULL || reg2->config != ~0ULL) { + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_mm_cfg.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = 
uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = &nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + 
ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. + */ + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (WARN_ON_ONCE(idx >= 4)) + return false; + + /* mask of the shared fields */ + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + + raw_spin_lock_irqsave(&er->lock, flags); + /* add mask of the non-shared field if it's in use */ + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + + if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + er->config &= ~mask; + er->config |= (config & mask); + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; +} + +static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + atomic_dec(&er->ref); + return; + } + + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + atomic_sub(1 << (idx * 8), &er->ref); +} + +u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 config = reg1->config; + + /* get the non-shared control bits and shift them */ + idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (new_idx > orig_idx) { + idx = new_idx - orig_idx; + config <<= 3 * idx; + } else { + idx = orig_idx - new_idx; + config >>= 3 * idx; + } + + /* add the shared control bits back */ + 
config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + if (modify) { + /* adjust the main event selector */ + if (new_idx > orig_idx) + hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + else + hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + reg1->config = config; + reg1->idx = ~0xff | new_idx; + } + return config; +} + +static struct event_constraint * +nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int i, idx[2], alloc = 0; + u64 config1 = reg1->config; + + idx[0] = __BITS_VALUE(reg1->idx, 0, 8); + idx[1] = __BITS_VALUE(reg1->idx, 1, 8); +again: + for (i = 0; i < 2; i++) { + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + idx[i] = 0xff; + + if (idx[i] == 0xff) + continue; + + if (!nhmex_mbox_get_shared_reg(box, idx[i], + __BITS_VALUE(config1, i, 32))) + goto fail; + alloc |= (0x1 << i); + } + + /* for the match/mask registers */ + if ((uncore_box_is_fake(box) || !reg2->alloc) && + !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) + goto fail; + + /* + * If it's a fake box -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + */ + if (!uncore_box_is_fake(box)) { + if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) + nhmex_mbox_alter_er(event, idx[0], true); + reg1->alloc |= alloc; + reg2->alloc = 1; + } + return NULL; +fail: + if (idx[0] != 0xff && !(alloc & 0x1) && + idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + /* + * events 0xd ~ 0x10 are functional identical, but are + * controlled by different fields in the ZDP_CTL_FVC + * register. If we failed to take one field, try the + * rest 3 choices. 
*/ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - pcibus_to_physid[bus] = i; - break; - } + BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); + idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + idx[0] = (idx[0] + 1) % 4; + idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { + config1 = nhmex_mbox_alter_er(event, idx[0], false); + goto again; } - }; - return; -} -/* end of Sandy Bridge-EP uncore support */ + } + if (alloc & 0x1) + nhmex_mbox_put_shared_reg(box, idx[0]); + if (alloc & 0x2) + nhmex_mbox_put_shared_reg(box, idx[1]); + return &constraint_empty; +} -/* Sandy Bridge uncore support */ -static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, - struct perf_event *event) +static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); + if (uncore_box_is_fake(box)) + return; + + if (reg1->alloc & 0x1) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); + if (reg1->alloc & 0x2) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); + reg1->alloc = 0; + + if (reg2->alloc) { + nhmex_mbox_put_shared_reg(box, reg2->idx); + reg2->alloc = 0; + } } -static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, - struct perf_event *event) +static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) { - wrmsrl(event->hw.config_base, 0); + if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return er->idx; + return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; } -static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box, - struct perf_event *event) +static int nhmex_mbox_hw_config(struct intel_uncore_box *box, 
struct perf_event *event) { - u64 count; - rdmsrl(event->hw.event_base, count); - return count; + struct intel_uncore_type *type = box->pmu->type; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct extra_reg *er; + unsigned msr; + int reg_idx = 0; + + if (WARN_ON_ONCE(reg1->idx != -1)) + return -EINVAL; + /* + * The mbox events may require 2 extra MSRs at the most. But only + * the lower 32 bits in these MSRs are significant, so we can use + * config1 to pass two MSRs' config. + */ + for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) || + er->idx == __BITS_VALUE(reg1->idx, 1, 8)) + continue; + if (WARN_ON_ONCE(reg_idx >= 2)) + return -EINVAL; + + msr = er->msr + type->msr_offset * box->pmu->pmu_idx; + if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) + return -EINVAL; + + /* always use the 32~63 bits to pass the PLD config */ + if (er->idx == EXTRA_REG_NHMEX_M_PLD) + reg_idx = 1; + + reg1->idx &= ~(0xff << (reg_idx * 8)); + reg1->reg &= ~(0xffff << (reg_idx * 16)); + reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); + reg1->reg |= msr << (reg_idx * 16); + reg1->config = event->attr.config1; + reg_idx++; + } + /* use config2 to pass the filter config */ + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + + return 0; } -static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) { - if (box->pmu->pmu_idx == 0) { - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, - SNB_UNC_GLOBAL_CTL_EN | 
SNB_UNC_GLOBAL_CTL_CORE_ALL); - } + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + return config; } -static struct attribute *snb_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask5.attr, +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, 
"config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, NULL, }; -static struct attribute_group snb_uncore_format_group = { - .name = "format", - .attrs = snb_uncore_formats_attr, +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, }; -static struct intel_uncore_ops snb_uncore_msr_ops = { - .init_box = snb_uncore_msr_init_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = snb_uncore_msr_enable_event, - .read_counter = snb_uncore_msr_read_counter, +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, }; -static struct event_constraint snb_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x80, 0x1), - UNCORE_EVENT_CONSTRAINT(0x83, 0x1), - EVENT_CONSTRAINT_END +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = 
nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, }; -static struct intel_uncore_type snb_uncore_cbox = { - .name = "cbox", - .num_counters = 2, - .num_boxes = 4, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, - .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, - .fixed_ctr = SNB_UNC_FIXED_CTR, - .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, - .single_fixed = 1, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .constraints = snb_uncore_cbox_constraints, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, }; -static struct intel_uncore_type *snb_msr_uncores[] = { - &snb_uncore_cbox, - NULL, -}; -/* end of Sandy Bridge uncore support */ +void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int port; -/* Nehalem uncore support */ -static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) + /* adjust the main event selector */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust address or config of extra register */ + port = reg1->idx / 6 + box->pmu->pmu_idx * 4; + switch (reg1->idx % 6) { + case 0: + reg1->reg = 
NHMEX_R_MSR_PORTN_IPERF_CFG0(port); + break; + case 1: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); + break; + case 2: + /* the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + case 4: + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); + break; + case 5: + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); + break; + }; +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) { - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. 
+ */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functional identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + if (idx % 2) + idx--; + else + idx++; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &constraint_empty; } -static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) { - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, - NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; } -static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, - struct perf_event *event) +static int nhmex_rbox_hw_config(struct 
intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int port, idx; - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + port = idx / 6 + box->pmu->pmu_idx * 4; + idx %= 6; + switch (idx) { + case 0: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); + break; + case 1: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); + break; + case 2: + case 3: + reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port); + break; + case 4: + case 5: + if (idx == 4) + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); + else + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); + reg2->config = event->attr.config2; + hwc->config |= event->attr.config & (~0ULL << 32); + break; + }; + return 0; } -static struct attribute *nhm_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask8.attr, +static u64 nhmex_rbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + er = &box->shared_regs[idx]; + + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + + return config; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, er_idx; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + 
er_idx += (reg1->idx / 6) * 5; + + switch (idx) { + case 0: + case 1: + wrmsrl(reg1->reg, reg1->config); + break; + case 2: + case 3: + wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx)); + break; + case 4: + case 5: + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, hwc->config >> 32); + wrmsrl(reg1->reg + 2, reg2->config); + break; + }; + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, NULL, }; -static struct attribute_group nhm_uncore_format_group = { +static struct attribute_group nhmex_uncore_rbox_format_group = { .name = "format", - .attrs = nhm_uncore_formats_attr, + .attrs = nhmex_uncore_rbox_formats_attr, }; -static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, 
"event=0x20,umask=0x20"), +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), { /* end: all zeroes */ }, }; -static struct intel_uncore_ops nhm_uncore_msr_ops = { - .disable_box = nhm_uncore_msr_disable_box, - .enable_box = nhm_uncore_msr_enable_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = nhm_uncore_msr_enable_event, - .read_counter = snb_uncore_msr_read_counter, +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, }; -static struct intel_uncore_type nhm_uncore = { - .name = "", - .num_counters = 8, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .event_ctl = NHM_UNC_PERFEVTSEL0, - .perf_ctr = NHM_UNC_UNCORE_PMC0, - .fixed_ctr = NHM_UNC_FIXED_CTR, - .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, - .event_mask = NHM_UNC_RAW_EVENT_MASK, - .event_descs = nhm_uncore_events, - .ops = &nhm_uncore_msr_ops, - .format_group = &nhm_uncore_format_group, +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + 
.ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group }; -static struct intel_uncore_type *nhm_msr_uncores[] = { - &nhm_uncore, +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, NULL, }; -/* end of Nehalem uncore support */ +/* end of Nehalem-EX uncore support */ -static void uncore_assign_hw_event(struct intel_uncore_box *box, - struct perf_event *event, int idx) +static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) { struct hw_perf_event *hwc = &event->hw; @@ -787,8 +1821,7 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box, hwc->event_base = uncore_perf_ctr(box, hwc->idx); } -static void uncore_perf_event_update(struct intel_uncore_box *box, - struct perf_event *event) +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) { u64 prev_count, new_count, delta; int shift; @@ -858,14 +1891,12 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, - int cpu) +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu) { struct intel_uncore_box *box; int i, size; - size = sizeof(*box) + type->num_shared_regs * - sizeof(struct intel_uncore_extra_reg); + size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); if (!box) @@ -915,12 +1946,11 @@ static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) * perf core schedules event on the basis of cpu, uncore events are * collected by one of the cpus inside a physical package. 
*/ - return uncore_pmu_to_box(uncore_event_to_pmu(event), - smp_processor_id()); + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); } -static int uncore_collect_events(struct intel_uncore_box *box, - struct perf_event *leader, bool dogrp) +static int +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) { struct perf_event *event; int n, max_count; @@ -952,8 +1982,7 @@ static int uncore_collect_events(struct intel_uncore_box *box, } static struct event_constraint * -uncore_get_event_constraint(struct intel_uncore_box *box, - struct perf_event *event) +uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) { struct intel_uncore_type *type = box->pmu->type; struct event_constraint *c; @@ -977,15 +2006,13 @@ uncore_get_event_constraint(struct intel_uncore_box *box, return &type->unconstrainted; } -static void uncore_put_event_constraint(struct intel_uncore_box *box, - struct perf_event *event) +static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) { if (box->pmu->type->ops->put_constraint) box->pmu->type->ops->put_constraint(box, event); } -static int uncore_assign_events(struct intel_uncore_box *box, - int assign[], int n) +static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) { unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; @@ -1407,8 +2434,7 @@ static bool pcidrv_registered; /* * add a pci uncore device */ -static int __devinit uncore_pci_add(struct intel_uncore_type *type, - struct pci_dev *pdev) +static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) { struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; @@ -1485,6 +2511,7 @@ static int __devinit uncore_pci_probe(struct pci_dev *pdev, struct intel_uncore_type *type; type = (struct intel_uncore_type *)id->driver_data; + return 
uncore_pci_add(type, pdev); } @@ -1612,8 +2639,8 @@ static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) return 0; } -static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores, - int old_cpu, int new_cpu) +static void __cpuinit +uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; @@ -1694,8 +2721,8 @@ static void __cpuinit uncore_event_init_cpu(int cpu) uncore_change_context(pci_uncores, -1, cpu); } -static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) +static int + __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -1732,12 +2759,12 @@ static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, } static struct notifier_block uncore_cpu_nb __cpuinitdata = { - .notifier_call = uncore_cpu_notifier, + .notifier_call = uncore_cpu_notifier, /* * to migrate uncore events, our notifier should be executed * before perf core's notifier. 
*/ - .priority = CPU_PRI_PERF + 1, + .priority = CPU_PRI_PERF + 1, }; static void __init uncore_cpu_setup(void *dummy) @@ -1767,6 +2794,9 @@ static int __init uncore_cpu_init(void) snbep_uncore_cbox.num_boxes = max_cores; msr_uncores = snbep_msr_uncores; break; + case 46: + msr_uncores = nhmex_msr_uncores; + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index b13e9ea81de..47b1776a858 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -5,8 +5,6 @@ #include "perf_event.h" #define UNCORE_PMU_NAME_LEN 32 -#define UNCORE_BOX_HASH_SIZE 8 - #define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) #define UNCORE_FIXED_EVENT 0xff @@ -158,6 +156,193 @@ #define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc #define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define 
NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 +#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define 
NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + + +#define NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK 0x1f +#define NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK (0x7 << 5) +#define NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK (0x7 << 8) +#define NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR (1 << 23) +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK \ + (NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK | \ + NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK | \ + NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK | \ + NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. 
+ */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 
0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; @@ -178,6 +363,7 @@ struct intel_uncore_type { unsigned msr_offset; unsigned num_shared_regs:8; unsigned single_fixed:1; + unsigned pair_ctr_ctl:1; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -213,7 +399,7 @@ struct intel_uncore_pmu { struct intel_uncore_extra_reg { raw_spinlock_t lock; - u64 config1; + u64 config, config1, config2; atomic_t ref; }; @@ -323,14 +509,16 @@ unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) static inline unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) { - return idx + box->pmu->type->event_ctl + + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 
2 * idx : idx) + box->pmu->type->msr_offset * box->pmu->pmu_idx; } static inline unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) { - return idx + box->pmu->type->perf_ctr + + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + box->pmu->type->msr_offset * box->pmu->pmu_idx; } @@ -422,3 +610,8 @@ static inline void uncore_box_init(struct intel_uncore_box *box) box->pmu->type->ops->init_box(box); } } + +static inline bool uncore_box_is_fake(struct intel_uncore_box *box) +{ + return (box->phys_id < 0); +} -- cgit v1.2.3-70-g09d2 From 74e6543fdc4e7553f572f7898ade649a09d85049 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 17 Jul 2012 17:27:55 +0800 Subject: perf/x86: Fix LLC-* and node-* events on Intel SandyBridge LLC-* and node-* events require using the OFFCORE_RESPONSE events on SandyBridge, but the hw_cache_extra_regs is left uninitialized. This patch adds the missing extra register configure table for SandyBridge. Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1342517275-2875-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 92 +++++++++++++++++++++++++++++++--- 1 file changed, 86 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7a8b9d0abca..382366977d4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -138,6 +138,84 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +#define SNB_DMND_DATA_RD (1ULL << 0) +#define SNB_DMND_RFO (1ULL << 1) +#define SNB_DMND_IFETCH (1ULL << 2) +#define SNB_DMND_WB (1ULL << 3) +#define SNB_PF_DATA_RD (1ULL << 4) +#define SNB_PF_RFO (1ULL << 5) +#define SNB_PF_IFETCH (1ULL << 6) +#define SNB_LLC_DATA_RD (1ULL << 7) +#define SNB_LLC_RFO (1ULL << 8) +#define SNB_LLC_IFETCH (1ULL << 9) 
+#define SNB_BUS_LOCKS (1ULL << 10) +#define SNB_STRM_ST (1ULL << 11) +#define SNB_OTHER (1ULL << 15) +#define SNB_RESP_ANY (1ULL << 16) +#define SNB_NO_SUPP (1ULL << 17) +#define SNB_LLC_HITM (1ULL << 18) +#define SNB_LLC_HITE (1ULL << 19) +#define SNB_LLC_HITS (1ULL << 20) +#define SNB_LLC_HITF (1ULL << 21) +#define SNB_LOCAL (1ULL << 22) +#define SNB_REMOTE (0xffULL << 23) +#define SNB_SNP_NONE (1ULL << 31) +#define SNB_SNP_NOT_NEEDED (1ULL << 32) +#define SNB_SNP_MISS (1ULL << 33) +#define SNB_NO_FWD (1ULL << 34) +#define SNB_SNP_FWD (1ULL << 35) +#define SNB_HITM (1ULL << 36) +#define SNB_NON_DRAM (1ULL << 37) + +#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) +#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) +#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ + SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ + SNB_HITM) + +#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) +#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) + +#define SNB_L3_ACCESS SNB_RESP_ANY +#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) + +static __initconst const u64 snb_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ 
C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, + }, + }, +}; + static __initconst const u64 snb_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -235,16 +313,16 @@ static __initconst const u64 snb_hw_cache_event_ids }, [ C(NODE) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, }, }, @@ -1964,6 +2042,8 @@ __init int intel_pmu_init(void) case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_snb(); -- cgit v1.2.3-70-g09d2 From 597ed953d7db28528b4687e46388c1aa905c14bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Jul 2012 13:50:23 +0200 Subject: perf/x86: Make bitfield unsigned Fix: arch/x86/kernel/cpu/perf_event.h:377:43: sparse: dubious one-bit signed bitfield Cc: Borislav Petkov Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-2jxkmktkppkclj1qe6qxd7ah@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index a15df4be151..821d53b696d 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -374,7 +374,7 @@ struct x86_pmu { /* * Intel DebugStore bits */ - int bts :1, + unsigned int bts :1, bts_active :1, pebs :1, pebs_active :1, -- cgit v1.2.3-70-g09d2 From 
c1ece48cf7ec07c6b3e093a4036b54bc6078f782 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 24 Jul 2012 10:44:10 +0800 Subject: perf/x86: Fix format definition of SNB-EP uncore QPI box The event control register of SNB-EP uncore QPI box has a one bit extension at bit position 21. Reported-by: Stephane Eranian Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1343097850-4348-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 22 +++++++++++++++++++++- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 4 ++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index d9981701bdc..7563fda9f03 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -18,6 +18,7 @@ static struct event_constraint constraint_empty = EVENT_CONSTRAINT(0, 0, 0); DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); @@ -293,6 +294,15 @@ static struct attribute *snbep_uncore_pcu_formats_attr[] = { NULL, }; +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + static struct uncore_event_desc snbep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), @@ -328,6 +338,11 @@ static struct attribute_group snbep_uncore_pcu_format_group = { .attrs = snbep_uncore_pcu_formats_attr, }; +static struct attribute_group snbep_uncore_qpi_format_group 
= { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + static struct intel_uncore_ops snbep_uncore_msr_ops = { .init_box = snbep_uncore_msr_init_box, .disable_box = snbep_uncore_msr_disable_box, @@ -499,8 +514,13 @@ static struct intel_uncore_type snbep_uncore_qpi = { .num_counters = 4, .num_boxes = 2, .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &snbep_uncore_pci_ops, .event_descs = snbep_uncore_qpi_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), + .format_group = &snbep_uncore_qpi_format_group, }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 47b1776a858..f3851892e07 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -113,6 +113,10 @@ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + /* SNB-EP pci control register */ #define SNBEP_PCI_PMON_BOX_CTL 0xf4 #define SNBEP_PCI_PMON_CTL0 0xd8 -- cgit v1.2.3-70-g09d2 From 1d44b30f35a9873a65b320dd5300088fa995fd94 Mon Sep 17 00:00:00 2001 From: Tomoki Sekiyama Date: Thu, 26 Jul 2012 19:47:32 +0900 Subject: x86/ioapic: Fix NULL pointer dereference on CPU hotplug after disabling irqs In the current kernel, percpu variable `vector_irq' is not always cleared when a CPU is offlined. If the CPU that has the disabled irqs in vector_irq is hotplugged again, __setup_vector_irq() hits invalid irq vector and may crash. 
This bug can be reproduced as following; # echo 0 > /sys/devices/system/cpu/cpu7/online # modprobe -r some_driver_using_interrupts # vector_irq@cpu7 uncleared # echo 1 > /sys/devices/system/cpu/cpu7/online # kernel may crash To fix this problem, this patch clears vector_irq in __fixup_irqs() when the CPU is offlined. This also reverts commit f6175f5bfb4c, which partially fixes this bug by clearing vector in __clear_irq_vector(). But in environments with IOMMU IRQ remapper, it could fail because cfg->domain doesn't contain offlined CPUs. With this patch, the fix in __clear_irq_vector() can be reverted because every vector_irq is already cleared in __fixup_irqs() on offlined CPUs. Signed-off-by: Tomoki Sekiyama Acked-by: Suresh Siddha Cc: yrl.pp-manager.tt@hitachi.com Cc: Yinghai Lu Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/20120726104732.2889.19144.stgit@kvmdev Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/irq.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 406eee78468..a6c64aaddf9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1204,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) BUG_ON(!cfg->vector); vector = cfg->vector; - for_each_cpu(cpu, cfg->domain) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; @@ -1212,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; - for_each_cpu(cpu, cfg->old_domain) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 1f5f1d5d2a0..7ad683d7864 100644 --- a/arch/x86/kernel/irq.c +++ 
b/arch/x86/kernel/irq.c @@ -328,6 +328,7 @@ void fixup_irqs(void) chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } + __this_cpu_write(vector_irq[vector], -1); } } #endif -- cgit v1.2.3-70-g09d2 From 35d56ca9d401d9d0ac8d91e4db1485af5f38f6fd Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Tue, 17 Jul 2012 10:14:41 +0800 Subject: perf/x86: Fix missing struct before structure name When CONFIG_PERF_EVENTS is disabled, there is a compilation error because the struct keyword is missing before the structure name. Signed-off-by: Jovi Zhang Cc: Peter Zijlstra Cc: Jiri Kosina Link: http://lkml.kernel.org/r/CACV3sbKF%3DCX%2B2jWEWesfCA6rBoQ3wDM4-5ac9MuBtVbCtMRHdQ@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index c78f14a0df0..dab39350e51 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -234,7 +234,7 @@ extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); #else -static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { *nr = 0; return NULL; -- cgit v1.2.3-70-g09d2 From 736edce5f395b8309a61aa62c36c4356abc83219 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 19 Jul 2012 11:21:53 -0700 Subject: x86/mce: Move MCACOD defines from mce-severity.c to <asm/mce.h> We will need some of these values in mce.c. Move them to the appropriate header file so they are available.
Acked-by: Borislav Petkov Signed-off-by: Tony Luck Cc: Chen Gong Cc: Huang Ying Cc: Hidetoshi Seto Link: http://lkml.kernel.org/r/0ccfb1af5fe35e537b7cd8e4d448bf7d851dbfb9.1343078495.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 8 ++++++++ arch/x86/kernel/cpu/mcheck/mce-severity.c | 7 ------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 441520e4174..a3ac52b29cb 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -33,6 +33,14 @@ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ +#define MCACOD 0xffff /* MCA Error Code */ + +/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ +#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ +#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ +#define MCACOD_DATA 0x0134 /* Data Load */ +#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ /* MCi_MISC register defines */ #define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 413c2ced887..13017626f9a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -55,13 +55,6 @@ static struct severity { #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) -#define MCACOD 0xffff -/* Architecturally defined codes from SDM Vol. 
3B Chapter 15 */ -#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ -#define MCACOD_SCRUBMSK 0xfff0 -#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ -#define MCACOD_DATA 0x0134 /* Data Load */ -#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ MCESEV( NO, "Invalid", -- cgit v1.2.3-70-g09d2 From 61b0fccd7f114573f973dfe25d864608822dc09e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 19 Jul 2012 11:28:46 -0700 Subject: x86/mce: Add quirk for instruction recovery on Sandy Bridge processors Sandy Bridge processors follow the SDM (Vol 3B, Table 15-20) and set both the RIPV and EIPV bits in the MCG_STATUS register to zero for machine checks during instruction fetch. This is more than a little counter-intuitive and means that Linux cannot recover from these errors. Rather than insert special case code at several places in mce.c and mce-severity.c, we pretend the EIPV bit was set for just this case early in processing the machine check. Acked-by: Borislav Petkov Signed-off-by: Tony Luck Cc: Chen Gong Cc: Huang Ying Cc: Hidetoshi Seto Link: http://lkml.kernel.org/r/180a06f3f357cf9f78259ae443a082b14a29535b.1343078495.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 43 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a5a5dc1ff1..8cf60e29790 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -105,6 +105,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { static DEFINE_PER_CPU(struct work_struct, mce_work); +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. @@ -652,14 +654,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. 
* This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) { int i, ret = 0; for (i = 0; i < banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); - if (m->status & MCI_STATUS_VAL) + if (m->status & MCI_STATUS_VAL) { __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); + } if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) ret = 1; } @@ -1042,7 +1048,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) *final = m; memset(valid_banks, 0, sizeof(valid_banks)); - no_way_out = mce_no_way_out(&m, &msg, valid_banks); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); @@ -1418,6 +1424,34 @@ static void __mcheck_cpu_init_generic(void) } } +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. 
+ */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + /* Add per CPU specific workarounds here */ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { @@ -1515,6 +1549,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) mce_bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; } if (monarch_timeout < 0) monarch_timeout = 0; -- cgit v1.2.3-70-g09d2 From 41fb433b63cc4745f4918fdaf295763da5ca826c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 14 Jul 2012 18:43:08 +0200 Subject: arch/x86/kernel/kdebugfs.c: Ensure a consistent return value in error case Typically, the return value desired for the failure of a function with an integer return value is a negative integer. In these cases, the return value is sometimes a negative integer and sometimes 0, due to a subsequent initialization of the return variable within the loop. A simplified version of the semantic match that finds this problem is: (http://coccinelle.lip6.fr/) // @r exists@ identifier ret; position p; constant C; expression e1,e3,e4; statement S; @@ ret = -C ... when != ret = e3 when any if@p (...) S ... when any if (\(ret != 0\|ret < 0\|ret > 0\) || ...) { ... return ...; } ... when != ret = e3 when any *if@p (...) { ... 
when != ret = e4 return ret; } // Signed-off-by: Julia Lawall Link: http://lkml.kernel.org/r/1342284188-19176-7-git-send-email-Julia.Lawall@lip6.fr Signed-off-by: Ingo Molnar --- arch/x86/kernel/kdebugfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 1d5d31ea686..dc1404bf8e4 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -107,7 +107,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) { struct setup_data_node *node; struct setup_data *data; - int error = -ENOMEM; + int error; struct dentry *d; struct page *pg; u64 pa_data; @@ -121,8 +121,10 @@ static int __init create_setup_data_nodes(struct dentry *parent) while (pa_data) { node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) + if (!node) { + error = -ENOMEM; goto err_dir; + } pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); if (PageHighMem(pg)) { -- cgit v1.2.3-70-g09d2 From 7463449b8287162454d7e00bf7fd2c64f72c1dc8 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 30 Jul 2012 14:41:09 -0700 Subject: atomic64_test: simplify the #ifdef for atomic64_dec_if_positive() test Introduce CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE and use this instead of the multitude of #if defined() checks in atomic64_test.c Signed-off-by: Catalin Marinas Cc: Russell King Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/x86/Kconfig | 1 + lib/Kconfig | 3 +++ lib/atomic64_test.c | 5 ++--- 7 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fbdd8533c05..4172c3cea22 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -11,6 +11,7 @@ config ARM select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select 
GENERIC_ATOMIC64 if (CPU_V6 || !CPU_32v6K || !AEABI) + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KGDB diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b3e10fdd389..5e238d03960 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -20,6 +20,7 @@ config MIPS select ARCH_BINFMT_ELF_RANDOMIZE_PIE select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_GENERIC_HARDIRQS diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9a5d3cdc3e1..4f681b78dd8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -115,6 +115,7 @@ config PPC select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a39b4690c17..d0a5e92b6b9 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -85,6 +85,7 @@ config S390 select HAVE_ARCH_MUTEX_CPU_RELAX select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 select ARCH_SAVE_PAGE_KEYS if HIBERNATION + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_CMPXCHG_LOCAL diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c70684f859e..ca4fdefe79e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -70,6 +70,7 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT select GENERIC_IRQ_PROBE diff --git a/lib/Kconfig b/lib/Kconfig index 8269d56dcda..72c1d469306 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -340,6 +340,9 @@ config NLATTR config GENERIC_ATOMIC64 bool +config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + def_bool y if 
GENERIC_ATOMIC64 + config LRU_CACHE tristate diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index cb99b91c3a1..00bca223d1e 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -114,8 +114,7 @@ static __init int test_atomic64(void) r += one; BUG_ON(v.counter != r); -#if defined(CONFIG_X86) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || \ - defined(CONFIG_S390) || defined(_ASM_GENERIC_ATOMIC64_H) || defined(CONFIG_ARM) +#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE INIT(onestwos); BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1)); r -= one; @@ -129,7 +128,7 @@ static __init int test_atomic64(void) BUG_ON(atomic64_dec_if_positive(&v) != (-one - one)); BUG_ON(v.counter != r); #else -#warning Please implement atomic64_dec_if_positive for your architecture, and add it to the IF above +#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol #endif INIT(onestwos); -- cgit v1.2.3-70-g09d2 From 4ed940d4c34c21a1a356969a923f2815d608e0bf Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 30 Jul 2012 14:41:13 -0700 Subject: firmware_map: make firmware_map_add_early() argument consistent with firmware_map_add_hotplug() There are two ways to create /sys/firmware/memmap/X sysfs: - firmware_map_add_early When the system starts, it is called from e820_reserve_resources() - firmware_map_add_hotplug When the memory is hot plugged, it is called from add_memory() But these functions are called with inconsistent values for the end argument, as shown below: - end argument of firmware_map_add_early() : start + size - 1 - end argument of firmware_map_add_hotplug() : start + size The patch unifies them to "start + size". Even with the patch applied, the content of the /sys/firmware/memmap/X/end file does not change. [akpm@linux-foundation.org: clarify comments] Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Dave Hansen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H.
Peter Anvin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 2 +- drivers/firmware/memmap.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 41857970517..ed858e9e9a7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -944,7 +944,7 @@ void __init e820_reserve_resources(void) for (i = 0; i < e820_saved.nr_map; i++) { struct e820entry *entry = &e820_saved.map[i]; firmware_map_add_early(entry->addr, - entry->addr + entry->size - 1, + entry->addr + entry->size, e820_type_to_string(entry->type)); } } diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index adc07102a20..c1cdc923666 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -98,7 +98,7 @@ static LIST_HEAD(map_entries); /** * firmware_map_add_entry() - Does the real work to add a firmware memmap entry. * @start: Start of the memory range. - * @end: End of the memory range (inclusive). + * @end: End of the memory range (exclusive). * @type: Type of the memory range. * @entry: Pre-allocated (either kmalloc() or bootmem allocator), uninitialised * entry. @@ -113,7 +113,7 @@ static int firmware_map_add_entry(u64 start, u64 end, BUG_ON(start > end); entry->start = start; - entry->end = end; + entry->end = end - 1; entry->type = type; INIT_LIST_HEAD(&entry->list); kobject_init(&entry->kobj, &memmap_ktype); @@ -148,7 +148,7 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry) * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do * memory hotplug. * @start: Start of the memory range. - * @end: End of the memory range (inclusive). + * @end: End of the memory range (exclusive) * @type: Type of the memory range. * * Adds a firmware mapping entry. 
This function is for memory hotplug, it is @@ -175,7 +175,7 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type) /** * firmware_map_add_early() - Adds a firmware mapping entry. * @start: Start of the memory range. - * @end: End of the memory range (inclusive). + * @end: End of the memory range. * @type: Type of the memory range. * * Adds a firmware mapping entry. This function uses the bootmem allocator -- cgit v1.2.3-70-g09d2 From c1d7e01d7877a397655277a920aeaa3830ed9461 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 30 Jul 2012 14:42:46 -0700 Subject: ipc: use Kconfig options for __ARCH_WANT_[COMPAT_]IPC_PARSE_VERSION Rather than #define the options manually in the architecture code, add Kconfig options for them and select them there instead. This also allows us to select the compat IPC version parsing automatically for platforms using the old compat IPC interface. Reported-by: Andrew Morton Signed-off-by: Will Deacon Cc: Arnd Bergmann Cc: Chris Metcalf Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/alpha/Kconfig | 1 + arch/alpha/include/asm/unistd.h | 1 - arch/arm/Kconfig | 1 + arch/arm/include/asm/unistd.h | 1 - arch/avr32/Kconfig | 1 + arch/avr32/include/asm/unistd.h | 1 - arch/blackfin/Kconfig | 1 + arch/blackfin/include/asm/unistd.h | 1 - arch/cris/Kconfig | 1 + arch/cris/include/asm/unistd.h | 1 - arch/frv/Kconfig | 1 + arch/frv/include/asm/unistd.h | 1 - arch/h8300/Kconfig | 1 + arch/h8300/include/asm/unistd.h | 1 - arch/m32r/Kconfig | 1 + arch/m32r/include/asm/unistd.h | 1 - arch/m68k/Kconfig | 1 + arch/m68k/include/asm/unistd.h | 1 - arch/microblaze/Kconfig | 1 + arch/microblaze/include/asm/unistd.h | 1 - arch/mips/Kconfig | 1 + arch/mips/include/asm/unistd.h | 1 - arch/mn10300/Kconfig | 1 + arch/mn10300/include/asm/unistd.h | 1 - arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/unistd.h | 1 - arch/s390/Kconfig | 1 + arch/s390/include/asm/unistd.h | 1 - 
arch/sh/Kconfig | 2 ++ arch/sh/include/asm/unistd.h | 1 - arch/sparc/Kconfig | 1 + arch/sparc/include/asm/unistd.h | 1 - arch/x86/Kconfig | 1 + arch/x86/include/asm/unistd.h | 1 - include/linux/compat.h | 1 - ipc/compat.c | 2 +- ipc/util.c | 4 ++-- ipc/util.h | 2 +- 39 files changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 8c3d957fa8e..72f2fa189cc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -248,7 +248,14 @@ config HAVE_CMPXCHG_LOCAL config HAVE_CMPXCHG_DOUBLE bool +config ARCH_WANT_IPC_PARSE_VERSION + bool + +config ARCH_WANT_COMPAT_IPC_PARSE_VERSION + bool + config ARCH_WANT_OLD_COMPAT_IPC + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION bool config HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 3de74c9f961..d5b9b5e645c 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -14,6 +14,7 @@ config ALPHA select AUTO_IRQ_AFFINITY if SMP select GENERIC_IRQ_SHOW select ARCH_WANT_OPTIONAL_GPIOLIB + select ARCH_WANT_IPC_PARSE_VERSION select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index d1f23b722df..633b23b0664 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -470,7 +470,6 @@ #define NR_SYSCALLS 504 -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4172c3cea22..5df11147be8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -39,6 +39,7 @@ config ARM select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_IRQ_PROBE + select ARCH_WANT_IPC_PARSE_VERSION select HARDIRQS_SW_RESEND select CPU_PM if (SUSPEND || CPU_IDLE) select GENERIC_PCI_IOMAP diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 512cd147345..0cab47d4a83 100644 --- 
a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -446,7 +446,6 @@ #ifdef __KERNEL__ -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 71d38c76726..5ade51c8a87 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -12,6 +12,7 @@ config AVR32 select HARDIRQS_SW_RESEND select GENERIC_IRQ_SHOW select ARCH_HAVE_CUSTOM_GPIO_H + select ARCH_WANT_IPC_PARSE_VERSION select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS help diff --git a/arch/avr32/include/asm/unistd.h b/arch/avr32/include/asm/unistd.h index f714544e556..1358e366f4b 100644 --- a/arch/avr32/include/asm/unistd.h +++ b/arch/avr32/include/asm/unistd.h @@ -318,7 +318,6 @@ /* SMP stuff */ #define __IGNORE_getcpu -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 9b765107e15..fb9fe00e51a 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -33,6 +33,7 @@ config BLACKFIN select HAVE_PERF_EVENTS select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_WANT_OPTIONAL_GPIOLIB + select ARCH_WANT_IPC_PARSE_VERSION select HAVE_GENERIC_HARDIRQS select GENERIC_ATOMIC64 select GENERIC_IRQ_PROBE diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index 3287222cba3..5b2a0748d7d 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -434,7 +434,6 @@ #define __IGNORE_getcpu #ifdef __KERNEL__ -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index bb344650a14..e92215428a3 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -42,6 +42,7 @@ config CRIS select HAVE_IDE select GENERIC_ATOMIC64 select 
HAVE_GENERIC_HARDIRQS + select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32 diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index f921b8b0f97..51873a446f8 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -347,7 +347,6 @@ #include -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT #define __ARCH_WANT_STAT64 diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index a685910d2d5..971c0a19fac 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -9,6 +9,7 @@ config FRV select GENERIC_IRQ_SHOW select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES + select ARCH_WANT_IPC_PARSE_VERSION config ZONE_DMA bool diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index a569dff7cd5..67f23a311db 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -349,7 +349,6 @@ #define NR_syscalls 338 -#define __ARCH_WANT_IPC_PARSE_VERSION /* #define __ARCH_WANT_OLD_READDIR */ #define __ARCH_WANT_OLD_STAT #define __ARCH_WANT_STAT64 diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 56e890df505..5e8a0d9a09c 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -3,6 +3,7 @@ config H8300 default y select HAVE_IDE select HAVE_GENERIC_HARDIRQS + select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES diff --git a/arch/h8300/include/asm/unistd.h b/arch/h8300/include/asm/unistd.h index 718511303b4..5cd882801d7 100644 --- a/arch/h8300/include/asm/unistd.h +++ b/arch/h8300/include/asm/unistd.h @@ -331,7 +331,6 @@ #define NR_syscalls 321 -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT #define __ARCH_WANT_STAT64 diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index b638d5bfa14..49498bbb961 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -7,6 +7,7 @@ config 
M32R select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select ARCH_WANT_IPC_PARSE_VERSION select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/m32r/include/asm/unistd.h b/arch/m32r/include/asm/unistd.h index 3e1db561aac..d5e66a48078 100644 --- a/arch/m32r/include/asm/unistd.h +++ b/arch/m32r/include/asm/unistd.h @@ -336,7 +336,6 @@ #define NR_syscalls 326 -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 14712012826..0b0f8b8c4a2 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -10,6 +10,7 @@ config M68K select GENERIC_STRNCPY_FROM_USER if MMU select GENERIC_STRNLEN_USER if MMU select FPU if MMU + select ARCH_WANT_IPC_PARSE_VERSION select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE config RWSEM_GENERIC_SPINLOCK diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index ea0b502f845..045cfd6a9e3 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -357,7 +357,6 @@ #define NR_syscalls 347 -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT #define __ARCH_WANT_STAT64 diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 0bf44231aaf..ab9afcaa7f6 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -15,6 +15,7 @@ config MICROBLAZE select TRACING_SUPPORT select OF select OF_EARLY_FLATTREE + select ARCH_WANT_IPC_PARSE_VERSION select IRQ_DOMAIN select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index d20ffbc86be..6985e6e9d82 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -400,7 +400,6 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -#define __ARCH_WANT_IPC_PARSE_VERSION /* #define 
__ARCH_WANT_OLD_READDIR */ /* #define __ARCH_WANT_OLD_STAT */ #define __ARCH_WANT_STAT64 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5e238d03960..2d56cd5af33 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -27,6 +27,7 @@ config MIPS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select HAVE_ARCH_JUMP_LABEL + select ARCH_WANT_IPC_PARSE_VERSION select IRQ_FORCED_THREADING select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index d8dad5340ea..bebbde01be9 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -1034,7 +1034,6 @@ #ifndef __ASSEMBLY__ #define __ARCH_OMIT_COMPAT_SYS_GETDENTS64 -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 687f9b4a2ed..5cfb086b390 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -3,6 +3,7 @@ config MN10300 select HAVE_OPROFILE select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW + select ARCH_WANT_IPC_PARSE_VERSION select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_KGDB select HAVE_NMI_WATCHDOG if MN10300_WD_TIMER diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index 9051f921cbc..866eb14749d 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -358,7 +358,6 @@ /* * specify the deprecated syscalls we want to support on this arch */ -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT #define __ARCH_WANT_STAT64 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4f681b78dd8..352f416269c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -121,6 +121,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 select HAVE_GENERIC_HARDIRQS + select 
ARCH_WANT_IPC_PARSE_VERSION select SPARSE_IRQ select IRQ_PER_CPU select IRQ_DOMAIN diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index d3d1b5efd7e..bd377a36861 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -389,7 +389,6 @@ #include #include -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d0a5e92b6b9..296cd32466d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -118,6 +118,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_CLOCKEVENTS diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 2e37157ba6a..6756e78f480 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -388,7 +388,6 @@ #define __IGNORE_recvmmsg #define __IGNORE_sendmmsg -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a24595d83ad..36f5141e804 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -21,6 +21,7 @@ config SUPERH select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select ARCH_WANT_IPC_PARSE_VERSION select HAVE_SYSCALL_TRACEPOINTS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_GENERIC_HARDIRQS @@ -50,6 +51,7 @@ config SUPERH32 select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE + select ARCH_WANT_IPC_PARSE_VERSION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_ARCH_KGDB select HAVE_HW_BREAKPOINT diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index e800a38c9f8..7bc67076baa 100644 
--- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -6,7 +6,6 @@ # endif # define __ARCH_WANT_SYS_RT_SIGSUSPEND -# define __ARCH_WANT_IPC_PARSE_VERSION # define __ARCH_WANT_OLD_READDIR # define __ARCH_WANT_OLD_STAT # define __ARCH_WANT_STAT64 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e74ff137762..67f1f6f5f4e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -27,6 +27,7 @@ config SPARC select HAVE_ARCH_JUMP_LABEL select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW + select ARCH_WANT_IPC_PARSE_VERSION select USE_GENERIC_SMP_HELPERS if SMP select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index c7cb0af0eb5..fb269346480 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -423,7 +423,6 @@ #endif #ifdef __KERNEL__ -#define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ca4fdefe79e..ba2657c4921 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select GENERIC_IOMAP select DCACHE_WORD_ACCESS select GENERIC_SMP_IDLE_THREAD + select ARCH_WANT_IPC_PARSE_VERSION if X86_32 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 4437001d8e3..0d9776e9e2d 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -15,7 +15,6 @@ # ifdef CONFIG_X86_32 # include -# define __ARCH_WANT_IPC_PARSE_VERSION # define __ARCH_WANT_STAT64 # define __ARCH_WANT_SYS_IPC # define __ARCH_WANT_SYS_OLD_MMAP diff --git a/include/linux/compat.h b/include/linux/compat.h index f2b8fe20cc8..09b28b7369d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -256,7 +256,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user 
*head_ptr, compat_size_t __user *len_ptr); #ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC -#define __ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_sys_semctl(int first, int second, int third, void __user *uptr); long compat_sys_msgsnd(int first, int second, int third, void __user *uptr); long compat_sys_msgrcv(int first, int second, int msgtyp, int third, diff --git a/ipc/compat.c b/ipc/compat.c index 20f92b2f293..ad9518eb26e 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -118,7 +118,7 @@ extern int sem_ctls[]; static inline int compat_ipc_parse_version(int *cmd) { -#ifdef __ARCH_WANT_COMPAT_IPC_PARSE_VERSION +#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION int version = *cmd & IPC_64; /* this is tricky: architectures that have support for the old diff --git a/ipc/util.c b/ipc/util.c index 75261a31d48..eb07fd356f2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -804,7 +804,7 @@ out_up: return ERR_PTR(err); } -#ifdef __ARCH_WANT_IPC_PARSE_VERSION +#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /** @@ -826,7 +826,7 @@ int ipc_parse_version (int *cmd) } } -#endif /* __ARCH_WANT_IPC_PARSE_VERSION */ +#endif /* CONFIG_ARCH_WANT_IPC_PARSE_VERSION */ #ifdef CONFIG_PROC_FS struct ipc_proc_iter { diff --git a/ipc/util.h b/ipc/util.h index 6f5c20bedaa..850ef3e962c 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -130,7 +130,7 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); -#ifndef __ARCH_WANT_IPC_PARSE_VERSION +#ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /* On IA-64, we always use the "64-bit version" of the IPC structures. 
*/ # define ipc_parse_version(cmd) IPC_64 #else -- cgit v1.2.3-70-g09d2 From 3b6961ba8c682cc71e51079017743c1b282fd259 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 26 Jul 2012 19:40:08 -0400 Subject: ACPI/x86: revert 'x86, acpi: Call acpi_enter_sleep_state via an asmlinkage C function from assembler' cd74257b974d6d26442c97891c4d05772748b177 patched up GTS/BFS -- a feature we want to remove. So revert it (by hand, due to conflict in sleep.h) to prepare for GTS/BFS removal. Signed-off-by: Len Brown Acked-by: Ingo Molnar Acked-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/acpi/sleep.c | 4 ---- arch/x86/kernel/acpi/sleep.h | 2 -- arch/x86/kernel/acpi/wakeup_32.S | 4 +++- arch/x86/kernel/acpi/wakeup_64.S | 4 +++- 4 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 95bf99de905..1b8e5a03d94 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -25,10 +25,6 @@ unsigned long acpi_realmode_flags; static char temp_stack[4096]; #endif -asmlinkage void acpi_enter_s3(void) -{ - acpi_enter_sleep_state(3, wake_sleep_flags); -} /** * acpi_suspend_lowlevel - save kernel state * diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 5653a5791ec..67f59f8c695 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -2,7 +2,6 @@ * Variables and functions used by the code in sleep.c */ -#include #include extern unsigned long saved_video_mode; @@ -11,7 +10,6 @@ extern long saved_magic; extern int wakeup_pmode_return; extern u8 wake_sleep_flags; -extern asmlinkage void acpi_enter_s3(void); extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 72610839f03..13ab720573e 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -74,7 +74,9 @@ restore_registers: 
ENTRY(do_suspend_lowlevel) call save_processor_state call save_registers - call acpi_enter_s3 + pushl $3 + call acpi_enter_sleep_state + addl $4, %esp # In case of S3 failure, we'll emerge here. Jump # to ret_point to recover diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 014d1d28c39..8ea5164cbd0 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -71,7 +71,9 @@ ENTRY(do_suspend_lowlevel) movq %rsi, saved_rsi addq $8, %rsp - call acpi_enter_s3 + movl $3, %edi + xorl %eax, %eax + call acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ jmp resume_point -- cgit v1.2.3-70-g09d2 From 7740dfc0363a70546ef25c0383aca252f95a91d2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 21 Jul 2012 10:53:46 +1000 Subject: perf/x86/intel/uncore: Make UNCORE_PMU_HRTIMER_INTERVAL 64-bit i386 allmodconfig: arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function 'uncore_pmu_hrtimer': arch/x86/kernel/cpu/perf_event_intel_uncore.c:728: warning: integer overflow in expression arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function 'uncore_pmu_start_hrtimer': arch/x86/kernel/cpu/perf_event_intel_uncore.c:735: warning: integer overflow in expression Signed-off-by: Andrew Morton Cc: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-h84qlqj02zrojmxxybzmy9hi@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index f3851892e07..c9e5dc56630 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -5,7 +5,7 @@ #include "perf_event.h" #define UNCORE_PMU_NAME_LEN 32 -#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) +#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * 
NSEC_PER_SEC) #define UNCORE_FIXED_EVENT 0xff #define UNCORE_PMC_IDX_MAX_GENERIC 8 -- cgit v1.2.3-70-g09d2 From d07bdfd322d307789f15b427dbcc39257665356f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Jul 2012 09:42:15 +0200 Subject: perf/x86: Fix USER/KERNEL tagging of samples properly Some PMUs don't provide a full register set for their sample, specifically 'advanced' PMUs like AMD IBS and Intel PEBS which provide 'better' than regular interrupt accuracy. In this case we use the interrupt regs as basis and over-write some fields (typically IP) with different information. The perf core however uses user_mode() to distinguish user/kernel samples, user_mode() relies on regs->cs. If the interrupt skid pushed us over a boundary the new IP might not be in the same domain as the interrupt. Commit ce5c1fe9a9e ("perf/x86: Fix USER/KERNEL tagging of samples") tried to fix this by making the perf core use kernel_ip(). This however is wrong (TM), as pointed out by Linus, since it doesn't allow for VM86 and non-zero based segments in IA32 mode. Therefore, provide a new helper to set the regs->ip field, set_linear_ip(), which massages the regs into a suitable state assuming the provided IP is in fact a linear address. Also modify perf_instruction_pointer() and perf_callchain_user() to deal with segments base offsets. 
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341910954.3462.102.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 11 ++-- arch/x86/kernel/cpu/perf_event.c | 89 +++++++++++++++++++++++++++---- arch/x86/kernel/cpu/perf_event.h | 20 +++++++ arch/x86/kernel/cpu/perf_event_amd_ibs.c | 4 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 7 +-- 5 files changed, 114 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index dab39350e51..cb4e43bce98 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -196,11 +196,16 @@ static inline u32 get_ibs_caps(void) { return 0; } extern void perf_events_lapic_init(void); /* - * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups. - * This flag is otherwise unused and ABI specified to be 0, so nobody should - * care what we do with it. + * Abuse bits {3,5} of the cpu eflags register. These flags are otherwise + * unused and ABI specified to be 0, so nobody should care what we do with + * them. + * + * EXACT - the IP points to the exact instruction that triggered the + * event (HW bugs exempt). + * VM - original X86_VM_MASK; see set_linear_ip(). 
*/ #define PERF_EFLAGS_EXACT (1UL << 3) +#define PERF_EFLAGS_VM (1UL << 5) struct pt_regs; extern unsigned long perf_instruction_pointer(struct pt_regs *regs); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 29557aa06dd..915b876edd1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include "perf_event.h" @@ -1738,6 +1740,29 @@ valid_user_frame(const void __user *fp, unsigned long size) return (__range_not_ok(fp, size, TASK_SIZE) == 0); } +static unsigned long get_segment_base(unsigned int segment) +{ + struct desc_struct *desc; + int idx = segment >> 3; + + if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { + if (idx > LDT_ENTRIES) + return 0; + + if (idx > current->active_mm->context.size) + return 0; + + desc = current->active_mm->context.ldt; + } else { + if (idx > GDT_ENTRIES) + return 0; + + desc = __this_cpu_ptr(&gdt_page.gdt[0]); + } + + return get_desc_base(desc + idx); +} + #ifdef CONFIG_COMPAT #include @@ -1746,13 +1771,17 @@ static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { /* 32-bit process in 64-bit kernel. 
*/ + unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const void __user *fp; if (!test_thread_flag(TIF_IA32)) return 0; - fp = compat_ptr(regs->bp); + cs_base = get_segment_base(regs->cs); + ss_base = get_segment_base(regs->ss); + + fp = compat_ptr(ss_base + regs->bp); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = 0; @@ -1765,8 +1794,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (!valid_user_frame(fp, sizeof(frame))) break; - perf_callchain_store(entry, frame.return_address); - fp = compat_ptr(frame.next_frame); + perf_callchain_store(entry, cs_base + frame.return_address); + fp = compat_ptr(ss_base + frame.next_frame); } return 1; } @@ -1789,6 +1818,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; } + /* + * We don't know what to do with VM86 stacks.. ignore them for now. + */ + if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) + return; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -1816,16 +1851,50 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +/* + * Deal with code segment offsets for the various execution modes: + * + * VM86 - the good olde 16 bit days, where the linear address is + * 20 bits and we use regs->ip + 0x10 * regs->cs. + * + * IA32 - Where we need to look at GDT/LDT segment descriptor tables + * to figure out what the 32bit base address is. + * + * X32 - has TIF_X32 set, but is running in x86_64 + * + * X86_64 - CS,DS,SS,ES are all zero based. + */ +static unsigned long code_segment_base(struct pt_regs *regs) { - unsigned long ip; + /* + * If we are in VM86 mode, add the segment offset to convert to a + * linear address. 
+ */ + if (regs->flags & X86_VM_MASK) + return 0x10 * regs->cs; + + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ +#ifdef CONFIG_X86_32 + if (user_mode(regs) && regs->cs != __USER_CS) + return get_segment_base(regs->cs); +#else + if (test_thread_flag(TIF_IA32)) { + if (user_mode(regs) && regs->cs != __USER32_CS) + return get_segment_base(regs->cs); + } +#endif + return 0; +} +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - ip = perf_guest_cbs->get_guest_ip(); - else - ip = instruction_pointer(regs); + return perf_guest_cbs->get_guest_ip(); - return ip; + return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) @@ -1838,7 +1907,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs) else misc |= PERF_RECORD_MISC_GUEST_KERNEL; } else { - if (!kernel_ip(regs->ip)) + if (user_mode(regs)) misc |= PERF_RECORD_MISC_USER; else misc |= PERF_RECORD_MISC_KERNEL; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 821d53b696d..6605a81ba33 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -516,6 +516,26 @@ static inline bool kernel_ip(unsigned long ip) #endif } +/* + * Not all PMUs provide the right context information to place the reported IP + * into full context. Specifically segment registers are typically not + * supplied. + * + * Assuming the address is a linear address (it is for IBS), we fake the CS and + * vm86 mode using the known zero-based code segment and 'fix up' the registers + * to reflect this. + * + * Intel PEBS/LBR appear to typically provide the effective address, nothing + * much we can do about that but pray and treat it like a linear address. + */ +static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) +{ + regs->cs = kernel_ip(ip) ? 
__KERNEL_CS : __USER_CS; + if (regs->flags & X86_VM_MASK) + regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); + regs->ip = ip; +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index da9bcdcd985..7bfb5bec863 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -13,6 +13,8 @@ #include +#include "perf_event.h" + static u32 ibs_caps; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) @@ -536,7 +538,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { regs.flags &= ~PERF_EFLAGS_EXACT; } else { - instruction_pointer_set(&regs, ibs_data.regs[1]); + set_linear_ip(&regs, ibs_data.regs[1]); regs.flags |= PERF_EFLAGS_EXACT; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 629ae0b7ad9..e38d97bf425 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -499,7 +499,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) * We sampled a branch insn, rewind using the LBR stack */ if (ip == to) { - regs->ip = from; + set_linear_ip(regs, from); return 1; } @@ -529,7 +529,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) } while (to < ip); if (to == ip) { - regs->ip = old_to; + set_linear_ip(regs, old_to); return 1; } @@ -569,7 +569,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * A possible PERF_SAMPLE_REGS will have to transfer all regs.
*/ regs = *iregs; - regs.ip = pebs->ip; + regs.flags = pebs->flags; + set_linear_ip(&regs, pebs->ip); regs.bp = pebs->bp; regs.sp = pebs->sp; -- cgit v1.2.3-70-g09d2 From 392a325c4351339cfbf182bb5a1444df1cf65dbb Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 10 Jul 2012 19:31:51 -0700 Subject: Platform: OLPC: add a stub to drivers/platform/ for the OLPC EC driver The OLPC EC driver has outgrown arch/x86/platform/. It's time to both share common code amongst different architectures, as well as move it out of arch/x86/. The XO-1.75 is ARM-based, and the EC driver shares a lot of code with the x86 code. Signed-off-by: Andres Salomon Acked-by: Paul Fox Reviewed-by: Thomas Gleixner --- arch/x86/include/asm/olpc.h | 19 +++---------------- arch/x86/platform/olpc/olpc.c | 4 ++-- drivers/platform/Makefile | 1 + drivers/platform/olpc/Makefile | 4 ++++ drivers/platform/olpc/olpc-ec.c | 16 ++++++++++++++++ include/linux/olpc-ec.h | 29 +++++++++++++++++++++++++++++ 6 files changed, 55 insertions(+), 18 deletions(-) create mode 100644 drivers/platform/olpc/Makefile create mode 100644 drivers/platform/olpc/olpc-ec.c create mode 100644 include/linux/olpc-ec.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 87bdbca72f9..513e9992771 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -4,6 +4,7 @@ #define _ASM_X86_OLPC_H #include +#include struct olpc_platform_t { int flags; @@ -102,22 +103,8 @@ extern int pci_olpc_init(void); /* EC related functions */ -extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, - unsigned char *outbuf, size_t outlen); - -/* EC commands */ - -#define EC_FIRMWARE_REV 0x08 -#define EC_WRITE_SCI_MASK 0x1b -#define EC_WAKE_UP_WLAN 0x24 -#define EC_WLAN_LEAVE_RESET 0x25 -#define EC_READ_EB_MODE 0x2a -#define EC_SET_SCI_INHIBIT 0x32 -#define EC_SET_SCI_INHIBIT_RELEASE 0x34 -#define EC_WLAN_ENTER_RESET 0x35 -#define EC_WRITE_EXT_SCI_MASK 0x38 -#define
EC_SCI_QUERY 0x84 -#define EC_EXT_SCI_QUERY 0x85 +extern int olpc_ec_cmd_x86(unsigned char cmd, unsigned char *inbuf, + size_t inlen, unsigned char *outbuf, size_t outlen); /* SCI source values */ diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index a4bee53c2e5..796e199ac77 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -125,7 +125,7 @@ static int __wait_on_obf(unsigned int line, unsigned int port, int desired) * . Unfortunately, while * OpenFirmware's source is available, the EC's is not. */ -int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, +int olpc_ec_cmd_x86(unsigned char cmd, unsigned char *inbuf, size_t inlen, unsigned char *outbuf, size_t outlen) { unsigned long flags; @@ -201,7 +201,7 @@ err: spin_unlock_irqrestore(&ec_lock, flags); return ret; } -EXPORT_SYMBOL_GPL(olpc_ec_cmd); +EXPORT_SYMBOL_GPL(olpc_ec_cmd_x86); void olpc_ec_wakeup_set(u16 value) { diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile index 782953ae4c0..b17c16ce54a 100644 --- a/drivers/platform/Makefile +++ b/drivers/platform/Makefile @@ -3,3 +3,4 @@ # obj-$(CONFIG_X86) += x86/ +obj-$(CONFIG_OLPC) += olpc/ diff --git a/drivers/platform/olpc/Makefile b/drivers/platform/olpc/Makefile new file mode 100644 index 00000000000..dc8b26bc720 --- /dev/null +++ b/drivers/platform/olpc/Makefile @@ -0,0 +1,4 @@ +# +# OLPC XO platform-specific drivers +# +obj-$(CONFIG_OLPC) += olpc-ec.o diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c new file mode 100644 index 00000000000..42026036cd3 --- /dev/null +++ b/drivers/platform/olpc/olpc-ec.c @@ -0,0 +1,16 @@ +/* + * Generic driver for the OLPC Embedded Controller. + * + * Copyright (C) 2011-2012 One Laptop per Child Foundation. + * + * Licensed under the GPL v2 or later. 
+ */ +#include +#include + +int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen) +{ + /* Currently a stub; this will be expanded upon later. */ + return olpc_ec_cmd_x86(cmd, inbuf, inlen, outbuf, outlen); +} +EXPORT_SYMBOL_GPL(olpc_ec_cmd); diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h new file mode 100644 index 00000000000..6d4e426d9fd --- /dev/null +++ b/include/linux/olpc-ec.h @@ -0,0 +1,29 @@ +#ifndef _LINUX_OLPC_EC_H +#define _LINUX_OLPC_EC_H + +/* XO-1 EC commands */ +#define EC_FIRMWARE_REV 0x08 +#define EC_WRITE_SCI_MASK 0x1b +#define EC_WAKE_UP_WLAN 0x24 +#define EC_WLAN_LEAVE_RESET 0x25 +#define EC_READ_EB_MODE 0x2a +#define EC_SET_SCI_INHIBIT 0x32 +#define EC_SET_SCI_INHIBIT_RELEASE 0x34 +#define EC_WLAN_ENTER_RESET 0x35 +#define EC_WRITE_EXT_SCI_MASK 0x38 +#define EC_SCI_QUERY 0x84 +#define EC_EXT_SCI_QUERY 0x85 + +#ifdef CONFIG_OLPC + +extern int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, + size_t outlen); + +#else + +static inline int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, + size_t outlen) { return -ENODEV; } + +#endif /* CONFIG_OLPC */ + +#endif /* _LINUX_OLPC_EC_H */ -- cgit v1.2.3-70-g09d2 From 3bf9428f220911795edde453a95f9509945004e5 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 11 Jul 2012 01:16:29 -0700 Subject: drivers: OLPC: update various drivers to include olpc-ec.h Switch over to using olpc-ec.h in multiple steps, so as not to break builds. This covers every driver that calls olpc_ec_cmd(). 
Signed-off-by: Andres Salomon Acked-by: Paul Fox Reviewed-by: Thomas Gleixner --- arch/x86/include/asm/olpc.h | 1 - arch/x86/platform/olpc/olpc-xo1-pm.c | 1 + arch/x86/platform/olpc/olpc-xo1-sci.c | 1 + arch/x86/platform/olpc/olpc-xo15-sci.c | 1 + arch/x86/platform/olpc/olpc.c | 1 + drivers/net/wireless/libertas/if_usb.c | 1 + drivers/platform/x86/xo1-rfkill.c | 3 +-- drivers/power/olpc_battery.c | 1 + drivers/staging/olpc_dcon/olpc_dcon.c | 1 + 9 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 513e9992771..5b28f3e6975 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -4,7 +4,6 @@ #define _ASM_X86_OLPC_H #include -#include struct olpc_platform_t { int flags; diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index 0ce8616c88a..8054b64ec4c 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 04b8c73659c..63d4aa40956 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 599be499fdf..2fdca25905a 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 796e199ac77..a3fa180c15c 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/wireless/libertas/if_usb.c 
b/drivers/net/wireless/libertas/if_usb.c index 55a77e41170..27980778d99 100644 --- a/drivers/net/wireless/libertas/if_usb.c +++ b/drivers/net/wireless/libertas/if_usb.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_OLPC #include diff --git a/drivers/platform/x86/xo1-rfkill.c b/drivers/platform/x86/xo1-rfkill.c index b57ad864148..1da13ed34b0 100644 --- a/drivers/platform/x86/xo1-rfkill.c +++ b/drivers/platform/x86/xo1-rfkill.c @@ -12,8 +12,7 @@ #include #include #include - -#include +#include static bool card_blocked; diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c index 7385092f9bc..eaf917dc30d 100644 --- a/drivers/power/olpc_battery.c +++ b/drivers/power/olpc_battery.c @@ -17,6 +17,7 @@ #include #include #include +#include #include diff --git a/drivers/staging/olpc_dcon/olpc_dcon.c b/drivers/staging/olpc_dcon/olpc_dcon.c index 992275c0d87..2c4bd746715 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon.c +++ b/drivers/staging/olpc_dcon/olpc_dcon.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 85f90cf6ca569b19cee212844b543a7355b77163 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Thu, 12 Jul 2012 17:57:28 -0700 Subject: x86: OLPC: switch over to using new EC driver on x86 This uses the new EC driver framework in drivers/platform/olpc. The XO-1 and XO-1.5-specific code is still in arch/x86, but the generic stuff (including a new workqueue; no more running EC commands with IRQs disabled!) can be shared with other architectures. 
Signed-off-by: Andres Salomon Acked-by: Paul Fox Reviewed-by: Thomas Gleixner --- arch/x86/include/asm/olpc.h | 5 ---- arch/x86/platform/olpc/olpc.c | 53 +++++++++++++++++++++-------------------- drivers/platform/olpc/olpc-ec.c | 5 ---- 3 files changed, 27 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 5b28f3e6975..72f9adf6eca 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -100,11 +100,6 @@ extern void olpc_xo1_pm_wakeup_clear(u16 value); extern int pci_olpc_init(void); -/* EC related functions */ - -extern int olpc_ec_cmd_x86(unsigned char cmd, unsigned char *inbuf, - size_t inlen, unsigned char *outbuf, size_t outlen); - /* SCI source values */ #define EC_SCI_SRC_EMPTY 0x00 diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index a3fa180c15c..45900968fb8 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -32,8 +31,6 @@ struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); -static DEFINE_SPINLOCK(ec_lock); - /* debugfs interface to EC commands */ #define EC_MAX_CMD_ARGS (5 + 1) /* cmd byte + 5 args */ #define EC_MAX_CMD_REPLY (8) @@ -126,16 +123,13 @@ static int __wait_on_obf(unsigned int line, unsigned int port, int desired) * . Unfortunately, while * OpenFirmware's source is available, the EC's is not. 
*/ -int olpc_ec_cmd_x86(unsigned char cmd, unsigned char *inbuf, size_t inlen, - unsigned char *outbuf, size_t outlen) +static int olpc_xo1_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, + size_t outlen, void *arg) { - unsigned long flags; int ret = -EIO; int i; int restarts = 0; - spin_lock_irqsave(&ec_lock, flags); - /* Clear OBF */ for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++) inb(0x68); @@ -199,10 +193,8 @@ restart: ret = 0; err: - spin_unlock_irqrestore(&ec_lock, flags); return ret; } -EXPORT_SYMBOL_GPL(olpc_ec_cmd_x86); void olpc_ec_wakeup_set(u16 value) { @@ -366,7 +358,7 @@ static void setup_debugfs(void) &ec_debugfs_genops); } -static int olpc_ec_suspend(void) +static int olpc_ec_suspend(struct platform_device *pdev) { return olpc_ec_mask_write(ec_wakeup_mask); } @@ -425,8 +417,28 @@ static int __init add_xo1_platform_devices(void) return 0; } -static struct syscore_ops olpc_syscore_ops = { +static int olpc_xo1_ec_probe(struct platform_device *pdev) +{ + /* get the EC revision */ + olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, + (unsigned char *) &olpc_platform_info.ecver, 1); + + /* EC version 0x5f adds support for wide SCI mask */ + if (olpc_platform_info.ecver >= 0x5f) + olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI; + + pr_info("OLPC board revision %s%X (EC=%x)\n", + ((olpc_platform_info.boardrev & 0xf) < 8) ? 
"pre" : "", + olpc_platform_info.boardrev >> 4, + olpc_platform_info.ecver); + + return 0; +} + +static struct olpc_ec_driver ec_xo1_driver = { .suspend = olpc_ec_suspend, + .probe = olpc_xo1_ec_probe, + .ec_cmd = olpc_xo1_ec_cmd, }; static int __init olpc_init(void) @@ -436,16 +448,14 @@ static int __init olpc_init(void) if (!olpc_ofw_present() || !platform_detect()) return 0; - spin_lock_init(&ec_lock); + /* register the XO-1 and 1.5-specific EC handler */ + olpc_ec_driver_register(&ec_xo1_driver, NULL); + platform_device_register_simple("olpc-ec", -1, NULL, 0); /* assume B1 and above models always have a DCON */ if (olpc_board_at_least(olpc_board(0xb1))) olpc_platform_info.flags |= OLPC_F_DCON; - /* get the EC revision */ - olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, - (unsigned char *) &olpc_platform_info.ecver, 1); - #ifdef CONFIG_PCI_OLPC /* If the VSA exists let it emulate PCI, if not emulate in kernel. * XO-1 only. */ @@ -453,14 +463,6 @@ static int __init olpc_init(void) !cs5535_has_vsa2()) x86_init.pci.arch_init = pci_olpc_init; #endif - /* EC version 0x5f adds support for wide SCI mask */ - if (olpc_platform_info.ecver >= 0x5f) - olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI; - - printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", - ((olpc_platform_info.boardrev & 0xf) < 8) ? 
"pre" : "", - olpc_platform_info.boardrev >> 4, - olpc_platform_info.ecver); if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ r = add_xo1_platform_devices(); @@ -468,7 +470,6 @@ static int __init olpc_init(void) return r; } - register_syscore_ops(&olpc_syscore_ops); setup_debugfs(); return 0; diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c index cfba41fb04d..a3d32c2eeb1 100644 --- a/drivers/platform/olpc/olpc-ec.c +++ b/drivers/platform/olpc/olpc-ec.c @@ -113,11 +113,6 @@ int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen) struct olpc_ec_priv *ec = ec_priv; struct ec_cmd_desc desc; - /* XXX: this will be removed in later patches */ - /* Are we using old-style callers? */ - if (!ec_driver || !ec_driver->ec_cmd) - return olpc_ec_cmd_x86(cmd, inbuf, inlen, outbuf, outlen); - /* Ensure a driver and ec hook have been registered */ if (WARN_ON(!ec_driver || !ec_driver->ec_cmd)) return -ENODEV; -- cgit v1.2.3-70-g09d2 From 6cca83d498bda0999302079bd59786370590c5c2 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Thu, 12 Jul 2012 20:45:14 -0700 Subject: Platform: OLPC: move debugfs support from x86 EC driver There's nothing about the debugfs interface for the EC driver that is architecture-specific, so move it into the arch-independent driver. The code is mostly unchanged with the exception of renamed variables, coding style changes, and API updates. 
Signed-off-by: Andres Salomon Acked-by: Paul Fox Reviewed-by: Thomas Gleixner --- arch/x86/platform/olpc/olpc.c | 97 --------------------------------- drivers/platform/olpc/olpc-ec.c | 117 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 97 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 45900968fb8..ed41b437b37 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -31,15 +30,6 @@ struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); -/* debugfs interface to EC commands */ -#define EC_MAX_CMD_ARGS (5 + 1) /* cmd byte + 5 args */ -#define EC_MAX_CMD_REPLY (8) - -static struct dentry *ec_debugfs_dir; -static DEFINE_MUTEX(ec_debugfs_cmd_lock); -static unsigned char ec_debugfs_resp[EC_MAX_CMD_REPLY]; -static unsigned int ec_debugfs_resp_bytes; - /* EC event mask to be applied during suspend (defining wakeup sources). 
*/ static u16 ec_wakeup_mask; @@ -273,91 +263,6 @@ int olpc_ec_sci_query(u16 *sci_value) } EXPORT_SYMBOL_GPL(olpc_ec_sci_query); -static ssize_t ec_debugfs_cmd_write(struct file *file, const char __user *buf, - size_t size, loff_t *ppos) -{ - int i, m; - unsigned char ec_cmd[EC_MAX_CMD_ARGS]; - unsigned int ec_cmd_int[EC_MAX_CMD_ARGS]; - char cmdbuf[64]; - int ec_cmd_bytes; - - mutex_lock(&ec_debugfs_cmd_lock); - - size = simple_write_to_buffer(cmdbuf, sizeof(cmdbuf), ppos, buf, size); - - m = sscanf(cmdbuf, "%x:%u %x %x %x %x %x", &ec_cmd_int[0], - &ec_debugfs_resp_bytes, - &ec_cmd_int[1], &ec_cmd_int[2], &ec_cmd_int[3], - &ec_cmd_int[4], &ec_cmd_int[5]); - if (m < 2 || ec_debugfs_resp_bytes > EC_MAX_CMD_REPLY) { - /* reset to prevent overflow on read */ - ec_debugfs_resp_bytes = 0; - - printk(KERN_DEBUG "olpc-ec: bad ec cmd: " - "cmd:response-count [arg1 [arg2 ...]]\n"); - size = -EINVAL; - goto out; - } - - /* convert scanf'd ints to char */ - ec_cmd_bytes = m - 2; - for (i = 0; i <= ec_cmd_bytes; i++) - ec_cmd[i] = ec_cmd_int[i]; - - printk(KERN_DEBUG "olpc-ec: debugfs cmd 0x%02x with %d args " - "%02x %02x %02x %02x %02x, want %d returns\n", - ec_cmd[0], ec_cmd_bytes, ec_cmd[1], ec_cmd[2], ec_cmd[3], - ec_cmd[4], ec_cmd[5], ec_debugfs_resp_bytes); - - olpc_ec_cmd(ec_cmd[0], (ec_cmd_bytes == 0) ? 
NULL : &ec_cmd[1], - ec_cmd_bytes, ec_debugfs_resp, ec_debugfs_resp_bytes); - - printk(KERN_DEBUG "olpc-ec: response " - "%02x %02x %02x %02x %02x %02x %02x %02x (%d bytes expected)\n", - ec_debugfs_resp[0], ec_debugfs_resp[1], ec_debugfs_resp[2], - ec_debugfs_resp[3], ec_debugfs_resp[4], ec_debugfs_resp[5], - ec_debugfs_resp[6], ec_debugfs_resp[7], ec_debugfs_resp_bytes); - -out: - mutex_unlock(&ec_debugfs_cmd_lock); - return size; -} - -static ssize_t ec_debugfs_cmd_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - unsigned int i, r; - char *rp; - char respbuf[64]; - - mutex_lock(&ec_debugfs_cmd_lock); - rp = respbuf; - rp += sprintf(rp, "%02x", ec_debugfs_resp[0]); - for (i = 1; i < ec_debugfs_resp_bytes; i++) - rp += sprintf(rp, ", %02x", ec_debugfs_resp[i]); - mutex_unlock(&ec_debugfs_cmd_lock); - rp += sprintf(rp, "\n"); - - r = rp - respbuf; - return simple_read_from_buffer(buf, size, ppos, respbuf, r); -} - -static const struct file_operations ec_debugfs_genops = { - .write = ec_debugfs_cmd_write, - .read = ec_debugfs_cmd_read, -}; - -static void setup_debugfs(void) -{ - ec_debugfs_dir = debugfs_create_dir("olpc-ec", 0); - if (ec_debugfs_dir == ERR_PTR(-ENODEV)) - return; - - debugfs_create_file("cmd", 0600, ec_debugfs_dir, NULL, - &ec_debugfs_genops); -} - static int olpc_ec_suspend(struct platform_device *pdev) { return olpc_ec_mask_write(ec_wakeup_mask); @@ -470,8 +375,6 @@ static int __init olpc_init(void) return r; } - setup_debugfs(); - return 0; } diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c index a3d32c2eeb1..1a15a79fff0 100644 --- a/drivers/platform/olpc/olpc-ec.c +++ b/drivers/platform/olpc/olpc-ec.c @@ -6,6 +6,7 @@ * Licensed under the GPL v2 or later. 
*/ #include +#include #include #include #include @@ -31,6 +32,8 @@ struct ec_cmd_desc { struct olpc_ec_priv { struct olpc_ec_driver *drv; + struct dentry *dbgfs_dir; + /* * Running an EC command while suspending means we don't always finish * the command before the machine suspends. This means that the EC @@ -144,6 +147,114 @@ int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen) } EXPORT_SYMBOL_GPL(olpc_ec_cmd); +#ifdef CONFIG_DEBUG_FS + +/* + * debugfs support for "generic commands", to allow sending + * arbitrary EC commands from userspace. + */ + +#define EC_MAX_CMD_ARGS (5 + 1) /* cmd byte + 5 args */ +#define EC_MAX_CMD_REPLY (8) + +static DEFINE_MUTEX(ec_dbgfs_lock); +static unsigned char ec_dbgfs_resp[EC_MAX_CMD_REPLY]; +static unsigned int ec_dbgfs_resp_bytes; + +static ssize_t ec_dbgfs_cmd_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) +{ + int i, m; + unsigned char ec_cmd[EC_MAX_CMD_ARGS]; + unsigned int ec_cmd_int[EC_MAX_CMD_ARGS]; + char cmdbuf[64]; + int ec_cmd_bytes; + + mutex_lock(&ec_dbgfs_lock); + + size = simple_write_to_buffer(cmdbuf, sizeof(cmdbuf), ppos, buf, size); + + m = sscanf(cmdbuf, "%x:%u %x %x %x %x %x", &ec_cmd_int[0], + &ec_dbgfs_resp_bytes, &ec_cmd_int[1], &ec_cmd_int[2], + &ec_cmd_int[3], &ec_cmd_int[4], &ec_cmd_int[5]); + if (m < 2 || ec_dbgfs_resp_bytes > EC_MAX_CMD_REPLY) { + /* reset to prevent overflow on read */ + ec_dbgfs_resp_bytes = 0; + + pr_debug("olpc-ec: bad ec cmd: cmd:response-count [arg1 [arg2 ...]]\n"); + size = -EINVAL; + goto out; + } + + /* convert scanf'd ints to char */ + ec_cmd_bytes = m - 2; + for (i = 0; i <= ec_cmd_bytes; i++) + ec_cmd[i] = ec_cmd_int[i]; + + pr_debug("olpc-ec: debugfs cmd 0x%02x with %d args %02x %02x %02x %02x %02x, want %d returns\n", + ec_cmd[0], ec_cmd_bytes, ec_cmd[1], ec_cmd[2], + ec_cmd[3], ec_cmd[4], ec_cmd[5], ec_dbgfs_resp_bytes); + + olpc_ec_cmd(ec_cmd[0], (ec_cmd_bytes == 0) ? 
NULL : &ec_cmd[1], + ec_cmd_bytes, ec_dbgfs_resp, ec_dbgfs_resp_bytes); + + pr_debug("olpc-ec: response %02x %02x %02x %02x %02x %02x %02x %02x (%d bytes expected)\n", + ec_dbgfs_resp[0], ec_dbgfs_resp[1], ec_dbgfs_resp[2], + ec_dbgfs_resp[3], ec_dbgfs_resp[4], ec_dbgfs_resp[5], + ec_dbgfs_resp[6], ec_dbgfs_resp[7], + ec_dbgfs_resp_bytes); + +out: + mutex_unlock(&ec_dbgfs_lock); + return size; +} + +static ssize_t ec_dbgfs_cmd_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + unsigned int i, r; + char *rp; + char respbuf[64]; + + mutex_lock(&ec_dbgfs_lock); + rp = respbuf; + rp += sprintf(rp, "%02x", ec_dbgfs_resp[0]); + for (i = 1; i < ec_dbgfs_resp_bytes; i++) + rp += sprintf(rp, ", %02x", ec_dbgfs_resp[i]); + mutex_unlock(&ec_dbgfs_lock); + rp += sprintf(rp, "\n"); + + r = rp - respbuf; + return simple_read_from_buffer(buf, size, ppos, respbuf, r); +} + +static const struct file_operations ec_dbgfs_ops = { + .write = ec_dbgfs_cmd_write, + .read = ec_dbgfs_cmd_read, +}; + +static struct dentry *olpc_ec_setup_debugfs(void) +{ + struct dentry *dbgfs_dir; + + dbgfs_dir = debugfs_create_dir("olpc-ec", NULL); + if (IS_ERR_OR_NULL(dbgfs_dir)) + return NULL; + + debugfs_create_file("cmd", 0600, dbgfs_dir, NULL, &ec_dbgfs_ops); + + return dbgfs_dir; +} + +#else + +static struct dentry *olpc_ec_setup_debugfs(void) +{ + return NULL; +} + +#endif /* CONFIG_DEBUG_FS */ + static int olpc_ec_probe(struct platform_device *pdev) { struct olpc_ec_priv *ec; @@ -160,6 +271,12 @@ static int olpc_ec_probe(struct platform_device *pdev) platform_set_drvdata(pdev, ec); err = ec_driver->probe ? 
ec_driver->probe(pdev) : 0; + if (err) { + ec_priv = NULL; + kfree(ec); + } else { + ec->dbgfs_dir = olpc_ec_setup_debugfs(); + } return err; } -- cgit v1.2.3-70-g09d2 From 1fcfd08bd0704e1888bd73153e8d2ca3640e22f2 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 17 Jul 2012 01:26:10 -0700 Subject: x86: OLPC: move s/r-related EC cmds to EC driver The new EC driver calls platform-specific suspend and resume hooks; run XO-1-specific EC commands from there, rather than deep in s/r code. If we attempt to run EC commands after the new EC driver has suspended, it is refused by the ec->suspended checks. Signed-off-by: Andres Salomon Acked-by: Paul Fox Reviewed-by: Thomas Gleixner --- arch/x86/platform/olpc/olpc-xo1-pm.c | 15 ------------- arch/x86/platform/olpc/olpc.c | 43 ++++++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index 8054b64ec4c..d75582d1aa5 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -52,16 +52,11 @@ EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_clear); static int xo1_power_state_enter(suspend_state_t pm_state) { unsigned long saved_sci_mask; - int r; /* Only STR is supported */ if (pm_state != PM_SUSPEND_MEM) return -EINVAL; - r = olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0); - if (r) - return r; - /* * Save SCI mask (this gets lost since PM1_EN is used as a mask for * wakeup events, which is not necessarily the same event set) @@ -77,16 +72,6 @@ static int xo1_power_state_enter(suspend_state_t pm_state) /* Restore SCI mask (using dword access to CS5536_PM1_EN) */ outl(saved_sci_mask, acpi_base + CS5536_PM1_STS); - /* Tell the EC to stop inhibiting SCIs */ - olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0); - - /* - * Tell the wireless module to restart USB communication. - * Must be done twice. 
- */ - olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0); - olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0); - return 0; } diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index ed41b437b37..27376081dde 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -263,11 +263,6 @@ int olpc_ec_sci_query(u16 *sci_value) } EXPORT_SYMBOL_GPL(olpc_ec_sci_query); -static int olpc_ec_suspend(struct platform_device *pdev) -{ - return olpc_ec_mask_write(ec_wakeup_mask); -} - static bool __init check_ofw_architecture(struct device_node *root) { const char *olpc_arch; @@ -339,9 +334,40 @@ static int olpc_xo1_ec_probe(struct platform_device *pdev) return 0; } +static int olpc_xo1_ec_suspend(struct platform_device *pdev) +{ + olpc_ec_mask_write(ec_wakeup_mask); + + /* + * Squelch SCIs while suspended. This is a fix for + * . + */ + return olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0); +} + +static int olpc_xo1_ec_resume(struct platform_device *pdev) +{ + /* Tell the EC to stop inhibiting SCIs */ + olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0); + + /* + * Tell the wireless module to restart USB communication. + * Must be done twice. 
+ */ + olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0); + olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0); + + return 0; +} static struct olpc_ec_driver ec_xo1_driver = { - .suspend = olpc_ec_suspend, + .probe = olpc_xo1_ec_probe, + .suspend = olpc_xo1_ec_suspend, + .resume = olpc_xo1_ec_resume, + .ec_cmd = olpc_xo1_ec_cmd, +}; + +static struct olpc_ec_driver ec_xo1_5_driver = { .probe = olpc_xo1_ec_probe, .ec_cmd = olpc_xo1_ec_cmd, }; @@ -354,7 +380,10 @@ static int __init olpc_init(void) return 0; /* register the XO-1 and 1.5-specific EC handler */ - olpc_ec_driver_register(&ec_xo1_driver, NULL); + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */ + olpc_ec_driver_register(&ec_xo1_driver, NULL); + else + olpc_ec_driver_register(&ec_xo1_5_driver, NULL); platform_device_register_simple("olpc-ec", -1, NULL, 0); /* assume B1 and above models always have a DCON */ -- cgit v1.2.3-70-g09d2 From 4b6486659a7defef82ea51b276024b3aa357fefc Mon Sep 17 00:00:00 2001 From: Bruce Rogers Date: Fri, 20 Jul 2012 10:44:24 -0600 Subject: KVM: x86: apply kvmclock offset to guest wall clock time When a guest migrates to a new host, the system time difference from the previous host is used in the updates to the kvmclock system time visible to the guest, resulting in a continuation of correct kvmclock based guest timekeeping. The wall clock component of the kvmclock provided time is currently not updated with this same time offset. Since the Linux guest caches the wall clock based time, this discrepancy is not noticed until the guest is rebooted. After reboot the guest's time calculations are off. This patch adjusts the wall clock by the kvmclock_offset, resulting in correct guest time after a reboot. 
Cc: Zachary Amsden Signed-off-by: Bruce Rogers Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59b59508ff0..42bce48f692 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -925,6 +925,10 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) */ getboottime(&boot); + if (kvm->arch.kvmclock_offset) { + struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); + boot = timespec_sub(boot, ts); + } wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; wc.version = version; -- cgit v1.2.3-70-g09d2 From a3170d2ec25f841bee1b52487693ac1a2f191ba6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 May 2012 21:16:35 -0400 Subject: um: switch UPT_SET_RETURN_VALUE and regs_return_value to pt_regs Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/um/kernel/process.c | 2 +- arch/um/kernel/skas/syscall.c | 2 +- arch/x86/um/asm/ptrace.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index ccb9a9d283f..f19ca615246 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -193,7 +193,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, if (current->thread.forking) { memcpy(&p->thread.regs.regs, &regs->regs, sizeof(p->thread.regs.regs)); - UPT_SET_SYSCALL_RETURN(&p->thread.regs.regs, 0); + PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0); if (sp != 0) REGS_SP(p->thread.regs.regs.gp) = sp; diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 05fbeb480e0..800551a29db 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -34,7 +34,7 @@ void handle_syscall(struct uml_pt_regs *r) result = -ENOSYS; else result = EXECUTE_SYSCALL(syscall, regs); - UPT_SET_SYSCALL_RETURN(r, result); + PT_REGS_SET_SYSCALL_RETURN(regs, result); syscall_trace(r, 1); } 
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index 950dfb7b841..e72cd0df5ba 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -30,10 +30,10 @@ #define profile_pc(regs) PT_REGS_IP(regs) #define UPT_RESTART_SYSCALL(r) (UPT_IP(r) -= 2) -#define UPT_SET_SYSCALL_RETURN(r, res) (UPT_AX(r) = (res)) +#define PT_REGS_SET_SYSCALL_RETURN(r, res) (PT_REGS_AX(r) = (res)) -static inline long regs_return_value(struct uml_pt_regs *regs) +static inline long regs_return_value(struct pt_regs *regs) { - return UPT_AX(regs); + return PT_REGS_AX(regs); } #endif /* __UM_X86_PTRACE_H */ -- cgit v1.2.3-70-g09d2 From eaf4ce6c5fed6b4c55f7efcd5fc3477435cab5e9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Aug 2012 15:59:58 -0700 Subject: x86-64, kcmp: The kcmp system call can be common We already use the same system call handler for i386 and x86-64, there is absolutely no reason x32 can't use the same system call, too. Signed-off-by: H. Peter Anvin Cc: H.J. Lu Cc: Cyrill Gorcunov Cc: v3.5 Link: http://lkml.kernel.org/n/tip-vwzk3qbcr3yjyxjg2j38vgy9@git.kernel.org --- arch/x86/syscalls/syscall_64.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 51171aeff0d..29aed7ac2c0 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -318,7 +318,7 @@ 309 common getcpu sys_getcpu 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev -312 64 kcmp sys_kcmp +312 common kcmp sys_kcmp # # x32-specific system call numbers start at 512 to avoid cache impact -- cgit v1.2.3-70-g09d2 From aa67f6096c19bcdb1951ef88be3cf3d2118809dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Aug 2012 16:48:03 +0300 Subject: KVM: VMX: Fix ds/es corruption on i386 with preemption Commit b2da15ac26a0c ("KVM: VMX: Optimize %ds, %es reload") broke i386 in the following scenario: vcpu_load 
... vmx_save_host_state vmx_vcpu_run (ds.rpl, es.rpl cleared by hardware) interrupt push ds, es # pushes bad ds, es schedule vmx_vcpu_put vmx_load_host_state reload ds, es (with __USER_DS) pop ds, es # of other thread's stack iret # other thread runs interrupt push ds, es schedule # back in vcpu thread pop ds, es # now with rpl=0 iret ... vcpu_put resume_userspace iret # clears ds, es due to mismatched rpl (instead of resume_userspace, we might return with SYSEXIT and then take an exception; when the exception IRETs we end up with cleared ds, es) Fix by avoiding the optimization on i386 and reloading ds, es on the lightweight exit path. Reported-by: Chris Clayron Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c39b60707e0..c00f03de1b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1488,13 +1488,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) loadsegment(ds, vmx->host_state.ds_sel); loadsegment(es, vmx->host_state.es_sel); } -#else - /* - * The sysexit path does not restore ds/es, so we must set them to - * a reasonable value ourselves. - */ - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); #endif reload_tss(); #ifdef CONFIG_X86_64 @@ -6370,6 +6363,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); +#ifndef CONFIG_X86_64 + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + * + * We can't defer this to vmx_load_host_state() since that function + * may be executed in interrupt context, which saves and restore segments + * around it, nullifying its effect. 
+ */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | (1 << VCPU_EXREG_RFLAGS) | (1 << VCPU_EXREG_CPL) -- cgit v1.2.3-70-g09d2 From 5bc6f9888db5739abfa0cae279b4b442e4db8049 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 30 Jul 2012 10:18:05 -0400 Subject: xen/p2m: Reserve 8MB of _brk space for P2M leafs when populating back. When we release pages back during bootup: Freeing 9d-100 pfn range: 99 pages freed Freeing 9cf36-9d0d2 pfn range: 412 pages freed Freeing 9f6bd-9f6bf pfn range: 2 pages freed Freeing 9f714-9f7bf pfn range: 171 pages freed Freeing 9f7e0-9f7ff pfn range: 31 pages freed Freeing 9f800-100000 pfn range: 395264 pages freed Released 395979 pages of unused memory We then try to populate those pages back. In the P2M tree however the space for those leafs must be reserved - as such we use extend_brk. We reserve 8MB of _brk space, which means we can fit over 1048576 PFNs - which is more than we should ever need. Without this, on certain compilation of the kernel we would hit: (XEN) domain_crash_sync called from entry.S (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000206 EM: 1 CONTEXT: pv guest (XEN) rax: ffffffff81a7c000 rbx: 000000000000003d rcx: 0000000000001000 (XEN) rdx: ffffffff81a7b000 rsi: 0000000000001000 rdi: 0000000000001000 (XEN) rbp: ffffffff81801cd8 rsp: ffffffff81801c98 r8: 0000000000100000 (XEN) r9: ffffffff81a7a000 r10: 0000000000000001 r11: 0000000000000003 (XEN) r12: 0000000000000004 r13: 0000000000000004 r14: 000000000000003d (XEN) r15: 00000000000001e8 cr0: 000000008005003b cr4: 00000000000006f0 (XEN) cr3: 0000000125803000 cr2: 0000000000000000 (XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e02b cs: e033 (XEN) Guest stack trace from rsp=ffffffff81801c98: .. which is extend_brk hitting a BUG_ON. 
Interestingly enough, most of the time we are not going to hit this b/c the _brk space is quite large (v3.5): ffffffff81a25000 B __brk_base ffffffff81e43000 B __brk_limit = ~4MB. vs earlier kernels (with this back-ported), the space is smaller: ffffffff81a25000 B __brk_base ffffffff81a7b000 B __brk_limit = 344 kBytes. where we would certainly hit this and hit extend_brk. Note that git commit c3d93f880197953f86ab90d9da4744e926b38e33 (xen: populate correct number of pages when across mem boundary (v2)) exposed this bug. [v1: Made it 8MB of _brk space instead of 4MB per Jan's suggestion] CC: stable@vger.kernel.org #only for 3.5 Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 64effdc6da9..b2e91d40a4c 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -194,6 +194,11 @@ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID * boundary violation will require three middle nodes. */ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); +/* When we populate back during bootup, the amount of pages can vary. The + * max we have is seen is 395979, but that does not mean it can't be more. + * But some machines can have 3GB I/O holes even. So lets reserve enough + * for 4GB of I/O and E820 holes. 
*/ +RESERVE_BRK(p2m_populated, PMD_SIZE * 4); static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); -- cgit v1.2.3-70-g09d2 From 095adbb6441172985f5ddc3b9e88cb3191bdeac4 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 31 Jul 2012 17:41:09 +0200 Subject: ACPI: Only count valid srat memory structures Otherwise you could run into: WARN_ON in numa_register_memblks(), because node_possible_map is zero References: https://bugzilla.novell.com/show_bug.cgi?id=757888 On this machine (ProLiant ML570 G3) the SRAT table contains: - No processor affinities - One memory affinity structure (which is set disabled) CC: Per Jessen CC: Andi Kleen Signed-off-by: Thomas Renninger Signed-off-by: Len Brown --- arch/ia64/kernel/acpi.c | 5 +++-- arch/x86/mm/srat.c | 15 ++++++++------- drivers/acpi/numa.c | 8 +++++--- include/linux/acpi.h | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 6f38b6120d9..440578850ae 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -497,7 +497,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) srat_num_cpus++; } -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { unsigned long paddr, size; @@ -512,7 +512,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) /* Ignore disabled entries */ if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) - return; + return -1; /* record this node in proximity bitmap */ pxm_bit_set(pxm); @@ -531,6 +531,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) p->size = size; p->nid = pxm; num_node_memblks++; + return 0; } void __init acpi_numa_arch_fixup(void) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 4599c3e8bcb..4ddf497ca65 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -142,23 +142,23 @@ static inline int save_add_info(void) {return 0;} #endif /* 
Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; int node, pxm; if (srat_disabled()) - return; + return -1; if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { bad_srat(); - return; + return -1; } if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; + return -1; if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; + return -1; start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; @@ -168,12 +168,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); bad_srat(); - return; + return -1; } if (numa_add_memblk(node, start, end) < 0) { bad_srat(); - return; + return -1; } node_set(node, numa_nodes_parsed); @@ -181,6 +181,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); + return 0; } void __init acpi_numa_arch_fixup(void) {} diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 2a6399345c8..cb31298ca68 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -237,6 +237,8 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, return 0; } +static int __initdata parsed_numa_memblks; + static int __init acpi_parse_memory_affinity(struct acpi_subtable_header * header, const unsigned long end) @@ -250,8 +252,8 @@ acpi_parse_memory_affinity(struct acpi_subtable_header * header, acpi_table_print_srat_entry(header); /* let architecture-dependent part to do it */ - acpi_numa_memory_affinity_init(memory_affinity); - + if (!acpi_numa_memory_affinity_init(memory_affinity)) + parsed_numa_memblks++; return 0; } @@ -306,7 +308,7 @@ int __init acpi_numa_init(void) if (cnt < 0) return cnt; - else if (cnt == 0) + else if 
(!parsed_numa_memblks) return -ENOENT; return 0; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3ad510b2528..4f2a7622450 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -96,7 +96,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); void acpi_numa_slit_init (struct acpi_table_slit *slit); void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); -void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); +int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void); #ifdef CONFIG_ACPI_HOTPLUG_CPU -- cgit v1.2.3-70-g09d2 From 439793d4b3c99e550daebd868bbd58967c93d0b3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 1 Aug 2012 17:01:42 +0300 Subject: KVM: x86: update KVM_SAVE_MSRS_BEGIN to correct value When MSR_KVM_PV_EOI_EN was added to msrs_to_save array KVM_SAVE_MSRS_BEGIN was not updated accordingly. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bce48f692..dce75b76031 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 9 +#define KVM_SAVE_MSRS_BEGIN 10 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, -- cgit v1.2.3-70-g09d2 From c6fd893da927c6cefb2ece22402765379921a834 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 31 Jul 2012 10:29:14 -0700 Subject: x86, avx: don't use avx instructions with "noxsave" boot param Clear AVX, AVX2 features along with clearing XSAVE feature bits, as part of the parsing "noxsave" parameter. 
Fixes the kernel boot panic with "noxsave" boot parameter. We could have checked cpu_has_osxsave along with cpu_has_avx etc, but Peter mentioned clearing the feature bits will be better for uses like static_cpu_has() etc. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1343755754.2041.2.camel@sbsiddha-desk.sc.intel.com Cc: # v3.5 Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 46d8786d655..a5fbc3c5fcc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -144,6 +144,8 @@ static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); return 1; } __setup("noxsave", x86_xsave_setup); -- cgit v1.2.3-70-g09d2 From 484d90eec884d814b005c9736bcf3fd018acba65 Mon Sep 17 00:00:00 2001 From: Andrew Boie Date: Fri, 10 Aug 2012 11:49:06 -0700 Subject: x86, build: Globally set -fno-pic GCC built with nonstandard options can enable -fpic by default. We never want this for 32-bit kernels and it will break the build. [ hpa: Notably the Android toolchain apparently does this. ] Change-Id: Iaab7d66e598b1c65ac4a4f0229eca2cd3d0d2898 Signed-off-by: Andrew Boie Link: http://lkml.kernel.org/r/1344624546-29691-1-git-send-email-andrew.p.boie@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/Makefile | 4 ++++ arch/x86/boot/Makefile | 2 +- arch/x86/realmode/rm/Makefile | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b0c5276861e..682e9c210ba 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,6 +27,10 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return + # Never want PIC in a 32-bit kernel, prevent breakage with GCC built + # with nonstandard options + KBUILD_CFLAGS += -fno-pic + # prevent gcc from keeping the stack 16 byte aligned KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 5a747dd884d..f7535bedc33 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -57,7 +57,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ + -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-toplevel-reorder,\ $(call cc-option, -fno-unit-at-a-time)) \ diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index b2d534cab25..88692871823 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -72,7 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/../../boot/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ + -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-toplevel-reorder,\ $(call cc-option, -fno-unit-at-a-time)) \ -- cgit v1.2.3-70-g09d2 From cffa59baa5f1cf3e3e9e172697db48912471531c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Aug 2012 12:55:27 +0200 
Subject: perf, x86: Fix uncore_types_exit section mismatch Fix the following section mismatch: WARNING: arch/x86/kernel/cpu/built-in.o(.text+0x7ad9): Section mismatch in reference from the function uncore_types_exit() to the function .init.text:uncore_type_exit() The function uncore_types_exit() references the function __init uncore_type_exit(). This is often because uncore_types_exit lacks a __init annotation or the annotation of uncore_type_exit is wrong. caused by 14371cce03c2 ("perf: Add generic PCI uncore PMU device support"). Cc: Zheng Yan Cc: Ingo Molnar Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-8-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 7563fda9f03..a7ccd68aa13 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2373,7 +2373,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->attr_groups[1] = NULL; } -static void uncore_types_exit(struct intel_uncore_type **types) +static void __init uncore_types_exit(struct intel_uncore_type **types) { int i; for (i = 0; types[i]; i++) -- cgit v1.2.3-70-g09d2 From ebb6cc03596cc89c89670473282ea46573feb34f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Aug 2012 13:11:21 +0800 Subject: perf/x86: Fixes for Nehalem-EX uncore driver This patch includes following fixes and update: - Only some events in the Sbox and Mbox can use the match/mask registers, add code to check this. - The format definitions for xbr_mm_cfg and xbr_match registers in the Rbox are wrong, xbr_mm_cfg should use 32 bits, xbr_match should use 64 bits. - Cleanup the Rbox code. 
Compute the addresses extra registers in the enable_event function instead of the hw_config function. This simplifies the code in nhmex_rbox_alter_er(). Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344229882-3907-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 195 ++++++++++++-------------- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 1 + 2 files changed, 87 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index a7ccd68aa13..84434e2a676 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -796,7 +796,6 @@ static struct intel_uncore_type *nhm_msr_uncores[] = { DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); -DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63"); DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); @@ -1032,24 +1031,22 @@ static struct intel_uncore_type nhmex_uncore_bbox = { static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) { - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) { - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - } else { - reg1->config = ~0ULL; - reg2->config = ~0ULL; - } + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; if (box->pmu->pmu_idx == 0) reg1->reg = NHMEX_S0_MSR_MM_CFG; else 
reg1->reg = NHMEX_S1_MSR_MM_CFG; - reg1->idx = 0; - + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; return 0; } @@ -1059,8 +1056,8 @@ static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event_extra *reg1 = &hwc->extra_reg; struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - wrmsrl(reg1->reg, 0); - if (reg1->config != ~0ULL || reg2->config != ~0ULL) { + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); wrmsrl(reg1->reg + 1, reg1->config); wrmsrl(reg1->reg + 2, reg2->config); wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); @@ -1074,7 +1071,6 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = { &format_attr_edge.attr, &format_attr_inv.attr, &format_attr_thresh8.attr, - &format_attr_mm_cfg.attr, &format_attr_match.attr, &format_attr_mask.attr, NULL, @@ -1264,7 +1260,8 @@ again: } /* for the match/mask registers */ - if ((uncore_box_is_fake(box) || !reg2->alloc) && + if (reg2->idx != EXTRA_REG_NONE && + (uncore_box_is_fake(box) || !reg2->alloc) && !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) goto fail; @@ -1278,7 +1275,8 @@ again: if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) nhmex_mbox_alter_er(event, idx[0], true); reg1->alloc |= alloc; - reg2->alloc = 1; + if (reg2->idx != EXTRA_REG_NONE) + reg2->alloc = 1; } return NULL; fail: @@ -1342,9 +1340,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event struct extra_reg *er; unsigned msr; int reg_idx = 0; - - if (WARN_ON_ONCE(reg1->idx != -1)) - return -EINVAL; /* * The mbox events may require 2 extra MSRs at the most. 
But only * the lower 32 bits in these MSRs are significant, so we can use @@ -1355,11 +1350,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; - if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) || - er->idx == __BITS_VALUE(reg1->idx, 1, 8)) - continue; - if (WARN_ON_ONCE(reg_idx >= 2)) - return -EINVAL; msr = er->msr + type->msr_offset * box->pmu->pmu_idx; if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) @@ -1368,6 +1358,8 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event /* always use the 32~63 bits to pass the PLD config */ if (er->idx == EXTRA_REG_NHMEX_M_PLD) reg_idx = 1; + else if (WARN_ON_ONCE(reg_idx > 0)) + return -EINVAL; reg1->idx &= ~(0xff << (reg_idx * 8)); reg1->reg &= ~(0xffff << (reg_idx * 16)); @@ -1376,17 +1368,21 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event reg1->config = event->attr.config1; reg_idx++; } - /* use config2 to pass the filter config */ - reg2->idx = EXTRA_REG_NHMEX_M_FILTER; - if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) - reg2->config = event->attr.config2; - else - reg2->config = ~0ULL; - if (box->pmu->pmu_idx == 0) - reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; - else - reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; - + /* + * The mbox only provides ability to perform address matching + * for the PLD events. 
+ */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } return 0; } @@ -1422,34 +1418,36 @@ static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct per wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), nhmex_mbox_shared_reg_config(box, idx)); - wrmsrl(reg2->reg, 0); - if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, - reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & - (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } } wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); } -DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); -DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); -DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); -DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); -DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); -DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); 
-DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); static struct attribute *nhmex_uncore_mbox_formats_attr[] = { &format_attr_count_mode.attr, @@ -1458,7 +1456,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = { &format_attr_flag_mode.attr, &format_attr_inc_sel.attr, &format_attr_set_flag_sel.attr, - &format_attr_filter_cfg.attr, + &format_attr_filter_cfg_en.attr, &format_attr_filter_match.attr, &format_attr_filter_mask.attr, &format_attr_dsp.attr, @@ -1513,7 +1511,7 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) struct hw_perf_event_extra *reg1 = &hwc->extra_reg; int port; - /* adjust the main event selector */ + /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { reg1->idx--; hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; @@ -1522,29 +1520,17 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, 
struct perf_event *event) hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; } - /* adjust address or config of extra register */ + /* adjust extra register config */ port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { - case 0: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); - break; - case 1: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); - break; case 2: - /* the 8~15 bits to the 0~7 bits */ + /* shift the 8~15 bits to the 0~7 bits */ reg1->config >>= 8; break; case 3: - /* the 0~7 bits to the 8~15 bits */ + /* shift the 0~7 bits to the 8~15 bits */ reg1->config <<= 8; break; - case 4: - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); - break; - case 5: - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); - break; }; } @@ -1671,7 +1657,7 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int port, idx; + int idx; idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> NHMEX_R_PMON_CTL_EV_SEL_SHIFT; @@ -1681,27 +1667,11 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event reg1->idx = idx; reg1->config = event->attr.config1; - port = idx / 6 + box->pmu->pmu_idx * 4; - idx %= 6; - switch (idx) { - case 0: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); - break; - case 1: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); - break; - case 2: - case 3: - reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port); - break; + switch (idx % 6) { case 4: case 5: - if (idx == 4) - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); - else - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); - reg2->config = event->attr.config2; hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; break; }; return 0; @@ -1727,28 +1697,34 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct 
per struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx, er_idx; + int idx, port; - idx = reg1->idx % 6; - er_idx = idx; - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; - switch (idx) { + switch (idx % 6) { case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; case 1: - wrmsrl(reg1->reg, reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); break; case 2: case 3: - wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx)); + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + nhmex_rbox_shared_reg_config(box, 2 + (idx / 6) * 5)); break; case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; case 5: - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, hwc->config >> 32); - wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); break; }; @@ -1756,8 +1732,8 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); } -DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); @@ -2303,6 +2279,7 @@ int uncore_pmu_event_init(struct perf_event *event) 
event->hw.idx = -1; event->hw.last_tag = ~0ULL; event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; if (event->attr.config == UNCORE_FIXED_EVENT) { /* no fixed counter */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index c9e5dc56630..8384e9b543b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -230,6 +230,7 @@ #define NHMEX_S1_MSR_MASK 0xe5a #define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 /* NHM-EX Mbox */ #define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 -- cgit v1.2.3-70-g09d2 From cb37af77124e8532e6ae3f9ca332593ba423b5f8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Aug 2012 13:11:22 +0800 Subject: perf/x86: Add Intel Westmere-EX uncore support The Westmere-EX uncore is similar to the Nehalem-EX uncore. The differences are: - Westmere-EX uncore has 10 instances of Cbox. The MSRs for Cbox8 and Cbox9 in the Westmere-EX aren't contiguous with Cbox 0~7. - The fvid field in the ZDP_CTL_FVC register in the Mbox is different. It's 5 bits in the Nehalem-EX, 6 bits in the Westmere-EX. 
Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344229882-3907-3-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 56 ++++++++++++++++++++++----- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 45 ++++++++++----------- 2 files changed, 68 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 84434e2a676..0a5571080e7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -901,16 +901,21 @@ static struct attribute_group nhmex_uncore_cbox_format_group = { .attrs = nhmex_uncore_cbox_formats_attr, }; +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + static struct intel_uncore_type nhmex_uncore_cbox = { .name = "cbox", .num_counters = 6, - .num_boxes = 8, + .num_boxes = 10, .perf_ctr_bits = 48, .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, .event_mask = NHMEX_PMON_RAW_EVENT_MASK, .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_C_MSR_OFFSET, + .msr_offsets = nhmex_cbox_msr_offsets, .pair_ctr_ctl = 1, .ops = &nhmex_uncore_ops, .format_group = &nhmex_uncore_cbox_format_group @@ -1138,6 +1143,9 @@ static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { EVENT_EXTRA_END }; +/* Nehalem-EX or Westmere-EX ? 
*/ +bool uncore_nhmex; + static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) { struct intel_uncore_extra_reg *er; @@ -1167,18 +1175,29 @@ static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 return false; /* mask of the shared fields */ - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; raw_spin_lock_irqsave(&er->lock, flags); /* add mask of the non-shared field if it's in use */ - if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) - mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { atomic_add(1 << (idx * 8), &er->ref); - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | - NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); er->config &= ~mask; er->config |= (config & mask); ret = true; @@ -1212,7 +1231,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) /* get the non-shared control bits and shift them */ idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); if (new_idx > orig_idx) { idx = new_idx - orig_idx; config <<= 3 * idx; @@ -1222,6 +1244,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) } /* add the shared control bits back */ + if (uncore_nhmex) + config |= 
NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; if (modify) { /* adjust the main event selector */ @@ -1480,6 +1506,12 @@ static struct uncore_event_desc nhmex_uncore_mbox_events[] = { { /* end: all zeroes */ }, }; +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + static struct intel_uncore_ops nhmex_uncore_mbox_ops = { NHMEX_UNCORE_OPS_COMMON_INIT(), .enable_event = nhmex_mbox_msr_enable_event, @@ -2791,7 +2823,13 @@ static int __init uncore_cpu_init(void) snbep_uncore_cbox.num_boxes = max_cores; msr_uncores = snbep_msr_uncores; break; - case 46: + case 46: /* Nehalem-EX */ + uncore_nhmex = true; + case 47: /* Westmere-EX aka. Xeon E7 */ + if (!uncore_nhmex) + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > max_cores) + nhmex_uncore_cbox.num_boxes = max_cores; msr_uncores = nhmex_msr_uncores; break; default: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 8384e9b543b..5b81c1856aa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -276,18 +276,12 @@ NHMEX_M_PMON_CTL_INC_SEL_MASK | \ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) - -#define NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK 0x1f -#define NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK (0x7 << 5) -#define NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK (0x7 << 8) -#define NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR (1 << 23) -#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK \ - (NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR) +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 
<< 23)) #define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (12 + 3 * (n))) + /* * use the 9~13 bits to select event If the 7th bit is not set, * otherwise use the 19~21 bits to select event. @@ -369,6 +363,7 @@ struct intel_uncore_type { unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -486,29 +481,31 @@ unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) return idx * 8 + box->pmu->type->perf_ctr; } -static inline -unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? + pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) { if (!box->pmu->type->box_ctl) return 0; - return box->pmu->type->box_ctl + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); } -static inline -unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) { if (!box->pmu->type->fixed_ctl) return 0; - return box->pmu->type->fixed_ctl + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); } -static inline -unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) { - return box->pmu->type->fixed_ctr + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); } static inline @@ -516,7 +513,7 
@@ unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) { return box->pmu->type->event_ctl + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + uncore_msr_box_offset(box); } static inline @@ -524,7 +521,7 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) { return box->pmu->type->perf_ctr + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + uncore_msr_box_offset(box); } static inline -- cgit v1.2.3-70-g09d2 From 26a4f3c08de49c1437a7b7f97693cf22d8c31656 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Aug 2012 11:52:34 +0300 Subject: perf/x86: disable PEBS on a guest entry. If PMU counter has PEBS enabled it is not enough to disable counter on a guest entry since PEBS memory write can overshoot guest entry and corrupt guest memory. Disabling PEBS during guest entry solves the problem. Tested-by: David Ahern Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120809085234.GI3341@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 382366977d4..7f2739e03e7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1522,8 +1522,16 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. 
+ */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; - *nr = 1; + *nr = 2; return arr; } -- cgit v1.2.3-70-g09d2 From f1c6300183dbf5b9da25988e13f6f25a9e27151b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 8 Aug 2012 12:16:52 -0700 Subject: x86, apic: fix broken legacy interrupts in the logical apic mode Recent commit 332afa656e76458ee9cf0f0d123016a0658539e4 cleaned up a workaround that updates irq_cfg domain for legacy irq's that are handled by the IO-APIC. This was assuming that the recent changes in assign_irq_vector() were sufficient to remove the workaround. But this broke a couple of AMD platforms. One of them seems to be sending interrupts to the offline cpu's, resulting in spurious "No irq handler for vector xx (irq -1)" messages when those cpu's come online. And the other platform seems to always send the interrupt to the last logical CPU (cpu-7). Recent changes had an unintended side effect of using only logical cpu-0 in the IO-APIC RTE (during boot for the legacy interrupts) and this broke the legacy interrupts not getting routed to the cpu-7 on the AMD platform, resulting in a boot hang. For now, reintroduce the removed workaround, (essentially not allowing the vector to change for legacy irq's when io-apic starts to handle the irq. Which also addressed the unintended side effect of just specifying cpu-0 in the IO-APIC RTE for those irq's during boot). Reported-and-tested-by: Robert Richter Reported-and-tested-by: Borislav Petkov Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1344453412.29170.5.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a6c64aaddf9..c265593ec2c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1356,6 +1356,16 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; + /* + * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC + * can handle this irq and the apic driver is finialized at this point, + * update the cfg->domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && + cpumask_equal(cfg->domain, cpumask_of(0))) + apic->vector_allocation_domain(0, cfg->domain, + apic->target_cpus()); + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.3-70-g09d2 From f026cfa82f628db24b8cea41b9d6202af104cecb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Aug 2012 09:53:38 -0700 Subject: Revert "x86-64/efi: Use EFI to deal with platform wall clock" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bacef661acdb634170a8faddbc1cf28e8f8b9eee. This commit has been found to cause serious regressions on a number of ASUS machines at the least. We probably need to provide a 1:1 map in addition to the EFI virtual memory map in order for this to work. Signed-off-by: H. 
Peter Anvin Reported-and-bisected-by: Jérôme Carretero Cc: Jan Beulich Cc: Matt Fleming Cc: Matthew Garrett Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120805172903.5f8bb24c@zougloub.eu --- arch/x86/mm/pageattr.c | 10 ++++------ arch/x86/platform/efi/efi.c | 30 ++++++++++++++++++++++++++---- include/linux/efi.h | 2 ++ init/main.c | 8 ++++---- 4 files changed, 36 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 931930a9616..a718e0d2350 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,13 +919,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it, in the - * error case, and during early boot (for EFI) we fall back - * to cpa_flush_all (which uses wbinvd): + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): */ - if (early_boot_irqs_disabled) - __cpa_flush_all((void *)(long)cache); - else if (!ret && cpu_has_clflush) { + if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 2dc29f51e75..92660edaa1e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -234,7 +234,22 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -static int efi_set_rtc_mmss(unsigned long nowtime) +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), + virt_to_phys(tc)); + efi_call_phys_epilog(); + spin_unlock_irqrestore(&rtc_lock, 
flags); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes; efi_status_t status; @@ -263,7 +278,7 @@ static int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -static unsigned long efi_get_time(void) +unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; @@ -606,13 +621,18 @@ static int __init efi_runtime_init(void) } /* * We will only need *early* access to the following - * EFI runtime service before set_virtual_address_map + * two EFI runtime services before set_virtual_address_map * is invoked. */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) runtime->set_virtual_address_map; - + /* + * Make efi_get_time can be called before entering + * virtual mode. + */ + efi.get_time = phys_efi_get_time; early_iounmap(runtime, sizeof(efi_runtime_services_t)); return 0; @@ -700,10 +720,12 @@ void __init efi_init(void) efi_enabled = 0; return; } +#ifdef CONFIG_X86_32 if (efi_native) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } +#endif #if EFI_DEBUG print_efi_memmap(); diff --git a/include/linux/efi.h b/include/linux/efi.h index 103adc6d7e3..ec45ccd8708 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -503,6 +503,8 @@ extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); +extern unsigned long efi_get_time(void); +extern int efi_set_rtc_mmss(unsigned long nowtime); extern void efi_reserve_boot_services(void); extern struct efi_memory_map memmap; diff --git a/init/main.c b/init/main.c index e60679de61c..b28673087ac 100644 --- a/init/main.c +++ b/init/main.c @@ -461,10 +461,6 @@ static void __init mm_init(void) percpu_init_late(); pgtable_cache_init(); 
vmalloc_init(); -#ifdef CONFIG_X86 - if (efi_enabled) - efi_enter_virtual_mode(); -#endif } asmlinkage void __init start_kernel(void) @@ -606,6 +602,10 @@ asmlinkage void __init start_kernel(void) calibrate_delay(); pidmap_init(); anon_vma_init(); +#ifdef CONFIG_X86 + if (efi_enabled) + efi_enter_virtual_mode(); +#endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); -- cgit v1.2.3-70-g09d2 From ca08649eb5dd30f11a5a8fe8659b48899b7ea6a1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 16 Aug 2012 11:31:27 -0400 Subject: Revert "xen PVonHVM: move shared_info to MMIO before kexec" This reverts commit 00e37bdb0113a98408de42db85be002f21dbffd3. During shutdown of PVHVM guests with more than 2 VCPUs on certain machines we can hit the race where the replaced shared_info is not replaced fast enough and the PV time clock retries reading the same area over and over without any success and is stuck in an infinite loop. Acked-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 118 +++++---------------------------------------- arch/x86/xen/suspend.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- drivers/xen/platform-pci.c | 15 ------ include/xen/events.h | 2 - 5 files changed, 13 insertions(+), 126 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a6f8acbdfc9..f1814fc2cb7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -1472,130 +1471,38 @@ asmlinkage void __init xen_start_kernel(void) #endif } -#ifdef CONFIG_XEN_PVHVM -/* - * The pfn containing the shared_info is located somewhere in RAM. This - * will cause trouble if the current kernel is doing a kexec boot into a - * new kernel. The new kernel (and its startup code) can not know where - * the pfn is, so it can not reserve the page. 
The hypervisor will - * continue to update the pfn, and as a result memory corruption occours - * in the new kernel. - * - * One way to work around this issue is to allocate a page in the - * xen-platform pci device's BAR memory range. But pci init is done very - * late and the shared_info page is already in use very early to read - * the pvclock. So moving the pfn from RAM to MMIO is racy because some - * code paths on other vcpus could access the pfn during the small - * window when the old pfn is moved to the new pfn. There is even a - * small window were the old pfn is not backed by a mfn, and during that - * time all reads return -1. - * - * Because it is not known upfront where the MMIO region is located it - * can not be used right from the start in xen_hvm_init_shared_info. - * - * To minimise trouble the move of the pfn is done shortly before kexec. - * This does not eliminate the race because all vcpus are still online - * when the syscore_ops will be called. But hopefully there is no work - * pending at this point in time. Also the syscore_op is run last which - * reduces the risk further. 
- */ - -static struct shared_info *xen_hvm_shared_info; - -static void xen_hvm_connect_shared_info(unsigned long pfn) +void __ref xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page = 0; + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = pfn; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); -} -static void xen_hvm_set_shared_info(struct shared_info *sip) -{ - int cpu; - - HYPERVISOR_shared_info = sip; + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_set_shared_info is run at resume time too and + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and * in that case multiple vcpus might be online. 
*/ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -/* Reconnect the shared_info pfn to a mfn */ -void xen_hvm_resume_shared_info(void) -{ - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); -} - -#ifdef CONFIG_KEXEC -static struct shared_info *xen_hvm_shared_info_kexec; -static unsigned long xen_hvm_shared_info_pfn_kexec; - -/* Remember a pfn in MMIO space for kexec reboot */ -void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) -{ - xen_hvm_shared_info_kexec = sip; - xen_hvm_shared_info_pfn_kexec = pfn; -} - -static void xen_hvm_syscore_shutdown(void) -{ - struct xen_memory_reservation reservation = { - .domid = DOMID_SELF, - .nr_extents = 1, - }; - unsigned long prev_pfn; - int rc; - - if (!xen_hvm_shared_info_kexec) - return; - - prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; - set_xen_guest_handle(reservation.extent_start, &prev_pfn); - - /* Move pfn to MMIO, disconnects previous pfn from mfn */ - xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); - - /* Update pointers, following hypercall is also a memory barrier */ - xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); - - /* Allocate new mfn for previous pfn */ - do { - rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc == 0) - msleep(123); - } while (rc == 0); - - /* Make sure the previous pfn is really connected to a (new) mfn */ - BUG_ON(rc != 1); -} - -static struct syscore_ops xen_hvm_syscore_ops = { - .shutdown = xen_hvm_syscore_shutdown, -}; -#endif - -/* Use a pfn in RAM, may move to MMIO before kexec. 
*/ -static void __init xen_hvm_init_shared_info(void) -{ - /* Remember pointer for resume */ - xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); - xen_hvm_set_shared_info(xen_hvm_shared_info); -} - +#ifdef CONFIG_XEN_PVHVM static void __init init_hvm_pv_info(void) { int major, minor; @@ -1646,9 +1553,6 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); xen_hvm_init_shared_info(); -#ifdef CONFIG_KEXEC - register_syscore_ops(&xen_hvm_syscore_ops); -#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index ae8a00c39de..45329c8c226 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_resume_shared_info(); + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e4329e04e0..202d4c15015 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_resume_shared_info(void); +void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index d4c50d63acb..97ca359ae2b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -101,19 +101,6 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } -static void __devinit prepare_shared_info(void) -{ -#ifdef CONFIG_KEXEC - unsigned long addr; - struct shared_info *hvm_shared_info; - - addr = alloc_xen_mmio(PAGE_SIZE); - hvm_shared_info = ioremap(addr, PAGE_SIZE); - memset(hvm_shared_info, 0, 
PAGE_SIZE); - xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT); -#endif -} - static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -151,8 +138,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, platform_mmio = mmio_addr; platform_mmiolen = mmio_len; - prepare_shared_info(); - if (!xen_have_vector_callback) { ret = xen_allocate_irq(pdev); if (ret) { diff --git a/include/xen/events.h b/include/xen/events.h index 9c641deb65d..04399b28e82 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -58,8 +58,6 @@ void notify_remote_via_irq(int irq); void xen_irq_resume(void); -void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn); - /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); void xen_set_irq_pending(int irq); -- cgit v1.2.3-70-g09d2 From 250a41e0ecc433cdd553a364d0fc74c766425209 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 09:27:35 -0400 Subject: xen/p2m: Reuse existing P2M leafs if they are filled with 1:1 PFNs or INVALID. If a P2M leaf is completely packed with INVALID_P2M_ENTRY or with 1:1 PFNs (so IDENTITY_FRAME type PFNs), we can swap the P2M leaf with either a p2m_missing or p2m_identity respectively. The old page (which was created via extend_brk or was grafted on from the mfn_list) can be re-used for setting new PFNs. This also means we can remove git commit: 5bc6f9888db5739abfa0cae279b4b442e4db8049 xen/p2m: Reserve 8MB of _brk space for P2M leafs when populating back which tried to fix this, and make the amount that is required to be reserved much smaller. CC: stable@vger.kernel.org # for 3.5 only. 
Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b2e91d40a4c..d4b25546325 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -196,9 +196,11 @@ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); /* When we populate back during bootup, the amount of pages can vary. The * max we have is seen is 395979, but that does not mean it can't be more. - * But some machines can have 3GB I/O holes even. So lets reserve enough - * for 4GB of I/O and E820 holes. */ -RESERVE_BRK(p2m_populated, PMD_SIZE * 4); + * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle + * it can re-use Xen provided mfn_list array, so we only need to allocate at + * most three P2M top nodes. */ +RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -575,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } + +/* + * Skim over the P2M tree looking at pages that are either filled with + * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and + * replace the P2M leaf with a p2m_missing or p2m_identity. + * Stick the old page in the new P2M tree location. 
+ */ +bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +{ + unsigned topidx; + unsigned mididx; + unsigned ident_pfns; + unsigned inv_pfns; + unsigned long *p2m; + unsigned long *mid_mfn_p; + unsigned idx; + unsigned long pfn; + + /* We only look when this entails a P2M middle layer */ + if (p2m_index(set_pfn)) + return false; + + for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + p2m = p2m_top[topidx][mididx]; + if (!p2m) + continue; + + if ((p2m == p2m_missing) || (p2m == p2m_identity)) + continue; + + if ((unsigned long)p2m == INVALID_P2M_ENTRY) + continue; + + ident_pfns = 0; + inv_pfns = 0; + for (idx = 0; idx < P2M_PER_PAGE; idx++) { + /* IDENTITY_PFNs are 1:1 */ + if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) + ident_pfns++; + else if (p2m[idx] == INVALID_P2M_ENTRY) + inv_pfns++; + else + break; + } + if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) + goto found; + } + return false; +found: + /* Found one, replace old with p2m_identity or p2m_missing */ + p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); + /* And the other for save/restore.. */ + mid_mfn_p = p2m_top_mfn_p[topidx]; + /* NOTE: Even if it is a p2m_identity it should still be point to + * a page filled with INVALID_P2M_ENTRY entries. */ + mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); + + /* Reset where we want to stick the old page in. 
*/ + topidx = p2m_top_index(set_pfn); + mididx = p2m_mid_index(set_pfn); + + /* This shouldn't happen */ + if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) + early_alloc_p2m(set_pfn); + + if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) + return false; + + p2m_init(p2m); + p2m_top[topidx][mididx] = p2m; + mid_mfn_p = p2m_top_mfn_p[topidx]; + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!early_alloc_p2m(pfn)) return false; + if (early_can_reuse_p2m_middle(pfn, mfn)) + return __set_phys_to_machine(pfn, mfn); + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) return false; -- cgit v1.2.3-70-g09d2 From 515c7af85ed92696c311c53d53cb4898ff32d784 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sat, 18 Aug 2012 16:11:37 -0400 Subject: x32: Use compat shims for {g,s}etsockopt Some of the arguments to {g,s}etsockopt are passed in userland pointers. If we try to use the 64bit entry point, we end up sometimes failing. 
For example, dhcpcd doesn't run in x32: # dhcpcd eth0 dhcpcd[1979]: version 5.5.6 starting dhcpcd[1979]: eth0: broadcasting for a lease dhcpcd[1979]: eth0: open_socket: Invalid argument dhcpcd[1979]: eth0: send_raw_packet: Bad file descriptor The code in particular is getting back EINVAL when doing: struct sock_fprog pf; setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &pf, sizeof(pf)); Diving into the kernel code, we can see: include/linux/filter.h: struct sock_fprog { unsigned short len; struct sock_filter __user *filter; }; net/core/sock.c: case SO_ATTACH_FILTER: ret = -EINVAL; if (optlen == sizeof(struct sock_fprog)) { struct sock_fprog fprog; ret = -EFAULT; if (copy_from_user(&fprog, optval, sizeof(fprog))) break; ret = sk_attach_filter(&fprog, sk); } break; arch/x86/syscalls/syscall_64.tbl: 54 common setsockopt sys_setsockopt 55 common getsockopt sys_getsockopt So for x64, sizeof(sock_fprog) is 16 bytes. For x86/x32, it's 8 bytes. This comes down to the pointer being 32bit for x32, which means we need to do structure size translation. But since x32 comes in directly to sys_setsockopt, it doesn't get translated like x86. After changing the syscall table and rebuilding glibc with the new kernel headers, dhcp runs fine in an x32 userland. Oddly, it seems like Linus noted the same thing during the initial port, but I guess that was missed/lost along the way: https://lkml.org/lkml/2011/8/26/452 [ hpa: tagging for -stable since this is an ABI fix. ] Bugzilla: https://bugs.gentoo.org/423649 Reported-by: Mads Signed-off-by: Mike Frysinger Link: http://lkml.kernel.org/r/1345320697-15713-1-git-send-email-vapier@gentoo.org Cc: H. J. Lu Cc: v3.4..v3.5 Signed-off-by: H. 
Peter Anvin --- arch/x86/syscalls/syscall_64.tbl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 29aed7ac2c0..a582bfed95b 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -60,8 +60,8 @@ 51 common getsockname sys_getsockname 52 common getpeername sys_getpeername 53 common socketpair sys_socketpair -54 common setsockopt sys_setsockopt -55 common getsockopt sys_getsockopt +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt 56 common clone stub_clone 57 common fork stub_fork 58 common vfork stub_vfork @@ -353,3 +353,5 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt -- cgit v1.2.3-70-g09d2 From eb48c071464757414538c68a6033c8f8c15196f8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2012 16:15:52 -0700 Subject: mm: hugetlbfs: correctly populate shared pmd Each page mapped in a process's address space must be correctly accounted for in _mapcount. Normally the rules for this are straightforward but hugetlbfs page table sharing is different. The page table pages at the PMD level are reference counted while the mapcount remains the same. If this accounting is wrong, it causes bugs like this one reported by Larry Woodman: kernel BUG at mm/filemap.c:135! invalid opcode: 0000 [#1] SMP CPU 22 Modules linked in: bridge stp llc sunrpc binfmt_misc dcdbas microcode pcspkr acpi_pad acpi] Pid: 18001, comm: mpitest Tainted: G W 3.3.0+ #4 Dell Inc. 
PowerEdge R620/07NDJ2 RIP: 0010:[] [] __delete_from_page_cache+0x15d/0x170 Process mpitest (pid: 18001, threadinfo ffff880428972000, task ffff880428b5cc20) Call Trace: delete_from_page_cache+0x40/0x80 truncate_hugepages+0x115/0x1f0 hugetlbfs_evict_inode+0x18/0x30 evict+0x9f/0x1b0 iput_final+0xe3/0x1e0 iput+0x3e/0x50 d_kill+0xf8/0x110 dput+0xe2/0x1b0 __fput+0x162/0x240 During fork(), copy_hugetlb_page_range() detects if huge_pte_alloc() shared page tables with the check dst_pte == src_pte. The logic is if the PMD page is the same, they must be shared. This assumes that the sharing is between the parent and child. However, if the sharing is with a different process entirely then this check fails as in this diagram: parent | ------------>pmd src_pte----------> data page ^ other--------->pmd--------------------| ^ child-----------| dst_pte For this situation to occur, it must be possible for Parent and Other to have faulted and failed to share page tables with each other. This is possible due to the following style of race. PROC A PROC B copy_hugetlb_page_range copy_hugetlb_page_range src_pte == huge_pte_offset src_pte == huge_pte_offset !src_pte so no sharing !src_pte so no sharing (time passes) hugetlb_fault hugetlb_fault huge_pte_alloc huge_pte_alloc huge_pmd_share huge_pmd_share LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) pmd_alloc pmd_alloc LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) These two processes are now pointing to the same data page but are not sharing page tables because the opportunity was missed. When either process later forks, the src_pte == dst_pte check is potentially insufficient. As the check falls through, the wrong PTE information is copied in (harmless but wrong) and the mapcount is bumped for a page mapped by a shared page table leading to the BUG_ON.
This patch addresses the issue by moving pmd_alloc into huge_pmd_share which guarantees that the shared pud is populated in the same critical section as pmd. This also means that huge_pte_offset test in huge_pmd_share is serialized correctly now which in turn means that the success of the sharing will be higher as the racing tasks see the pud and pmd populated together. Race identified and changelog written mostly by Mel Gorman. [akpm@linux-foundation.org: attempt to make the huge_pmd_share() comment comprehensible, clean up coding style] Reported-by: Larry Woodman Tested-by: Larry Woodman Reviewed-by: Mel Gorman Signed-off-by: Michal Hocko Reviewed-by: Rik van Riel Cc: David Gibson Cc: Ken Chen Cc: Cong Wang Cc: Hillf Danton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f6679a7fb8c..b91e4851242 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) } /* - * search for a shareable pmd page for hugetlb. + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_mutex section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing.
*/ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +static pte_t * +huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; @@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; + pte_t *pte; if (!vma_shareable(vma, addr)) - return; + return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { @@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: + pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); + return pte; } /* @@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else { BUG_ON(sz != PMD_SIZE); if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); -- cgit v1.2.3-70-g09d2 From 83be4ffa1acbcd529b771f4d2e639b15e2b7957e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 14 Aug 2012 14:47:37 -0700 Subject: x86/spinlocks: Fix comment in spinlock.h This comment is no longer true. We support up to 2^16 CPUs because __ticket_t is an u16 if NR_CPUS is larger than 256. 
Signed-off-by: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b315a33867f..33692eaabab 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -12,8 +12,7 @@ * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. * - * These are fair FIFO ticket locks, which are currently limited to 256 - * CPUs. + * These are fair FIFO ticket locks, which support up to 2^16 CPUs. * * (the type definitions are in asm/spinlock_types.h) */ -- cgit v1.2.3-70-g09d2 From 2530cd4f448935c74eeb49f29559589928e4b2f0 Mon Sep 17 00:00:00 2001 From: "Liu, Chuansheng" Date: Tue, 14 Aug 2012 06:55:01 +0000 Subject: x86/fixup_irq: Use cpu_online_mask instead of cpu_all_mask When one CPU is going down and this CPU is the last one in irq affinity, current code is setting cpu_all_mask as the new affinity for that irq. But for some systems (such as in Medfield Android mobile) the firmware sends the interrupt to each CPU in the irq affinity mask, averaged, and cpu_all_mask includes all potential CPUs, i.e. offline ones as well. So replace cpu_all_mask with cpu_online_mask. 
Signed-off-by: liu chuansheng Acked-by: Yanmin Zhang Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A137286@SHSMSX101.ccr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7ad683d7864..d44f7829968 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -270,7 +270,7 @@ void fixup_irqs(void) if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - affinity = cpu_all_mask; + affinity = cpu_online_mask; } chip = irq_data_get_irq_chip(data); -- cgit v1.2.3-70-g09d2 From cb09cad44f07044d9810f18f6f9a6a6f3771f979 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 22 Aug 2012 13:03:48 +0300 Subject: x86/alternatives: Fix p6 nops on non-modular kernels Probably a leftover from the early days of self-patching, p6nops are marked __initconst_or_module, which causes them to be discarded in a non-modular kernel. If something later triggers patching, it will overwrite kernel code with garbage. Reported-by: Tomas Racek Signed-off-by: Avi Kivity Cc: Michael Tokarev Cc: Borislav Petkov Cc: Marcelo Tosatti Cc: qemu-devel@nongnu.org Cc: Anthony Liguori Cc: H. 
Peter Anvin Cc: Alan Cox Cc: Alan Cox Link: http://lkml.kernel.org/r/5034AE84.90708@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index afb7ff79a29..ced4534baed 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -165,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = #endif #ifdef P6_NOP1 -static const unsigned char __initconst_or_module p6nops[] = +static const unsigned char p6nops[] = { P6_NOP1, P6_NOP2, -- cgit v1.2.3-70-g09d2 From 35f2d16bb9ace0fb2671b8232839944ad9057c6f Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 20 Aug 2012 18:35:39 +0900 Subject: KVM: MMU: Fix mmu_shrink() so that it can free mmu pages as intended Although the possible race described in commit 85b7059169e128c57a3a8a3e588fb89cb2031da1 KVM: MMU: fix shrinking page from the empty mmu was correct, the real cause of that issue was a more trivial bug of mmu_shrink() introduced by commit 1952639665e92481c34c34c3e2a71bf3e66ba362 KVM: MMU: do not iterate over all VMs in mmu_shrink() Here is the bug: if (kvm->arch.n_used_mmu_pages > 0) { if (!nr_to_scan--) break; continue; } We skip VMs whose n_used_mmu_pages is not zero and try to shrink others: in other words we try to shrink empty ones by mistake. This patch reverses the logic so that mmu_shrink() can free pages from the first VM whose n_used_mmu_pages is not zero. Note that we also add comments explaining the role of nr_to_scan which is not practically important now, hoping this will be improved in the future. 
Signed-off-by: Takuya Yoshikawa Cc: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 01ca0042393..7fbd0d273ea 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4112,17 +4112,22 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) int idx; LIST_HEAD(invalid_list); + /* + * Never scan more than sc->nr_to_scan VM instances. + * Will not hit this condition practically since we do not try + * to shrink more than one VM and it is very unlikely to see + * !n_used_mmu_pages so many times. + */ + if (!nr_to_scan--) + break; /* * n_used_mmu_pages is accessed without holding kvm->mmu_lock * here. We may skip a VM instance errorneosly, but we do not * want to shrink a VM that only started to populate its MMU * anyway. */ - if (kvm->arch.n_used_mmu_pages > 0) { - if (!nr_to_scan--) - break; + if (!kvm->arch.n_used_mmu_pages) continue; - } idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); -- cgit v1.2.3-70-g09d2 From 5ad105e569c45dcfad50d724c61d5061248be755 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Aug 2012 14:34:31 +0300 Subject: KVM: x86 emulator: use stack size attribute to mask rsp in stack ops The sub-register used to access the stack (sp, esp, or rsp) is not determined by the address size attribute like other memory references, but by the stack segment's B bit (if not in x86_64 mode). Fix by using the existing stack_mask() to figure out the correct mask. This long-existing bug was exposed by a combination of a27685c33acccce (emulate invalid guest state by default), which causes many more instructions to be emulated, and a seabios change (possibly a bug) which causes the high 16 bits of esp to become polluted across calls to real mode software interrupts. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97d9a9914ba..a3b57a27be8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) return address_mask(ctxt, reg); } +static void masked_increment(ulong *reg, ulong mask, int inc) +{ + assign_masked(reg, *reg + inc, mask); +} + static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) { + ulong mask; + if (ctxt->ad_bytes == sizeof(unsigned long)) - *reg += inc; + mask = ~0UL; else - *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); + mask = ad_mask(ctxt); + masked_increment(reg, mask, inc); +} + +static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) +{ + masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); } static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + rsp_increment(ctxt, -bytes); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, bytes); @@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, int rc; struct segmented_address addr; - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); + rsp_increment(ctxt, len); 
return rc; } @@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], - ctxt->op_bytes); + rsp_increment(ctxt, ctxt->op_bytes); --reg; } @@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); + rsp_increment(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } -- cgit v1.2.3-70-g09d2 From 36bf50d7697be18c6bfd0401e037df10bff1e573 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 31 Jul 2012 15:41:45 +0200 Subject: x86, microcode, AMD: Fix broken ucode patch size check This issue was recently observed on an AMD C-50 CPU where a patch of maximum size was applied. Commit be62adb49294 ("x86, microcode, AMD: Simplify ucode verification") added current_size in get_matching_microcode(). This is calculated as size of the ucode patch + 8 (ie. size of the header). Later this is compared against the maximum possible ucode patch size for a CPU family. And of course this fails if the patch has already maximum size. Cc: [3.3+] Signed-off-by: Andreas Herrmann Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-1-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/microcode_amd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 8a2ce8fd41c..82746f942cd 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, unsigned int *current_size) { struct microcode_header_amd *mc_hdr; - unsigned int actual_size; + unsigned int actual_size, patch_size; u16 equiv_cpu_id; /* size of the current patch we're staring at */ - *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + patch_size = *(u32 *)(ucode_ptr + 4); + *current_size = patch_size + SECTION_HDR_SIZE; equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) @@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, /* * now that the header looks sane, verify its size */ - actual_size = verify_ucode_size(cpu, *current_size, leftover_size); + actual_size = verify_ucode_size(cpu, patch_size, leftover_size); if (!actual_size) return 0; -- cgit v1.2.3-70-g09d2 From c96aae1f7f393387d160211f60398d58463a7e65 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 16:43:28 -0400 Subject: xen/setup: Fix one-off error when adding for-balloon PFNs to the P2M. When we are finished with returning PFNs to the hypervisor, then populate it back, and also mark the E820 MMIO and E820 gaps as IDENTITY_FRAMEs, we then call P2M to set areas that can be used for ballooning. We were off by one, and ended up overwriting a P2M entry that most likely was an IDENTITY_FRAME.
For example: 1-1 mapping on 40000->40200 1-1 mapping on bc558->bc5ac 1-1 mapping on bc5b4->bc8c5 1-1 mapping on bc8c6->bcb7c 1-1 mapping on bcd00->100000 Released 614 pages of unused memory Set 277889 page(s) to 1-1 mapping Populating 40200-40466 pfn range: 614 pages added => here we set from 40466 up to bc559 P2M tree to be INVALID_P2M_ENTRY. We should have done it up to bc558. The end result is that if anybody is trying to construct a PTE for PFN bc558 they end up with ~PAGE_PRESENT. CC: stable@vger.kernel.org Reported-by-and-Tested-by: Andre Przywara Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ead85576d54..d11ca11d14f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); xen_max_p2m_pfn = PFN_DOWN(start + size); + for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + continue; + WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); - for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } } static unsigned long __init xen_do_chunk(unsigned long start, -- cgit v1.2.3-70-g09d2 From 1d92128fe9e30c2340283361957a840f108e4abf Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Aug 2012 18:00:29 +0300 Subject: KVM: x86: fix KVM_GET_MSR for PV EOI KVM_GET_MSR was missing support for PV EOI, which is needed for migration. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dce75b76031..148ed666e31 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2000,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_STEAL_TIME: data = vcpu->arch.st.msr_val; break; + case MSR_KVM_PV_EOI_EN: + data = vcpu->arch.pv_eoi.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: -- cgit v1.2.3-70-g09d2 From 50e900417b8096939d12a46848f965e27a905e36 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 4 Sep 2012 15:45:17 -0400 Subject: xen/p2m: Fix one-off error in checking the P2M tree directory. We would traverse the full P2M top directory (from 0->MAX_DOMAIN_PAGES inclusive) when trying to figure out whether we can re-use some of the P2M middle leafs. Which meant that if the kernel was compiled with MAX_DOMAIN_PAGES=512 we would try to use the 512th entry. 
Fortunately for us the p2m_top_index has a check for this: BUG_ON(pfn >= MAX_P2M_PFN); which we hit and saw this: (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.1.2-OVM x86_64 debug=n Tainted: C ]---- (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000212 EM: 1 CONTEXT: pv guest (XEN) rax: ffffffff81db5000 rbx: ffffffff81db4000 rcx: 0000000000000000 (XEN) rdx: 0000000000480211 rsi: 0000000000000000 rdi: ffffffff81db4000 (XEN) rbp: ffffffff81793db8 rsp: ffffffff81793d38 r8: 0000000008000000 (XEN) r9: 4000000000000000 r10: 0000000000000000 r11: ffffffff81db7000 (XEN) r12: 0000000000000ff8 r13: ffffffff81df1ff8 r14: ffffffff81db6000 (XEN) r15: 0000000000000ff8 cr0: 000000008005003b cr4: 00000000000026f0 (XEN) cr3: 0000000661795000 cr2: 0000000000000000 Fixes-Oracle-Bug: 14570662 CC: stable@vger.kernel.org # only for v3.5 Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index d4b25546325..76ba0e97e53 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -599,7 +599,7 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_ if (p2m_index(set_pfn)) return false; - for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { topidx = p2m_top_index(pfn); if (!p2m_top[topidx]) -- cgit v1.2.3-70-g09d2 From ce7184bdbd38d920fb515266fbbdc585ad2e5493 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 24 Aug 2012 08:55:13 +0000 Subject: xen: fix logical error in tlb flushing While TLB_FLUSH_ALL gets passed as 'end' argument to flush_tlb_others(), the Xen code was made to check its 'start' parameter. That may incorrectly set op.cmd to MMUEXT_INVLPG_MULTI instead of MMUEXT_TLB_FLUSH_MULTI, which causes some pages not to be flushed from the TLB. This patch fixes this issue.
Reported-by: Jan Beulich Signed-off-by: Alex Shi Acked-by: Jan Beulich Tested-by: Yongjie Ren Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b65a76133f4..5141d808e75 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; args->op.arg1.linear_addr = start; } -- cgit v1.2.3-70-g09d2 From 73090f8993a40a2f67fed1ab866a928c68cd3765 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:37 +0100 Subject: x86: Remove base argument from x86_init.paging.pagetable_setup_start We either use swapper_pg_dir or the argument is unused. Preparatory patch to simplify platform pagetable setup further. 
Signed-off-by: Attilio Rao Ackedb-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-2-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 6 +++--- arch/x86/include/asm/x86_init.h | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/x86_init.c | 3 ++- arch/x86/mm/init_32.c | 4 ++-- arch/x86/xen/mmu.c | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 013286a10c2..e02b875e692 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -303,11 +303,11 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 -extern void native_pagetable_setup_start(pgd_t *base); +extern void native_pagetable_setup_start(void); extern void native_pagetable_setup_done(pgd_t *base); #else -#define native_pagetable_setup_start x86_init_pgd_noop -#define native_pagetable_setup_done x86_init_pgd_noop +#define native_pagetable_setup_start x86_init_pgd_start_noop +#define native_pagetable_setup_done x86_init_pgd_done_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 38155f66714..782ba0c4b26 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -85,7 +85,7 @@ struct x86_init_mapping { * @pagetable_setup_done: platform specific post paging_init() call */ struct x86_init_paging { - void (*pagetable_setup_start)(pgd_t *base); + void (*pagetable_setup_start)(void); void (*pagetable_setup_done)(pgd_t *base); }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80e1b9..90cbbe00adc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -961,7 +961,7 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - 
x86_init.paging.pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(); paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9f3167e891e..3b88493ec7c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,8 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_noop(pgd_t *unused) { } +void __init x86_init_pgd_start_noop(void) { } +void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 575d86f85ce..c4aa1b25ba3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -445,10 +445,10 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(pgd_t *base) +void __init native_pagetable_setup_start(void) { unsigned long pfn, va; - pgd_t *pgd; + pgd_t *pgd, *base = swapper_pg_dir; pud_t *pud; pmd_t *pmd; pte_t *pte; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5141d808e75..32e66c8d014 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(pgd_t *base) +static void __init xen_pagetable_setup_start(void) { } -- cgit v1.2.3-70-g09d2 From 7737b215ad0f94d20a87d98315da9f6cadaf35c9 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:38 +0100 Subject: x86: Rename pagetable_setup_start() to pagetable_init() In preparation for unifying the pagetable_setup_start() and pagetable_setup_done() setup functions, rename appropriately all the infrastructure related to pagetable_setup_start(). 
Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-3-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/include/asm/x86_init.h | 4 ++-- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/x86_init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/xen/mmu.c | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e02b875e692..0c01e0730f7 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -303,10 +303,10 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 -extern void native_pagetable_setup_start(void); +extern void native_pagetable_init(void); extern void native_pagetable_setup_done(pgd_t *base); #else -#define native_pagetable_setup_start x86_init_pgd_start_noop +#define native_pagetable_init x86_init_pgd_init_noop #define native_pagetable_setup_done x86_init_pgd_done_noop #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 782ba0c4b26..24084b2b3a4 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -81,11 +81,11 @@ struct x86_init_mapping { /** * struct x86_init_paging - platform specific paging functions - * @pagetable_setup_start: platform specific pre paging_init() call + * @pagetable_init: platform specific paging initialization call * @pagetable_setup_done: platform specific post paging_init() call */ struct x86_init_paging { - void (*pagetable_setup_start)(void); + void (*pagetable_init)(void); void (*pagetable_setup_done)(pgd_t *base); }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 90cbbe00adc..61b7d9827af 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -961,7 +961,7 @@ void
__init setup_arch(char **cmdline_p) kvmclock_init(); #endif - x86_init.paging.pagetable_setup_start(); + x86_init.paging.pagetable_init(); paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3b88493ec7c..0e1e950113b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_start_noop(void) { } +void __init x86_init_pgd_init_noop(void) { } void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } @@ -69,7 +69,7 @@ struct x86_init_ops x86_init __initdata = { }, .paging = { - .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_init = native_pagetable_init, .pagetable_setup_done = native_pagetable_setup_done, }, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c4aa1b25ba3..0e38e0e8804 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -445,7 +445,7 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(void) +void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; @@ -493,7 +493,7 @@ void __init native_pagetable_setup_done(pgd_t *base) * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may * or may not be based in swapper_pg_dir. In any case, - * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * paravirt_pagetable_init() will set up swapper_pg_dir * appropriately for the rest of the initialization to work. 
* * In general, pagetable_init() assumes that the pagetable may already diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 32e66c8d014..624efbefb94 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(void) +static void __init xen_pagetable_init(void) { } @@ -2068,7 +2068,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; - x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; -- cgit v1.2.3-70-g09d2 From 843b8ed2ec598aae5e3516b21957ede62a070e36 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:39 +0100 Subject: x86: Move paging_init() call to x86_init.paging.pagetable_init() Move the paging_init() call to the platform specific pagetable_init() function, so we can get rid of the extra pagetable_setup_done() function pointer. 
Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-4-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 2 +- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/x86_init.c | 1 - arch/x86/mm/init_32.c | 1 + arch/x86/xen/mmu.c | 1 + 5 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 0c01e0730f7..c93cb8eec7c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -306,7 +306,7 @@ extern void native_pagetable_reserve(u64 start, u64 end); extern void native_pagetable_init(void); extern void native_pagetable_setup_done(pgd_t *base); #else -#define native_pagetable_init x86_init_pgd_init_noop +#define native_pagetable_init paging_init #define native_pagetable_setup_done x86_init_pgd_done_noop #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 61b7d9827af..315fd24131e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -962,7 +962,6 @@ void __init setup_arch(char **cmdline_p) #endif x86_init.paging.pagetable_init(); - paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); if (boot_cpu_data.cpuid_level >= 0) { diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 0e1e950113b..5f2478fb3d6 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,6 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_init_noop(void) { } void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e38e0e8804..e35b4b17189 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -475,6 +475,7 @@ void __init 
native_pagetable_init(void) pte_clear(NULL, va, pte); } paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); + paging_init(); } void __init native_pagetable_setup_done(pgd_t *base) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 624efbefb94..c2ff7ea37b8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1176,6 +1176,7 @@ static void xen_exit_mmap(struct mm_struct *mm) static void __init xen_pagetable_init(void) { + paging_init(); } static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -- cgit v1.2.3-70-g09d2 From c711288727a62f74d48032e56e51333dd104bf58 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:40 +0100 Subject: x86: xen: Cleanup and remove x86_init.paging.pagetable_setup_done() At this stage x86_init.paging.pagetable_setup_done is only used in the XEN case. Move its content in the x86_init.paging.pagetable_init setup function and remove the now unused x86_init.paging.pagetable_setup_done remaining infrastructure. Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-5-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 2 -- arch/x86/include/asm/x86_init.h | 2 -- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/x86_init.c | 2 -- arch/x86/mm/init_32.c | 4 ---- arch/x86/xen/mmu.c | 13 ++++--------- 6 files changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index c93cb8eec7c..db8fec6d295 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -304,10 +304,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); -extern void native_pagetable_setup_done(pgd_t *base); #else #define native_pagetable_init paging_init -#define 
native_pagetable_setup_done x86_init_pgd_done_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 24084b2b3a4..995ea5c3fbf 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -82,11 +82,9 @@ struct x86_init_mapping { /** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call - * @pagetable_setup_done: platform specific post paging_init() call */ struct x86_init_paging { void (*pagetable_init)(void); - void (*pagetable_setup_done)(pgd_t *base); }; /** diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 315fd24131e..4f165479c45 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -962,7 +962,6 @@ void __init setup_arch(char **cmdline_p) #endif x86_init.paging.pagetable_init(); - x86_init.paging.pagetable_setup_done(swapper_pg_dir); if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 5f2478fb3d6..7a3d075a814 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,6 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } @@ -69,7 +68,6 @@ struct x86_init_ops x86_init __initdata = { .paging = { .pagetable_init = native_pagetable_init, - .pagetable_setup_done = native_pagetable_setup_done, }, .timers = { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e35b4b17189..4f04db15002 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -478,10 +478,6 @@ void __init native_pagetable_init(void) paging_init(); } -void __init native_pagetable_setup_done(pgd_t *base) -{ -} - /* * Build a proper pagetable for the kernel mappings. 
Up until this * point, we've been running on some set of pagetables constructed by diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c2ff7ea37b8..7a769b7526c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,9 +1174,13 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } +static void xen_post_allocator_init(void); + static void __init xen_pagetable_init(void) { paging_init(); + xen_setup_shared_info(); + xen_post_allocator_init(); } static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) @@ -1193,14 +1197,6 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) } } -static void xen_post_allocator_init(void); - -static void __init xen_pagetable_setup_done(pgd_t *base) -{ - xen_setup_shared_info(); - xen_post_allocator_init(); -} - static void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; @@ -2070,7 +2066,6 @@ void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); -- cgit v1.2.3-70-g09d2 From 64282278989d5b0398dcb3ba7904cb00c621dc35 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:41 +0100 Subject: x86: Document x86_init.paging.pagetable_init() Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-6-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 995ea5c3fbf..57693498519 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -81,7 +81,10 @@ struct x86_init_mapping { /** * struct x86_init_paging - platform 
specific paging functions - * @pagetable_init: platform specific paging initialization call + * @pagetable_init: platform specific paging initialization call to setup + * the kernel pagetables and prepare accessors functions. + * Callback must call paging_init(). Called once after the + * direct mapping for phys memory is available. */ struct x86_init_paging { void (*pagetable_init)(void); -- cgit v1.2.3-70-g09d2