diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable.h | 11 | ||||
-rw-r--r-- | arch/x86/include/asm/preempt.h | 11 | ||||
-rw-r--r-- | arch/x86/include/asm/timer.h | 77 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.c | 16 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.h | 15 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 318 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/gup.c | 13 | ||||
-rw-r--r-- | arch/x86/platform/uv/tlb_uv.c | 66 | ||||
-rw-r--r-- | arch/x86/syscalls/syscall_32.tbl | 2 | ||||
-rw-r--r-- | arch/x86/syscalls/syscall_64.tbl | 2 |
15 files changed, 383 insertions, 164 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e903c71f7e6..0952ecd60ec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,6 +26,7 @@ config X86 select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3d199945870..bbc8b12fa44 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -452,9 +452,16 @@ static inline int pte_present(pte_t a) } #define pte_accessible pte_accessible -static inline int pte_accessible(pte_t a) +static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { - return pte_flags(a) & _PAGE_PRESENT; + if (pte_flags(a) & _PAGE_PRESENT) + return true; + + if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + mm_tlb_flush_pending(mm)) + return true; + + return false; } static inline int pte_hidden(pte_t pte) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 8729723636f..c8b051933b1 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -8,6 +8,12 @@ DECLARE_PER_CPU(int, __preempt_count); /* + * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such + * that a decrement hitting 0 means we can and should reschedule. + */ +#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) + +/* * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users * that think a non-zero value indicates we cannot preempt. */ @@ -74,6 +80,11 @@ static __always_inline void __preempt_count_sub(int val) __this_cpu_add_4(__preempt_count, -val); } +/* + * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule + * a decrement which hits zero means we have no preempt_count and should + * reschedule. + */ static __always_inline bool __preempt_count_dec_and_test(void) { GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0eb5d0..3de54ef0aea 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -4,6 +4,7 @@ #include <linux/pm.h> #include <linux/percpu.h> #include <linux/interrupt.h> +#include <linux/math64.h> #define TICK_SIZE (tick_nsec / 1000) @@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) +/* + * We use the full linear equation: f(x) = a + b*x, in order to allow + * a continuous function in the face of dynamic freq changes. * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC + * Continuity means that when our frequency changes our slope (b); we want to + * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. * - * And since SC is a constant power of two, we can convert the div - * into a shift. + * Without an offset (a) the above would not be possible. * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - * - * In: - * - * ns = cycles * cyc2ns_scale / SC - * - * Although we may still have enough bits to store the value of ns, - * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, - * leading to an incorrect result. - * - * To avoid this, we can decompose 'cycles' into quotient and remainder - * of division by SC. Then, - * - * ns = (quot * SC + rem) * cyc2ns_scale / SC - * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC - * - * - sqazi@google.com + * See the comment near cycles_2_ns() for details on how we compute (b). */ - -DECLARE_PER_CPU(unsigned long, cyc2ns); -DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); - -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline unsigned long long __cycles_2_ns(unsigned long long cyc) -{ - int cpu = smp_processor_id(); - unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); - return ns; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = __cycles_2_ns(cyc); - local_irq_restore(flags); - - return ns; -} +struct cyc2ns_data { + u32 cyc2ns_mul; + u32 cyc2ns_shift; + u64 cyc2ns_offset; + u32 __count; + /* u32 hole */ +}; /* 24 bytes -- do not grow */ + +extern struct cyc2ns_data *cyc2ns_read_begin(void); +extern void cyc2ns_read_end(struct cyc2ns_data *); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bca023bdd6b..8bc79cddd9a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ea04b342c02..1a439c047ff 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e132931614..b88645191fe 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,21 +1883,27 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + struct cyc2ns_data *data; + userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; + data = cyc2ns_read_begin(); + userpg->cap_user_time = 1; - userpg->time_mult = this_cpu_read(cyc2ns); - userpg->time_shift = CYC2NS_SCALE_FACTOR; - userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; userpg->cap_user_time_zero = 1; - userpg->time_zero = this_cpu_read(cyc2ns_offset); + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fd00bb29425..c1a861829d8 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -262,11 +262,20 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) +/* + * We define the end marker as having a weight of -1 + * to enable blacklisting of events using a counter bitmask + * of zero and thus a weight of zero. + * The end marker has a weight that cannot possibly be + * obtained from counting the bits in the bitmask. + */ +#define EVENT_CONSTRAINT_END { .weight = -1 } +/* + * Check for end marker with weight == -1 + */ #define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->weight; (e)++) + for ((e) = (c); (e)->weight != -1; (e)++) /* * Extra registers for specific events. diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d48f56..6377fb28b95 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@ #include <linux/clocksource.h> #include <linux/percpu.h> #include <linux/timex.h> +#include <linux/static_key.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; +static struct static_key __use_tsc = STATIC_KEY_INIT; + int tsc_clocksource_reliable; + +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ + struct cyc2ns_data *head; /* 48 + 8 = 56 */ + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ + struct cyc2ns_data *head; + + preempt_disable(); + + head = this_cpu_read(cyc2ns.head); + /* + * Ensure we observe the entry when we observe the pointer to it. + * matches the wmb from cyc2ns_write_end(). + */ + smp_read_barrier_depends(); + head->__count++; + barrier(); + + return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ + barrier(); + /* + * If we're the outer most nested read; update the tail pointer + * when we're done. This notifies possible pending writers + * that we've observed the head pointer and that the other + * entry is now free. + */ + if (!--head->__count) { + /* + * x86-TSO does not reorder writes with older reads; + * therefore once this write becomes visible to another + * cpu, we must be finished reading the cyc2ns_data. + * + * matches with cyc2ns_write_begin(). + */ + this_cpu_write(cyc2ns.tail, head); + } + preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + struct cyc2ns_data *data = c2n->data; + + if (data == c2n->head) + data++; + + /* XXX send an IPI to @cpu in order to guarantee a read? */ + + /* + * When we observe the tail write from cyc2ns_read_end(), + * the cpu must be done with that entry and its safe + * to start writing to it. + */ + while (c2n->tail == data) + cpu_relax(); + + return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + /* + * Ensure the @data writes are visible before we publish the + * entry. Matches the data-depencency in cyc2ns_read_begin(). + */ + smp_wmb(); + + ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ + data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = 0; + data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + cyc2ns_data_init(&c2n->data[0]); + cyc2ns_data_init(&c2n->data[1]); + + c2n->head = c2n->data; + c2n->tail = c2n->data; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + struct cyc2ns_data *data, *tail; + unsigned long long ns; + + /* + * See cyc2ns_read_*() for details; replicated in order to avoid + * an extra few instructions that came with the abstraction. + * Notable, it allows us to only do the __count and tail update + * dance when its actually needed. + */ + + preempt_disable(); + data = this_cpu_read(cyc2ns.head); + tail = this_cpu_read(cyc2ns.tail); + + if (likely(data == tail)) { + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + } else { + data->__count++; + + barrier(); + + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + barrier(); + + if (!--data->__count) + this_cpu_write(cyc2ns.tail, data); + } + preempt_enable(); + + return ns; +} + +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now; + struct cyc2ns_data *data; + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (!cpu_khz) + goto done; + + data = cyc2ns_write_begin(cpu); + + rdtscll(tsc_now); + ns_now = cycles_2_ns(tsc_now); + + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + cyc2ns_write_end(cpu, data); + +done: + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} /* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) { - u64 this_offset; + u64 tsc_now; /* * Fall back to jiffies if there's no TSC available: @@ -53,16 +285,16 @@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (unlikely(tsc_disabled)) { + if (!static_key_false(&__use_tsc)) { /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* read the Time Stamp Counter: */ - rdtscll(this_offset); + rdtscll(tsc_now); /* return the value in ns */ - return __cycles_2_ns(this_offset); + return cycles_2_ns(tsc_now); } /* We need to define a real function for sched_clock, to override the @@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable) + if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; local_irq_save(flags); - __this_cpu_write(cyc2ns_offset, 0); + /* + * We're comming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + offset = cyc2ns_suspend - sched_clock(); - for_each_possible_cpu(cpu) - per_cpu(cyc2ns_offset, cpu) = offset; + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } local_irq_restore(flags); } @@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - sched_clock_stable = 0; + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ @@ -995,14 +1187,18 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + cyc2ns_init(cpu); set_cyc2ns_scale(cpu_khz, cpu); + } if (tsc_disabled > 0) return; /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + static_key_slow_inc(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dec48bfaddb..1673940cf9c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1350,6 +1350,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) return; } + if (!kvm_vcpu_is_bsp(apic->vcpu)) + value &= ~MSR_IA32_APICBASE_BSP; + vcpu->arch.apic_base = value; + /* update jump label if enable bit changes */ if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) @@ -1359,10 +1363,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) recalculate_apic_map(vcpu->kvm); } - if (!kvm_vcpu_is_bsp(apic->vcpu)) - value &= ~MSR_IA32_APICBASE_BSP; - - vcpu->arch.apic_base = value; if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) { u32 id = kvm_apic_id(apic); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b2fe1c252f3..da7837e1349 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8283,8 +8283,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - if (nested_cpu_has_ept(vmcs12)) - nested_ept_uninit_mmu_context(vcpu); + nested_ept_uninit_mmu_context(vcpu); kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dd74e46828c..0596e8e0cc1 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_t pte = gup_get_pte(ptep); struct page *page; + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_numa(pte)) { + pte_unmap(ptep); + return 0; + } + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; @@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_numa(pmd)) + return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; } else { diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index efe4d722039..dfe605ac1bc 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) return; } -static inline unsigned long cycles_2_us(unsigned long long cyc) +/* + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative + * number, not an absolute. It converts a duration in cycles to a duration in + * ns. + */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc) { + struct cyc2ns_data *data = cyc2ns_read_begin(); unsigned long long ns; - unsigned long us; - int cpu = smp_processor_id(); - ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; + ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + + cyc2ns_read_end(data); + return ns; +} + +/* + * The reverse of the above; converts a duration in ns to a duration in cycles. + */ +static inline unsigned long long ns_2_cycles(unsigned long long ns) +{ + struct cyc2ns_data *data = cyc2ns_read_begin(); + unsigned long long cyc; + + cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + + cyc2ns_read_end(data); + return cyc; +} + +static inline unsigned long cycles_2_us(unsigned long long cyc) +{ + return cycles_2_ns(cyc) / NSEC_PER_USEC; +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + return ns_2_cycles(sec * NSEC_PER_SEC); +} + +static inline unsigned long long usec_2_cycles(unsigned long usec) +{ + return ns_2_cycles(usec * NSEC_PER_USEC); } /* @@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc, bcp, try); } -static inline cycles_t sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Our retries are blocked by all destination sw ack resources being * in use, and a timeout is pending. In that case hardware immediately @@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data) { } -static inline unsigned long long usec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Display the statistics thru /proc/sgi_uv/ptc_statistics * 'data' points to the cpu number diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index aabfb8380a1..96bc506ac6d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -357,3 +357,5 @@ 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 349 i386 kcmp sys_kcmp 350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 38ae65dfd14..a12bddc7cce 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -320,6 +320,8 @@ 311 64 process_vm_writev sys_process_vm_writev 312 common kcmp sys_kcmp 313 common finit_module sys_finit_module +314 common sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr # # x32-specific system call numbers start at 512 to avoid cache impact |