From a312b37b2a212fd2e227d1d6321f903b91b65ec7 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:23 -0700 Subject: x86/paravirt: call paravirt_pagetable_setup_{start, done} Call paravirt_pagetable_setup_{start,done} These paravirt_ops functions were not being called on x86_64. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel/paravirt.c') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c1..2963ab5d91e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -373,6 +374,9 @@ struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#else + .pagetable_setup_start = paravirt_nop, + .pagetable_setup_done = paravirt_nop, #endif .read_cr2 = native_read_cr2, -- cgit v1.2.3-70-g09d2 From 74d4affde8feb8d5bdebf7fba8e90e4eae3b7b1d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:50 -0700 Subject: x86/paravirt: add hooks for spinlock operations Ticket spinlocks have absolutely ghastly worst-case performance characteristics in a virtual environment. If there is any contention for physical CPUs (ie, there are more runnable vcpus than cpus), then ticket locks can cause the system to end up spending 90+% of its time spinning. The problem is that (v)cpus waiting on a ticket spinlock will be granted access to the lock in strict order they got their tickets. If the hypervisor scheduler doesn't give the vcpus time in that order, they will burn timeslices waiting for the scheduler to give the right vcpu some time. In the worst case it could take O(n^2) vcpu scheduler timeslices for everyone waiting on the lock to get it, not counting new cpus trying to take the lock while the log-jam is sorted out. These hooks allow a paravirt backend to replace the spinlock implementation. At the very least, this could revert the implementation back to the old lock algorithm, which allows the next scheduled vcpu to take the lock, and has basically fairly good performance. It also allows the spinlocks to take advantages of the hypervisor features to make locks more efficient (spin and block, for example). The cost to native execution is an extra direct call when using a spinlock function. There's no overhead if CONFIG_PARAVIRT is turned off. The lock structure is fixed at a single "unsigned int", initialized to zero, but the spinlock implementation can use it as it wishes. Thanks to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around" for pointing out this problem. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 10 ++++++++ include/asm-x86/paravirt.h | 37 +++++++++++++++++++++++++++ include/asm-x86/spinlock.h | 55 +++++++++++++++++++++++++++++----------- include/asm-x86/spinlock_types.h | 2 +- 4 files changed, 88 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel/paravirt.c') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 2963ab5d91e..f3381686870 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -124,6 +124,7 @@ static void *get_call_destination(u8 type) .pv_irq_ops = pv_irq_ops, .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, + .pv_lock_ops = pv_lock_ops, }; return *((void **)&tmpl + type); } @@ -450,6 +451,15 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; +struct pv_lock_ops pv_lock_ops = { + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +}; + EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index eef8095a09d..feb6bb66c5e 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -326,6 +326,15 @@ struct pv_mmu_ops { unsigned long phys, pgprot_t flags); }; +struct raw_spinlock; +struct pv_lock_ops { + int (*spin_is_locked)(struct raw_spinlock *lock); + int (*spin_is_contended)(struct raw_spinlock *lock); + void (*spin_lock)(struct raw_spinlock *lock); + int (*spin_trylock)(struct raw_spinlock *lock); + void (*spin_unlock)(struct raw_spinlock *lock); +}; + /* This contains all the paravirt structures: we get a convenient * number for each function using the offset which we use to indicate * what to patch. */ @@ -336,6 +345,7 @@ struct paravirt_patch_template { struct pv_irq_ops pv_irq_ops; struct pv_apic_ops pv_apic_ops; struct pv_mmu_ops pv_mmu_ops; + struct pv_lock_ops pv_lock_ops; }; extern struct pv_info pv_info; @@ -345,6 +355,7 @@ extern struct pv_cpu_ops pv_cpu_ops; extern struct pv_irq_ops pv_irq_ops; extern struct pv_apic_ops pv_apic_ops; extern struct pv_mmu_ops pv_mmu_ops; +extern struct pv_lock_ops pv_lock_ops; #define PARAVIRT_PATCH(x) \ (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) @@ -1374,6 +1385,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, void _paravirt_nop(void); #define paravirt_nop ((void *)_paravirt_nop) +static inline int __raw_spin_is_locked(struct raw_spinlock *lock) +{ + return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); +} + +static inline int __raw_spin_is_contended(struct raw_spinlock *lock) +{ + return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); +} + +static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) +{ + return PVOP_VCALL1(pv_lock_ops.spin_lock, lock); +} + +static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) +{ + return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); +} + +static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) +{ + return PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); +} + /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch_site { u8 *instr; /* original instructions */ @@ -1458,6 +1494,7 @@ static inline unsigned long __raw_local_irq_save(void) return f; } + /* Make sure as little as possible of this mess escapes. */ #undef PARAVIRT_CALL #undef __PVOP_CALL diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h index 21e89bf92f1..9726144cdab 100644 --- a/include/asm-x86/spinlock.h +++ b/include/asm-x86/spinlock.h @@ -6,7 +6,7 @@ #include #include #include - +#include /* * Your basic SMP spinlocks, allowing only a single CPU anywhere * @@ -54,21 +54,21 @@ * much between them in performance though, especially as locks are out of line. */ #if (NR_CPUS < 256) -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return (((tmp >> 8) & 0xff) != (tmp & 0xff)); } -static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1; } -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) { short inc = 0x0100; @@ -87,9 +87,7 @@ static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) : "memory", "cc"); } -#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) - -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) { int tmp; short new; @@ -110,7 +108,7 @@ static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) return tmp; } -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incb %0" : "+m" (lock->slock) @@ -118,21 +116,21 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) : "memory", "cc"); } #else -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return (((tmp >> 16) & 0xffff) != (tmp & 0xffff)); } -static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1; } -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) { int inc = 0x00010000; int tmp; @@ -153,9 +151,7 @@ static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) : "memory", "cc"); } -#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) - -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) { int tmp; int new; @@ -177,7 +173,7 @@ static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) return tmp; } -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incw %0" : "+m" (lock->slock) @@ -186,6 +182,35 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) } #endif +#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) + +#ifndef CONFIG_PARAVIRT +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + return __ticket_spin_is_locked(lock); +} + +static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +{ + return __ticket_spin_is_contended(lock); +} + +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +{ + __ticket_spin_lock(lock); +} + +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +{ + return __ticket_spin_trylock(lock); +} + +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +{ + __ticket_spin_unlock(lock); +} +#endif /* CONFIG_PARAVIRT */ + static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) { while (__raw_spin_is_locked(lock)) diff --git a/include/asm-x86/spinlock_types.h b/include/asm-x86/spinlock_types.h index 9029cf78cf5..06c071c9eee 100644 --- a/include/asm-x86/spinlock_types.h +++ b/include/asm-x86/spinlock_types.h @@ -5,7 +5,7 @@ # error "please don't include this file directly" #endif -typedef struct { +typedef struct raw_spinlock { unsigned int slock; } raw_spinlock_t; -- cgit v1.2.3-70-g09d2 From 8efcbab674de2bee45a2e4cdf97de16b8e609ac8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:51 -0700 Subject: paravirt: introduce a "lock-byte" spinlock implementation Implement a version of the old spinlock algorithm, in which everyone spins waiting for a lock byte. In order to be compatible with the ticket-lock's use of a zero initializer, this uses the convention of '0' for unlocked and '1' for locked. This algorithm is much better than ticket locks in a virtual envionment, because it doesn't interact badly with the vcpu scheduler. If there are multiple vcpus spinning on a lock and the lock is released, the next vcpu to be scheduled will take the lock, rather than cycling around until the next ticketed vcpu gets it. To use this, you must call paravirt_use_bytelocks() very early, before any spinlocks have been taken. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 9 +++++++ include/asm-x86/paravirt.h | 2 ++ include/asm-x86/spinlock.h | 65 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 75 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/paravirt.c') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f3381686870..bba4041bb7f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,6 +268,15 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } +void __init paravirt_use_bytelocks(void) +{ + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +} + struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index feb6bb66c5e..65ed02cdbbd 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -1385,6 +1385,8 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, void _paravirt_nop(void); #define paravirt_nop ((void *)_paravirt_nop) +void paravirt_use_bytelocks(void); + static inline int __raw_spin_is_locked(struct raw_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h index 9726144cdab..4f9a9861799 100644 --- a/include/asm-x86/spinlock.h +++ b/include/asm-x86/spinlock.h @@ -184,7 +184,70 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) -#ifndef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT +/* + * Define virtualization-friendly old-style lock byte lock, for use in + * pv_lock_ops if desired. + * + * This differs from the pre-2.6.24 spinlock by always using xchgb + * rather than decb to take the lock; this allows it to use a + * zero-initialized lock structure. It also maintains a 1-byte + * contention counter, so that we can implement + * __byte_spin_is_contended. + */ +struct __byte_spinlock { + s8 lock; + s8 spinners; +}; + +static inline int __byte_spin_is_locked(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + return bl->lock != 0; +} + +static inline int __byte_spin_is_contended(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + return bl->spinners != 0; +} + +static inline void __byte_spin_lock(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + s8 val = 1; + + asm("1: xchgb %1, %0\n" + " test %1,%1\n" + " jz 3f\n" + " " LOCK_PREFIX "incb %2\n" + "2: rep;nop\n" + " cmpb $1, %0\n" + " je 2b\n" + " " LOCK_PREFIX "decb %2\n" + " jmp 1b\n" + "3:" + : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory"); +} + +static inline int __byte_spin_trylock(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + u8 old = 1; + + asm("xchgb %1,%0" + : "+m" (bl->lock), "+q" (old) : : "memory"); + + return old == 0; +} + +static inline void __byte_spin_unlock(raw_spinlock_t *lock) +{ + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; + smp_wmb(); + bl->lock = 0; +} +#else /* !CONFIG_PARAVIRT */ static inline int __raw_spin_is_locked(raw_spinlock_t *lock) { return __ticket_spin_is_locked(lock); -- cgit v1.2.3-70-g09d2 From 4bb689eee12ceb6d669a0c9a519037c049a8af38 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 14:33:33 +0200 Subject: x86: paravirt spinlocks, !CONFIG_SMP build fixes Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 4 ++++ include/asm-x86/paravirt.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86/kernel/paravirt.c') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bba4041bb7f..6aa8aed06d5 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -270,11 +270,13 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) void __init paravirt_use_bytelocks(void) { +#ifdef CONFIG_SMP pv_lock_ops.spin_is_locked = __byte_spin_is_locked; pv_lock_ops.spin_is_contended = __byte_spin_is_contended; pv_lock_ops.spin_lock = __byte_spin_lock; pv_lock_ops.spin_trylock = __byte_spin_trylock; pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif } struct pv_info pv_info = { @@ -461,12 +463,14 @@ struct pv_mmu_ops pv_mmu_ops = { }; struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP .spin_is_locked = __ticket_spin_is_locked, .spin_is_contended = __ticket_spin_is_contended, .spin_lock = __ticket_spin_lock, .spin_trylock = __ticket_spin_trylock, .spin_unlock = __ticket_spin_unlock, +#endif }; EXPORT_SYMBOL_GPL(pv_time_ops); diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 65ed02cdbbd..b2aba8fdaae 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -1387,6 +1387,8 @@ void _paravirt_nop(void); void paravirt_use_bytelocks(void); +#ifdef CONFIG_SMP + static inline int __raw_spin_is_locked(struct raw_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); @@ -1412,6 +1414,8 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) return PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); } +#endif + /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch_site { u8 *instr; /* original instructions */ -- cgit v1.2.3-70-g09d2 From 9af98578d6af588f52d0dacd64fe42caa405a327 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 14:39:15 +0200 Subject: x86: paravirt spinlocks, modular build fix fix: MODPOST 408 modules ERROR: "pv_lock_ops" [net/dccp/dccp.ko] undefined! ERROR: "pv_lock_ops" [fs/jbd2/jbd2.ko] undefined! ERROR: "pv_lock_ops" [drivers/media/common/saa7146_vv.ko] undefined! Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel/paravirt.c') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6aa8aed06d5..3edfd7af22a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -472,6 +472,7 @@ struct pv_lock_ops pv_lock_ops = { .spin_unlock = __ticket_spin_unlock, #endif }; +EXPORT_SYMBOL_GPL(pv_lock_ops); EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); -- cgit v1.2.3-70-g09d2