From bebfa1013eee1d91b3242e5801cc8fbdfaf148ec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:56:52 +0200 Subject: [PATCH] x86_64: Add compat_printk and sysctl to turn off compat layer warnings Sometimes e.g. with crashme the compat layer warnings can be noisy. Add a way to turn them off by gating all output through compat_printk that checks a global sysctl. The default is not changed. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2c0e6581944..f1a4eb1a655 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -73,6 +73,7 @@ extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; +extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -676,6 +677,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif +#ifdef CONFIG_COMPAT + { + .ctl_name = KERN_COMPAT_LOG, + .procname = "compat-log", + .data = &compat_log, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From e6e5494cb23d1933735ee47cc674ffe1c4afed6f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:53:50 -0700 Subject: [PATCH] vdso: randomize the i386 vDSO by moving it into a vma Move the i386 VDSO down into a vma and thus randomize it. Besides the security implications, this feature also helps debuggers, which can COW a vma-backed VDSO just like a normal DSO and can thus do single-stepping and other debugging features. It's good for hypervisors (Xen, VMWare) too, which typically live in the same high-mapped address space as the VDSO, hence whenever the VDSO is used, they get lots of guest pagefaults and have to fix such guest accesses up - which slows things down instead of speeding things up (the primary purpose of the VDSO). There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support for older glibcs that still rely on a prelinked high-mapped VDSO. Newer distributions (using glibc 2.3.3 or later) can turn this option off. Turning it off is also recommended for security reasons: attackers cannot use the predictable high-mapped VDSO page as syscall trampoline anymore. There is a new vdso=[0|1] boot option as well, and a runtime /proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned on/off. (This version of the VDSO-randomization patch also has working ELF coredumping, the previous patch crashed in the coredumping code.) This code is a combined work of the exec-shield VDSO randomization code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell started this patch and i completed it. [akpm@osdl.org: cleanups] [akpm@osdl.org: compile fix] [akpm@osdl.org: compile fix 2] [akpm@osdl.org: compile fix 3] [akpm@osdl.org: revernt MAXMEM change] Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Gerd Hoffmann Cc: Rusty Russell Cc: Zachary Amsden Cc: Andi Kleen Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 4 ++ arch/i386/Kconfig | 11 +++ arch/i386/kernel/asm-offsets.c | 4 +- arch/i386/kernel/entry.S | 7 +- arch/i386/kernel/signal.c | 4 +- arch/i386/kernel/sysenter.c | 128 +++++++++++++++++++++++++++++++++-- arch/i386/kernel/vsyscall-sysenter.S | 4 +- arch/i386/kernel/vsyscall.lds.S | 4 +- fs/proc/task_mmu.c | 30 ++++---- include/asm-i386/elf.h | 53 +++++++++++---- include/asm-i386/fixmap.h | 10 +-- include/asm-i386/mmu.h | 1 + include/asm-i386/page.h | 3 + include/asm-i386/thread_info.h | 1 + include/asm-i386/unwind.h | 4 +- include/linux/mm.h | 2 + include/linux/sysctl.h | 1 + kernel/sysctl.c | 12 ++++ 18 files changed, 235 insertions(+), 48 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2e352a605fc..0d189c93eea 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1669,6 +1669,10 @@ running once the system is up. usbhid.mousepoll= [USBHID] The interval which mice are to be polled at. + vdso= [IA-32] + vdso=1: enable VDSO (default) + vdso=0: disable VDSO mapping + video= [FB] Frame buffer configuration See Documentation/fb/modedb.txt. diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 6662f8c4479..3bb221db164 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -780,6 +780,17 @@ config HOTPLUG_CPU enable suspend on SMP systems. CPUs can be controlled through /sys/devices/system/cpu. +config COMPAT_VDSO + bool "Compat VDSO support" + default y + help + Map the VDSO to the predictable old-style address too. + ---help--- + Say N here if you are running a sufficiently recent glibc + version (2.3.3 or later), to remove the high-mapped + VDSO mapping and to exclusively use the randomized VDSO. + + If unsure, say Y. endmenu diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 1c3a809e642..c80271f8f08 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -14,6 +14,7 @@ #include #include #include +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -54,6 +55,7 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); @@ -69,7 +71,7 @@ void foo(void) sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL)); + DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index e8d2630fd19..fbdb933251b 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -270,7 +270,12 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ - pushl $SYSENTER_RETURN + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 5c352c3a9e7..43002cfb40c 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; } - restorer = &__kernel_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up to return from userspace. */ - restorer = &__kernel_rt_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 0bada1870bd..c60419dee01 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -2,6 +2,8 @@ * linux/arch/i386/kernel/sysenter.c * * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar * * This file contains the needed initializations to support sysenter. */ @@ -13,12 +15,31 @@ #include #include #include +#include +#include #include #include #include #include +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso_enabled = 1; + +EXPORT_SYMBOL_GPL(vdso_enabled); + +static int __init vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + + return 1; +} + +__setup("vdso=", vdso_setup); + extern asmlinkage void sysenter_entry(void); void enable_sep_cpu(void) @@ -45,23 +66,122 @@ void enable_sep_cpu(void) */ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; +static void *syscall_page; int __init sysenter_setup(void) { - void *page = (void *)get_zeroed_page(GFP_ATOMIC); + syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); - __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC); +#ifdef CONFIG_COMPAT_VDSO + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); +#else + /* + * In the non-compat case the ELF coredumping code needs the fixmap: + */ + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO); +#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(page, + memcpy(syscall_page, &vsyscall_int80_start, &vsyscall_int80_end - &vsyscall_int80_start); return 0; } - memcpy(page, + memcpy(syscall_page, &vsyscall_sysenter_start, &vsyscall_sysenter_end - &vsyscall_sysenter_start); return 0; } + +static struct page *syscall_nopage(struct vm_area_struct *vma, + unsigned long adr, int *type) +{ + struct page *p = virt_to_page(adr - vma->vm_start + syscall_page); + get_page(p); + return p; +} + +/* Prevent VMA merging */ +static void syscall_vma_close(struct vm_area_struct *vma) +{ +} + +static struct vm_operations_struct syscall_vm_ops = { + .close = syscall_vma_close, + .nopage = syscall_nopage, +}; + +/* Defined in vsyscall-sysenter.S */ +extern void SYSENTER_RETURN; + +/* Setup a VMA at program startup for the vsyscall page */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + + down_write(&mm->mmap_sem); + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + ret = -ENOMEM; + goto up_fail; + } + + vma->vm_start = addr; + vma->vm_end = addr + PAGE_SIZE; + /* MAYWRITE to allow gdb to COW and set breakpoints */ + vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; + vma->vm_flags |= mm->def_flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_ops = &syscall_vm_ops; + vma->vm_mm = mm; + + ret = insert_vm_struct(mm, vma); + if (ret) + goto free_vma; + + current->mm->context.vdso = (void *)addr; + current_thread_info()->sysenter_return = + (void *)VDSO_SYM(&SYSENTER_RETURN); + mm->total_vm++; +up_fail: + up_write(&mm->mmap_sem); + return ret; + +free_vma: + kmem_cache_free(vm_area_cachep, vma); + return ret; +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ + return NULL; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + return 0; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S index 3b62baa6a37..1a36d26e15e 100644 --- a/arch/i386/kernel/vsyscall-sysenter.S +++ b/arch/i386/kernel/vsyscall-sysenter.S @@ -42,10 +42,10 @@ __kernel_vsyscall: /* 7: align return point with nop's to make disassembly easier */ .space 7,0x90 - /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ + /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel /* 16: System call normal return point is here! */ - .globl SYSENTER_RETURN /* Symbol used by entry.S. */ + .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ SYSENTER_RETURN: pop %ebp .Lpop_ebp: diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S index 98699ca6e52..e26975fc68b 100644 --- a/arch/i386/kernel/vsyscall.lds.S +++ b/arch/i386/kernel/vsyscall.lds.S @@ -7,7 +7,7 @@ SECTIONS { - . = VSYSCALL_BASE + SIZEOF_HEADERS; + . = VDSO_PRELINK + SIZEOF_HEADERS; .hash : { *(.hash) } :text .dynsym : { *(.dynsym) } @@ -20,7 +20,7 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. Just increase it here. */ - . = VSYSCALL_BASE + 0x400; + . = VDSO_PRELINK + 0x400; .text : { *(.text) } :text =0x90909090 .note : { *(.note.*) } :text :note diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0137ec4c136..0a163a4f776 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -122,6 +122,11 @@ struct mem_size_stats unsigned long private_dirty; }; +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { struct proc_maps_private *priv = m->private; @@ -158,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats pad_len_spaces(m, len); seq_path(m, file->f_vfsmnt, file->f_dentry, "\n"); } else { - if (mm) { - if (vma->vm_start <= mm->start_brk && + const char *name = arch_vma_name(vma); + if (!name) { + if (mm) { + if (vma->vm_start <= mm->start_brk && vma->vm_end >= mm->brk) { - pad_len_spaces(m, len); - seq_puts(m, "[heap]"); - } else { - if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - - pad_len_spaces(m, len); - seq_puts(m, "[stack]"); + name = "[heap]"; + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + name = "[stack]"; } + } else { + name = "[vdso]"; } - } else { + } + if (name) { pad_len_spaces(m, len); - seq_puts(m, "[vdso]"); + seq_puts(m, name); } } seq_putc(m, '\n'); diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index 4153d80e4d2..1eac92cb5b1 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -10,6 +10,7 @@ #include #include /* for savesegment */ #include +#include #include @@ -129,15 +130,41 @@ extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs) #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs) -#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL)) -#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) -#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) +#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) +#define VDSO_BASE ((unsigned long)current->mm->context.vdso) + +#ifdef CONFIG_COMPAT_VDSO +# define VDSO_COMPAT_BASE VDSO_HIGH_BASE +# define VDSO_PRELINK VDSO_HIGH_BASE +#else +# define VDSO_COMPAT_BASE VDSO_BASE +# define VDSO_PRELINK 0 +#endif + +#define VDSO_COMPAT_SYM(x) \ + (VDSO_COMPAT_BASE + (unsigned long)(x) - VDSO_PRELINK) + +#define VDSO_SYM(x) \ + (VDSO_BASE + (unsigned long)(x) - VDSO_PRELINK) + +#define VDSO_HIGH_EHDR ((const struct elfhdr *) VDSO_HIGH_BASE) +#define VDSO_EHDR ((const struct elfhdr *) VDSO_COMPAT_BASE) + extern void __kernel_vsyscall; +#define VDSO_ENTRY VDSO_SYM(&__kernel_vsyscall) + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES +struct linux_binprm; +extern int arch_setup_additional_pages(struct linux_binprm *bprm, + int executable_stack); + +extern unsigned int vdso_enabled; + #define ARCH_DLINFO \ -do { \ - NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ +do if (vdso_enabled) { \ + NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE); \ } while (0) /* @@ -148,15 +175,15 @@ do { \ * Dumping its extra ELF program headers includes all the other information * a debugger needs to easily find how the vsyscall DSO was being used. */ -#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) +#define ELF_CORE_EXTRA_PHDRS (VDSO_HIGH_EHDR->e_phnum) #define ELF_CORE_WRITE_EXTRA_PHDRS \ do { \ const struct elf_phdr *const vsyscall_phdrs = \ - (const struct elf_phdr *) (VSYSCALL_BASE \ - + VSYSCALL_EHDR->e_phoff); \ + (const struct elf_phdr *) (VDSO_HIGH_BASE \ + + VDSO_HIGH_EHDR->e_phoff); \ int i; \ Elf32_Off ofs = 0; \ - for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ + for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) { \ struct elf_phdr phdr = vsyscall_phdrs[i]; \ if (phdr.p_type == PT_LOAD) { \ BUG_ON(ofs != 0); \ @@ -174,10 +201,10 @@ do { \ #define ELF_CORE_WRITE_EXTRA_DATA \ do { \ const struct elf_phdr *const vsyscall_phdrs = \ - (const struct elf_phdr *) (VSYSCALL_BASE \ - + VSYSCALL_EHDR->e_phoff); \ + (const struct elf_phdr *) (VDSO_HIGH_BASE \ + + VDSO_HIGH_EHDR->e_phoff); \ int i; \ - for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ + for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) { \ if (vsyscall_phdrs[i].p_type == PT_LOAD) \ DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h index f7e068f4d2f..a48cc3f7ccc 100644 --- a/include/asm-i386/fixmap.h +++ b/include/asm-i386/fixmap.h @@ -51,7 +51,7 @@ */ enum fixed_addresses { FIX_HOLE, - FIX_VSYSCALL, + FIX_VDSO, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -115,14 +115,6 @@ extern void __set_fixmap (enum fixed_addresses idx, #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) -/* - * This is the range that is readable by user mode, and things - * acting like user mode such as get_user_pages. - */ -#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL)) -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) - - extern void __this_fixmap_does_not_exist(void); /* diff --git a/include/asm-i386/mmu.h b/include/asm-i386/mmu.h index f431a0b86d4..8358dd3df7a 100644 --- a/include/asm-i386/mmu.h +++ b/include/asm-i386/mmu.h @@ -12,6 +12,7 @@ typedef struct { int size; struct semaphore sem; void *ldt; + void *vdso; } mm_context_t; #endif diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index e3a552fa553..f5bf544c729 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -96,6 +96,8 @@ typedef struct { unsigned long pgprot; } pgprot_t; #ifndef __ASSEMBLY__ +struct vm_area_struct; + /* * This much address space is reserved for vmalloc() and iomap() * as well as fixmap mappings. @@ -139,6 +141,7 @@ extern int page_is_ram(unsigned long pagenr); #include #include +#define __HAVE_ARCH_GATE_AREA 1 #endif /* __KERNEL__ */ #endif /* _I386_PAGE_H */ diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index ff1e2b1a7c8..2833fa2c0dd 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -37,6 +37,7 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + void *sysenter_return; struct restart_block restart_block; unsigned long previous_esp; /* ESP of the previous stack in case diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index d480f2e3821..69f0f1df672 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -78,8 +78,8 @@ static inline int arch_unw_user_mode(const struct unwind_frame_info *info) return user_mode_vm(&info->regs); #else return info->regs.eip < PAGE_OFFSET - || (info->regs.eip >= __fix_to_virt(FIX_VSYSCALL) - && info->regs.eip < __fix_to_virt(FIX_VSYSCALL) + PAGE_SIZE) + || (info->regs.eip >= __fix_to_virt(FIX_VDSO) + && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE) || info->regs.esp < PAGE_OFFSET; #endif } diff --git a/include/linux/mm.h b/include/linux/mm.h index a929ea197e4..ff1fa87df8d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1065,5 +1065,7 @@ void drop_slab(void); extern int randomize_va_space; #endif +const char *arch_vma_name(struct vm_area_struct *vma); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 349ef908a22..bee12a7a057 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -189,6 +189,7 @@ enum VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ + VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f1a4eb1a655..f54afed8426 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -927,6 +927,18 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies, }, +#endif +#ifdef CONFIG_X86_32 + { + .ctl_name = VM_VDSO_ENABLED, + .procname = "vdso_enabled", + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From 23f78d4a03c53cbd75d87a795378ea540aa08c86 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:53 -0700 Subject: [PATCH] pi-futex: rt mutex core Core functions for the rt-mutex subsystem. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 1 + include/linux/rtmutex.h | 104 ++++++ include/linux/sched.h | 12 + include/linux/sysctl.h | 1 + init/Kconfig | 5 + kernel/Makefile | 1 + kernel/fork.c | 16 + kernel/rtmutex.c | 904 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rtmutex.h | 29 ++ kernel/rtmutex_common.h | 93 +++++ kernel/sysctl.c | 15 + 11 files changed, 1181 insertions(+) create mode 100644 include/linux/rtmutex.h create mode 100644 kernel/rtmutex.c create mode 100644 kernel/rtmutex.h create mode 100644 kernel/rtmutex_common.h (limited to 'kernel/sysctl.c') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 678c1a90380..3a256957fb5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -124,6 +124,7 @@ extern struct group_info init_groups; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ .pi_lock = SPIN_LOCK_UNLOCKED, \ + INIT_RT_MUTEXES(tsk) \ } diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h new file mode 100644 index 00000000000..12309c916c6 --- /dev/null +++ b/include/linux/rtmutex.h @@ -0,0 +1,104 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains the public data structure and API definitions. + */ + +#ifndef __LINUX_RT_MUTEX_H +#define __LINUX_RT_MUTEX_H + +#include +#include +#include + +/* + * The rt_mutex structure + * + * @wait_lock: spinlock to protect the structure + * @wait_list: pilist head to enqueue waiters in priority order + * @owner: the mutex owner + */ +struct rt_mutex { + spinlock_t wait_lock; + struct plist_head wait_list; + struct task_struct *owner; +#ifdef CONFIG_DEBUG_RT_MUTEXES + int save_state; + struct list_head held_list_entry; + unsigned long acquire_ip; + const char *name, *file; + int line; + void *magic; +#endif +}; + +struct rt_mutex_waiter; +struct hrtimer_sleeper; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ + , .name = #mutexname, .file = __FILE__, .line = __LINE__ +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __FUNCTION__) + extern void rt_mutex_debug_task_free(struct task_struct *tsk); +#else +# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) +# define rt_mutex_debug_task_free(t) do { } while (0) +#endif + +#define __RT_MUTEX_INITIALIZER(mutexname) \ + { .wait_lock = SPIN_LOCK_UNLOCKED \ + , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ + , .owner = NULL \ + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + +#define DEFINE_RT_MUTEX(mutexname) \ + struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) + +/*** + * rt_mutex_is_locked - is the mutex locked + * @lock: the mutex to be queried + * + * Returns 1 if the mutex is locked, 0 if unlocked. + */ +static inline int rt_mutex_is_locked(struct rt_mutex *lock) +{ + return lock->owner != NULL; +} + +extern void __rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void rt_mutex_destroy(struct rt_mutex *lock); + +extern void rt_mutex_lock(struct rt_mutex *lock); +extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock); +extern int rt_mutex_timed_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *timeout, + int detect_deadlock); + +extern int rt_mutex_trylock(struct rt_mutex *lock); + +extern void rt_mutex_unlock(struct rt_mutex *lock); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define INIT_RT_MUTEX_DEBUG(tsk) \ + .held_list_head = LIST_HEAD_INIT(tsk.held_list_head), \ + .held_list_lock = SPIN_LOCK_UNLOCKED +#else +# define INIT_RT_MUTEX_DEBUG(tsk) +#endif + +#ifdef CONFIG_RT_MUTEXES +# define INIT_RT_MUTEXES(tsk) \ + .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ + INIT_RT_MUTEX_DEBUG(tsk) +#else +# define INIT_RT_MUTEXES(tsk) +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f167645e7e..6ea23c9af41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -73,6 +73,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -858,6 +859,17 @@ struct task_struct { /* Protection of the PI data structures: */ spinlock_t pi_lock; +#ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task */ + struct plist_head pi_waiters; + /* Deadlock detection and priority inheritance handling */ + struct rt_mutex_waiter *pi_blocked_on; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spinlock_t held_list_lock; + struct list_head held_list_head; +# endif +#endif + #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ struct mutex_waiter *blocked_on; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bee12a7a057..46e4d8f2771 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -149,6 +149,7 @@ enum KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ + KERN_MAX_LOCK_DEPTH=74, }; diff --git a/init/Kconfig b/init/Kconfig index df55b366560..f70f2fd273c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -339,9 +339,14 @@ config BASE_FULL kernel data structures. This saves memory on small machines, but may reduce performance. +config RT_MUTEXES + boolean + select PLIST + config FUTEX bool "Enable futex support" if EMBEDDED default y + select RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". The resulting kernel may not diff --git a/kernel/Makefile b/kernel/Makefile index 752bd7d383a..21df9a338ff 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o diff --git a/kernel/fork.c b/kernel/fork.c index 9b4e54ef022..b664a081fff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; void free_task(struct task_struct *tsk) { free_thread_info(tsk->thread_info); + rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ +#ifdef CONFIG_RT_MUTEXES + spin_lock_init(&p->pi_lock); + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spin_lock_init(&p->held_list_lock); + INIT_LIST_HEAD(&p->held_list_head); +# endif +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags, mpol_fix_fork_child_flag(p); #endif + rt_mutex_init_task(p); + #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 00000000000..937a474fae9 --- /dev/null +++ b/kernel/rtmutex.c @@ -0,0 +1,904 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + */ +#include +#include +#include +#include + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 + * are used to keep track of the "owner is pending" and "lock has + * waiters" state. + * + * owner bit1 bit0 + * NULL 0 0 lock is free (fast acquire possible) + * NULL 0 1 invalid state + * NULL 1 0 Transitional State* + * NULL 1 1 invalid state + * taskpointer 0 0 lock is held (fast release possible) + * taskpointer 0 1 task is pending owner + * taskpointer 1 0 lock is held and has waiters + * taskpointer 1 1 task is pending owner and lock has more waiters + * + * Pending ownership is assigned to the top (highest priority) + * waiter of the lock, when the lock is released. The thread is woken + * up and can now take the lock. Until the lock is taken (bit 0 + * cleared) a competing higher priority thread can steal the lock + * which puts the woken up thread back on the waiters list. + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 and 1 of lock->owner are 0. + * + * (*) There's a small time where the owner can be NULL and the + * "lock has waiters" bit is set. This can happen when grabbing the lock. + * To prevent a cmpxchg of the owner releasing the lock, we need to set this + * bit before looking at the lock, hence the reason this is a transitional + * state. + */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask) +{ + unsigned long val = (unsigned long)owner | mask; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* + * Calculate task priority from the waiter list priority + * + * Return task->normal_prio when the waiter list is empty or when + * the waiter is not allowed to do priority boosting + */ +int rt_mutex_getprio(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return task->normal_prio; + + return min(task_top_pi_waiter(task)->pi_list_entry.prio, + task->normal_prio); +} + +/* + * Adjust the priority of a task, after its pi_waiters got modified. + * + * This can be both boosting and unboosting. task->pi_lock must be held. + */ +static void __rt_mutex_adjust_prio(struct task_struct *task) +{ + int prio = rt_mutex_getprio(task); + + if (task->prio != prio) + rt_mutex_setprio(task, prio); +} + +/* + * Adjust task priority (undo boosting). Called from the exit path of + * rt_mutex_slowunlock() and rt_mutex_slowlock(). + * + * (Note: We do this outside of the protection of lock->wait_lock to + * allow the lock to be taken while or before we readjust the priority + * of task. We do not use the spin_xx_mutex() variants here as we are + * outside of the debug path.) + */ +static void rt_mutex_adjust_prio(struct task_struct *task) +{ + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); +} + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * Returns 0 or -EDEADLK. + */ +static int rt_mutex_adjust_prio_chain(task_t *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter + __IP_DECL__) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + unsigned long flags; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); + + /* + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. + */ + again: + if (++depth > max_lock_depth) { + static int prev_max; + + /* + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. + */ + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + current->comm, current->pid); + } + put_task_struct(task); + + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter || !waiter->task) + goto out_unlock_pi; + + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; + + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) + goto out_unlock_pi; + + lock = waiter->lock; + if (!spin_trylock(&lock->wait_lock)) { + spin_unlock_irqrestore(&task->pi_lock, flags); + cpu_relax(); + goto retry; + } + + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == current) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? -EDEADLK : 0; + goto out_unlock_pi; + } + + top_waiter = rt_mutex_top_waiter(lock); + + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ + spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + spin_lock_irqsave(&task->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } + + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + top_waiter = rt_mutex_top_waiter(lock); + spin_unlock(&lock->wait_lock); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; + + goto again; + + out_unlock_pi: + spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: + put_task_struct(task); + return ret; +} + +/* + * Optimization: check if we can steal the lock from the + * assigned pending owner [which might not have taken the + * lock yet]: + */ +static inline int try_to_steal_lock(struct rt_mutex *lock) +{ + struct task_struct *pendowner = rt_mutex_owner(lock); + struct rt_mutex_waiter *next; + unsigned long flags; + + if (!rt_mutex_owner_pending(lock)) + return 0; + + if (pendowner == current) + return 1; + + spin_lock_irqsave(&pendowner->pi_lock, flags); + if (current->prio >= pendowner->prio) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 0; + } + + /* + * Check if a waiter is enqueued on the pending owners + * pi_waiters list. Remove it and readjust pending owners + * priority. + */ + if (likely(!rt_mutex_has_waiters(lock))) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 1; + } + + /* No chain handling, pending owner is not blocked on anything: */ + next = rt_mutex_top_waiter(lock); + plist_del(&next->pi_list_entry, &pendowner->pi_waiters); + __rt_mutex_adjust_prio(pendowner); + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + /* + * We are going to steal the lock and a waiter was + * enqueued on the pending owners pi_waiters queue. So + * we have to enqueue this waiter into + * current->pi_waiters list. This covers the case, + * where current is boosted because it holds another + * lock and gets unboosted because the booster is + * interrupted, so we would delay a waiter with higher + * priority as current->normal_prio. + * + * Note: in the rare case of a SCHED_OTHER task changing + * its priority and thus stealing the lock, next->task + * might be current: + */ + if (likely(next->task != current)) { + spin_lock_irqsave(¤t->pi_lock, flags); + plist_add(&next->pi_list_entry, ¤t->pi_waiters); + __rt_mutex_adjust_prio(current); + spin_unlock_irqrestore(¤t->pi_lock, flags); + } + return 1; +} + +/* + * Try to take an rt-mutex + * + * This fails + * - when the lock has a real owner + * - when a different pending owner exists and has higher priority than current + * + * Must be called with lock->wait_lock held. + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) +{ + /* + * We have to be careful here if the atomic speedups are + * enabled, such that, when + * - no other waiter is on the lock + * - the lock has been released since we did the cmpxchg + * the lock can be released or taken while we are doing the + * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * + * The atomic acquire/release aware variant of + * mark_rt_mutex_waiters uses a cmpxchg loop. After setting + * the WAITERS bit, the atomic release / acquire can not + * happen anymore and lock->wait_lock protects us from the + * non-atomic case. + * + * Note, that this might set lock->owner = + * RT_MUTEX_HAS_WAITERS in the case the lock is not contended + * any more. This is fixed up when we take the ownership. + * This is the transitional state explained at the top of this file. + */ + mark_rt_mutex_waiters(lock); + + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + return 0; + + /* We got the lock. */ + debug_rt_mutex_lock(lock __IP__); + + rt_mutex_set_owner(lock, current, 0); + + rt_mutex_deadlock_account_lock(lock, current); + + return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. + */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + int detect_deadlock + __IP_DECL__) +{ + struct rt_mutex_waiter *top_waiter = waiter; + task_t *owner = rt_mutex_owner(lock); + int boost = 0, res; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + __rt_mutex_adjust_prio(current); + waiter->task = current; + waiter->lock = lock; + plist_node_init(&waiter->list_entry, current->prio); + plist_node_init(&waiter->pi_list_entry, current->prio); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + current->pi_blocked_on = waiter; + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + if (!boost) + return 0; + + spin_unlock(&lock->wait_lock); + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + waiter __IP__); + + spin_lock(&lock->wait_lock); + + return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks waiter list and from + * the lock waiter list. Set it as pending owner. Then wake it up. + * + * Called with lock->wait_lock held. + */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + + waiter = rt_mutex_top_waiter(lock); + plist_del(&waiter->list_entry, &lock->wait_list); + + /* + * Remove it from current->pi_waiters. We do not adjust a + * possible priority boost right now. We execute wakeup in the + * boosted mode and go back to normal after releasing + * lock->wait_lock. + */ + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + pendowner = waiter->task; + waiter->task = NULL; + + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + /* + * Clear the pi_blocked_on variable and enqueue a possible + * waiter into the pi_waiters list of the pending owner. This + * prevents that in case the pending owner gets unboosted a + * waiter with higher priority than pending-owner->normal_prio + * is blocked on the unboosted (pending) owner. + */ + spin_lock_irqsave(&pendowner->pi_lock, flags); + + WARN_ON(!pendowner->pi_blocked_on); + WARN_ON(pendowner->pi_blocked_on != waiter); + WARN_ON(pendowner->pi_blocked_on->lock != lock); + + pendowner->pi_blocked_on = NULL; + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &pendowner->pi_waiters); + } + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + wake_up_process(pendowner); +} + +/* + * Remove a waiter from a lock + * + * Must be called with lock->wait_lock held + */ +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter __IP_DECL__) +{ + int first = (waiter == rt_mutex_top_waiter(lock)); + int boost = 0; + task_t *owner = rt_mutex_owner(lock); + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->task = NULL; + current->pi_blocked_on = NULL; + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (first && owner != current) { + + spin_lock_irqsave(&owner->pi_lock, flags); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); + + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + + if (!boost) + return; + + spin_unlock(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL __IP__); + + spin_lock(&lock->wait_lock); +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock __IP__)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) + hrtimer_start(&timeout->timer, timeout->timer.expires, + HRTIMER_ABS); + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock __IP__)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? */ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + ret = task_blocks_on_rt_mutex(lock, &waiter, + detect_deadlock __IP__); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter.task)) + continue; + + if (unlikely(ret)) + break; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + schedule(); + + spin_lock(&lock->wait_lock); + set_current_state(state); + } + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter __IP__); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + /* + * Readjust priority, when we did not get the lock. We might + * have been the pending owner and boosted. Since we did not + * take the lock, the PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) +{ + int ret = 0; + + spin_lock(&lock->wait_lock); + + if (likely(rt_mutex_owner(lock) != current)) { + + ret = try_to_take_rt_mutex(lock __IP__); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + } + + spin_unlock(&lock->wait_lock); + + return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ + spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + spin_unlock(&lock->wait_lock); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, + int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) +{ + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 1; + } + return slowfn(lock __RET_IP__); +} + +static inline void +rt_mutex_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + might_sleep(); + + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller + * + * @lock: the rt_mutex to be locked + * @timeout: timeout structure or NULL (no timeout) + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -ETIMEOUT when the timeout expired + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * Returns 1 on success and 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/*** + * rt_mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void rt_mutex_destroy(struct rt_mutex *lock) +{ + WARN_ON(rt_mutex_is_locked(lock)); +#ifdef CONFIG_DEBUG_RT_MUTEXES + lock->magic = NULL; +#endif +} + +EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +/** + * __rt_mutex_init - initialize the rt lock + * + * @lock: the rt lock to be initialized + * + * Initialize the rt lock to unlocked state. + * + * Initializing of a locked rt lock is not allowed + */ +void __rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + plist_head_init(&lock->wait_list, &lock->wait_lock); + + debug_rt_mutex_init(lock, name); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 00000000000..1e0fca13ff7 --- /dev/null +++ b/kernel/rtmutex.h @@ -0,0 +1,29 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains macros used solely by rtmutex.c. + * Non-debug version. + */ + +#define __IP_DECL__ +#define __IP__ +#define __RET_IP__ +#define rt_mutex_deadlock_check(l) (0) +#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) +#define rt_mutex_deadlock_account_unlock(l) do { } while (0) +#define debug_rt_mutex_init_waiter(w) do { } while (0) +#define debug_rt_mutex_free_waiter(w) do { } while (0) +#define debug_rt_mutex_lock(l) do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) +#define debug_rt_mutex_proxy_unlock(l) do { } while (0) +#define debug_rt_mutex_unlock(l) do { } while (0) +#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +#define debug_rt_mutex_print_deadlock(w) do { } while (0) +#define debug_rt_mutex_detect_deadlock(w,d) (d) +#define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 00000000000..50eed60eb08 --- /dev/null +++ b/kernel/rtmutex_common.h @@ -0,0 +1,93 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack on of the blocked task. + * + * @list_entry: pi node to enqueue into the mutex waiters list + * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @task: task reference to the blocked task + */ +struct rt_mutex_waiter { + struct plist_node list_entry; + struct plist_node pi_list_entry; + struct task_struct *task; + struct rt_mutex *lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES + unsigned long ip; + pid_t deadlock_task_pid; + struct rt_mutex *deadlock_lock; +#endif +}; + +/* + * Various helpers to access the waiters-plist: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return !plist_head_empty(&lock->wait_list); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *w; + + w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, + list_entry); + BUG_ON(w->lock != lock); + + return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return !plist_head_empty(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, + pi_list_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_OWNER_PENDING 1UL +#define RT_MUTEX_HAS_WAITERS 2UL +#define RT_MUTEX_OWNER_MASKALL 3UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +} + +static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) +{ + return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; +} + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f54afed8426..93a2c539864 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -133,6 +133,10 @@ extern int acct_parm[]; extern int no_unaligned_warning; #endif +#ifdef CONFIG_RT_MUTEXES +extern int max_lock_depth; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -688,6 +692,17 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_RT_MUTEXES + { + .ctl_name = KERN_MAX_LOCK_DEPTH, + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2